Example #1
class Agent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.batch_size = batch_size
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def choose_action(self, observation):
        raise NotImplementedError

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
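The base class above leaves choose_action and learn abstract. As a rough illustration only (not part of the original example), here is a minimal training-loop sketch, assuming a concrete subclass with the same constructor signature (such as the DQNAgent shown in later examples) and a classic Gym-style environment:

import gym

env = gym.make('CartPole-v1')
agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                 n_actions=env.action_space.n,
                 input_dims=env.observation_space.shape,
                 mem_size=50000, batch_size=32)

for episode in range(500):
    observation, done = env.reset(), False
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()  # a no-op until the buffer holds at least one full batch
        observation = observation_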
Example #2
File: agents.py Project: arame/707_DRQN
class Agent():
    def __init__(self, input_dims, n_actions):
        self.epsilon = Config.epsilon
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(input_dims, n_actions)

    def store_transition(self, state, action, reward, state_new, done):
        self.memory.store_transition(state, action, reward, state_new, done)

    def choose_action(self, observation):
        raise NotImplementedError

    def replace_target_network(self):
        if self.learn_step_counter % Config.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon * Config.eps_decay, Config.eps_min)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer()

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def learn(self):
        raise NotImplementedError

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
Example #3
class DQN():
    def __init__(self, states, actions, alpha, gamma, epsilon, epsilon_min,
                 epsilon_decay, replay_buffer_sz, batch, path, path_pred):
        self.Q = Network(states.shape, actions, alpha, path)
        self.Q_pred = Network(states.shape, actions, alpha, path_pred)

        # self.memory = deque(maxlen=replay_buffer_sz)
        self.memory = ReplayBuffer(replay_buffer_sz, states.shape, actions)
        self.batch = batch
        self.learn_cnt = 0

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.actions = actions
        self.Q.path = path
        self.Q_pred.path = path_pred

    def e_greedy_policy(self, s):
        p = random.random()
        s = torch.tensor([s], dtype=torch.float).to(self.Q.device)
        # s = torch.unsqueeze(axis=0)
        a = torch.argmax(self.Q.forward(s)).item() if (
            p > self.epsilon) else np.random.randint(0, self.actions)
        return a

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch)

        states = torch.tensor(state).to(self.Q.device)
        rewards = torch.tensor(reward).to(self.Q.device)
        dones = torch.tensor(done).to(self.Q.device)
        actions = torch.tensor(action).to(self.Q.device)
        states_ = torch.tensor(new_state).to(self.Q.device)

        return states, actions, rewards, states_, dones

    def store(self, s, a, r, ns, done):
        # self.memory.append([s,a,r,ns,done])
        self.memory.store_transition(s, a, r, ns, done)

    def update_target_network(self):
        self.Q_pred.load_state_dict(self.Q.state_dict())

    def save_models(self):
        self.Q.save_checkpoint(self.Q.path)
        self.Q_pred.save_checkpoint(self.Q_pred.path)

    def load_models(self):
        self.Q.load_checkpoint()
        self.Q_pred.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch:
            return

        self.Q.optimizer.zero_grad()

        if (self.learn_cnt >= 1000):  # only update the target network every 1000 learn steps
            self.learn_cnt = 0
            self.update_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch)

        q_pred = self.Q.forward(states)[indices, actions]
        q_next = self.Q_pred.forward(states_).max(dim=1)[0]

        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.Q.loss(q_target, q_pred).to(self.Q.device)
        loss.backward()
        self.Q.optimizer.step()
        self.learn_cnt += 1

        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
Example #4
class DDQNAgent(object):

    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, action_joint_dim,
                 mem_size, batch_size, eps_min, eps_dec, replace,
                 prioritized=False, prob_alpha=0.6, beta=0.4, beta_increment=1e-4, 
                 temperature = 0.1, tau = 1e-5):

        """

        Double Deep Q-Learning Agent class.

        -----

        Args:
            gamma: Discount factor for the reward. 0 indicates myopic behaviour, 1 indicates far-sighted behaviour.
            epsilon: Exploration/exploitation rate. 0 indicates full exploitation.
            lr: Learning rate. The larger 'lr', the larger the step taken along the gradient of the loss.
            n_actions: Number of possible actions.
            input_dims: Dimensions of the state (typically an image). Channels come first: (CHANN, HEIGHT, WIDTH).
            action_joint_dim: Number of joints for the multi-agent case. Normally the number of agents.
            mem_size: Size of the replay buffer memory.
            batch_size: Number of past experiences used for training the Q-network.
            eps_min: Minimum value for the exploration rate.
            eps_dec: Epsilon decay applied every epoch.
            replace: Number of epochs between replacing the target network with the behavioral network.

        ------
        """
        
        # Training hyperparameters #
        self.gamma = gamma
        self.epsilon = epsilon
        self.beta = beta
        self.beta_increment = beta_increment
        self.prob_alpha = prob_alpha
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.update_target_count = replace
        self.action_space = [i for i in range(n_actions)]
        self.action_joint_dim = action_joint_dim
        self.prioritized = prioritized
        self.temperature = temperature
        self.tau = tau
        self.mem_size = mem_size

        if not self.prioritized:
            self.memory = ReplayBuffer(mem_size, input_dims, action_joint_dim)
        else:
            self.memory = PrioritizedReplayBuffer(mem_size, input_dims, action_joint_dim, self.prob_alpha)
        
        # Model and target network functions #
        
        self.q_eval = DeepQNetwork(self.lr, num_agents=action_joint_dim, action_size=n_actions, input_size=input_dims)
        self.q_eval.cuda()
        
        self.q_next = DeepQNetwork(self.lr, num_agents=action_joint_dim, action_size=n_actions, input_size=input_dims)
        self.q_next.cuda()

    def reset_memory(self):
        self.memory = ReplayBuffer(self.mem_size, self.input_dims, self.action_joint_dim)

    def store_transition(self, state, action, reward, next_state, done):
        """

        Store a '(s,a,r,s')' transition into the replay buffer.

        ------

        Args:
            state: State of the experience.
            action: Action joint performed in the given 'state'.
            reward: 1D Reward array obtained due to (s,a). One component for every agent.
            next_state: The next state produced, given (s,a).
            done: If the state is terminal. Normally 0 because non-episodic.

        ------
        """
        
        # Store the tuple (s,a,r,s') plus done in memory #
        # They are stored as numpy arrays and converted to tensors when sampled #
        self.memory.store_transition(state, action, reward, next_state, done)        

    def sample_memory(self):

        """

        Extract 'self.batch_size' experiences (s,a,r,s') from the replay memory.

        ------

        Returns: The *BATCH_SIZE* (s,a,r,s') experiences.

        ------
        """
        
        # Sample a BATCH of experiences #
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        with T.no_grad():
            # Convert them to PyTorch tensors #
            states = T.tensor(state, device = self.q_eval.device)
            rewards = T.tensor(reward, device = self.q_eval.device)
            dones = T.tensor(done, device = self.q_eval.device)
            actions = T.tensor(action, device = self.q_eval.device)
            next_state = T.tensor(new_state, device = self.q_eval.device)

        return states, actions, rewards, next_state, dones

    def prioritized_sample_memory(self):

        """

                Extract 'self.batch_size' experiences (s,a,r,s') from the memory replay.

                ------

                Returns: The *BATCH_SIZE* (s,a,r,s') experiences.

                ------
        """

        # Sample a BATCH of experiences #
        state, action, reward, new_state, done, indices, weight = self.memory.sample_buffer(self.batch_size, self.beta)

        with T.no_grad():
            # Convert them to PyTorch tensors #
            states = T.tensor(state, device = self.q_eval.device)
            rewards = T.tensor(reward, device = self.q_eval.device)
            dones = T.tensor(done, device = self.q_eval.device)
            actions = T.tensor(action, device = self.q_eval.device)
            next_state = T.tensor(new_state, device = self.q_eval.device)
            weights = T.tensor(weight, device = self.q_eval.device)

        return states, actions, rewards, next_state, dones, indices, weights

    # Epsilon-greedy policy #
    def choose_action(self, observation, mode = 'egreedy'):

        """

        Epsilon-greedy policy. Explores with probability epsilon and exploits with probability (1-epsilon).

        ----

        Args:
            observation: The state (typically an image). Must be a matrix with shape (N_CHANN, HEIGHT, WIDTH).

        Returns: An action joint (1D array) with the selected actions.

        ------
        """
        
        if mode == 'egreedy':
            
            if np.random.random() > self.epsilon:
                
                with T.no_grad():
                    
                    state = T.tensor([observation], dtype=T.float, device = self.q_eval.device)
                    Q = self.q_eval.forward(state)
                    action_joint = []
                    
                    # In the epsilon-greedy policy, when exploiting, a = argmax_a Q(s,a)
                    for i in range(self.action_joint_dim):
                        action_joint.append(T.argmax(Q.narrow(1,i*self.n_actions,self.n_actions)).item())
    
                return action_joint
            
            else:
                action_joint = np.random.choice(self.action_space, size = self.action_joint_dim)
                return action_joint
        
        elif mode == 'softmax':
            
            with T.no_grad():

                state = T.tensor([observation], dtype=T.float, device = self.q_eval.device)
                Q = self.q_eval.forward(state)
                action_joint = []

                # Softmax policy #
                for i in range(self.action_joint_dim):
                    probs = T.softmax(Q.narrow(1,i*self.n_actions,self.n_actions)/self.temperature,dim=1)
                    categ = T.distributions.Categorical(probs)
                    action_joint.append(categ.sample().item())

                return action_joint
            
        else:
            raise ValueError('Invalid policy mode: choose egreedy or softmax')
    
    # This method is used outside the class so we can switch between DDQN and DQN #
    def replace_target_network(self, epoch):
        """

        Function to dump the behavioral network into the target network.

        -----

        Args:
            epoch: The current epoch.

        ------
        """
        """
        if epoch % self.update_target_count == 0:
            
            for target_param, param in zip(self.q_next.parameters(), self.q_eval.parameters()):
                
                target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
        """
        
        if epoch % self.update_target_count == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())
        
        
    def learn(self, mask = None):
        """

        Learning function. It predicts the return (target_network) and takes a descent gradient step. The q-values are
        calculated with Time Difference and we will accumulate the other_results of the gradients along the agents.

        ------
        """

        # If there are fewer stored experiences than the batch size,
        # skip training and return.
        if self.memory.mem_cntr < self.batch_size:
            return
        
        if mask is None:
            mask = np.zeros(shape=self.action_joint_dim)

        self.q_eval.optimizer.zero_grad()
        self.q_next.optimizer.zero_grad()

        if not self.prioritized:
            states, actions, rewards, next_states, dones = self.sample_memory()
        else:
            states, actions, rewards, next_states, dones, batches, weights = self.prioritized_sample_memory()
            prior = T.tensor(np.zeros(shape=batches.shape), device = self.q_eval.device)

        indices = np.arange(self.batch_size)
        
        Q_pred = self.q_eval(states)
        Q_next = self.q_next(next_states)
        Q_eval = self.q_eval(next_states)

        for i in range(self.action_joint_dim):
            
            if mask[i] == 1: # If the mask is 1, the agent does not learn at all #
                continue
            
            q_pred = Q_pred.narrow(1,i*self.n_actions,self.n_actions)[indices, actions[:, i]]
            max_actions = T.argmax(Q_eval.narrow(1,i*self.n_actions,self.n_actions), dim=1).detach()

            # THE TARGET IS DETACHED; ITS PARAMETERS ARE NOT SUBJECT TO TRAINING #
            q_target = rewards[indices, i] + self.gamma*Q_next.narrow(1,i*self.n_actions,self.n_actions)[indices, max_actions].detach()
            
            if not self.prioritized:
                loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
                loss.backward(retain_graph=True)

            else:
                loss = self.q_eval.loss2(q_target, q_pred).to(self.q_eval.device)*weights
                prior += loss.data.detach()
                loss = loss.mean()
                loss.backward(retain_graph=True)

        self.q_eval.optimizer.step()

        if self.prioritized:
            self.memory.update_priorities(batches, prior.cpu().numpy())
        
    def decrement_epsilon(self):
        """
        Decrement 'self.epsilon' with a 'self.eps_dec', clipping its minimum to 'self.eps_min'.

        ------
        """
        
        if self.epsilon >= self.eps_min:
            self.epsilon = self.epsilon - self.eps_dec
        else:
            self.epsilon = self.eps_min

    def increment_beta(self):

        if self.beta >= 1:
            self.beta = 1
        else:
            self.beta += self.beta_increment
            
    def decrement_temperature(self):
        
        if self.temperature < 0.005:
            self.temperature = 0.005
        else:
            self.temperature = self.temperature - 2e-4
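The prioritized branch above relies on a PrioritizedReplayBuffer that is not shown in this example. As a hedged illustration of the standard scheme it is assumed to implement, priorities are turned into sampling probabilities with exponent prob_alpha and corrected by importance-sampling weights annealed with beta:

import numpy as np

def prioritized_sample(priorities, batch_size, prob_alpha, beta):
    # P(i) is proportional to p_i ** prob_alpha; prob_alpha = 0 recovers uniform sampling
    probs = priorities ** prob_alpha
    probs /= probs.sum()
    indices = np.random.choice(len(priorities), batch_size, p=probs)
    # importance-sampling weights correct the sampling bias; beta is annealed towards 1
    weights = (len(priorities) * probs[indices]) ** (-beta)
    weights /= weights.max()
    return indices, weights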
Example #5
class Agent(object):
    '''
    Agent base class
    '''
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        # agents memory
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

    def choose_action(self, observation):
        # Example: an epsilon-greedy behavior policy for action selection
        raise NotImplementedError

    def store_transition(self, state, action, reward, state_, done):
        '''
           The agent stores transitions in its memory, samples them (converting the input
           into tensors), decays epsilon, and replaces the target network; this method
           handles storing a single transition.
        '''
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon = self.epsilon - self.eps_dec
        else:
            self.epsilon = self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        raise NotImplementedError
Example #6
class DuelingDDQNAgent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='models/'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DuelingDeepQNetwork(self.lr,
                                          self.n_actions,
                                          input_dims=self.input_dims,
                                          name=self.env_name + '_' +
                                          self.algo + '_q_eval',
                                          chkpt_dir=self.chkpt_dir)
        self.q_next = DuelingDeepQNetwork(self.lr,
                                          self.n_actions,
                                          input_dims=self.input_dims,
                                          name=self.env_name + '_' +
                                          self.algo + '_q_next',
                                          chkpt_dir=self.chkpt_dir)

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = np.array([observation], copy=False, dtype=np.float32)
            state_tensor = T.tensor(state).to(self.q_eval.device)
            _, advantages = self.q_eval.forward(state_tensor)

            action = T.argmax(advantages).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def replace_target_network(self):
        if self.replace_target_cnt is not None and \
           self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        V_s, A_s = self.q_eval.forward(states)
        V_s_, A_s_ = self.q_next.forward(states_)

        V_s_eval, A_s_eval = self.q_eval.forward(states_)

        q_pred = T.add(V_s, (A_s - A_s.mean(dim=1, keepdim=True)))[indices,
                                                                   actions]

        q_next = T.add(V_s_, (A_s_ - A_s_.mean(dim=1, keepdim=True)))

        q_eval = T.add(V_s_eval,
                       (A_s_eval - A_s_eval.mean(dim=1, keepdim=True)))

        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
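The T.add(V, A - A.mean(dim=1, keepdim=True)) expressions in learn() above implement the usual dueling aggregation Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)). A tiny self-contained check with made-up numbers:

import torch as T

V = T.tensor([[2.0]])             # state value, shape (batch, 1)
A = T.tensor([[1.0, 3.0, -1.0]])  # advantages, shape (batch, n_actions)
Q = T.add(V, A - A.mean(dim=1, keepdim=True))
print(Q)  # tensor([[2., 4., 0.]]); the Q-values average back to V(s)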
Example #7
class Agent(object):
    def __init__(self, n_actions, input_dims):
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.epsilon = Config.epsilon
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(input_dims, n_actions)
        name_root = Config.env_name + '_' + Config.algo
        self.q_eval = Network(self.n_actions,
                              input_dims=self.input_dims,
                              name=name_root + '_q_eval')
        self.q_next = Network(self.n_actions,
                              input_dims=self.input_dims,
                              name=name_root + '_q_next')

    def store_transition(self, state, action, reward, state_new, done):
        self.memory.store_transition(state, action, reward, state_new, done)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(
            Config.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = np.array([observation], copy=False, dtype=np.float32)
            state_tensor = T.tensor(state).to(self.q_eval.device)
            _, advantages = self.q_eval.forward(state_tensor)

            action = T.argmax(advantages).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def replace_target_network(self):
        if Config.replace_target_cnt is not None and self.learn_step_counter % Config.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decay_epsilon(self):
        new_value = self.epsilon * Config.eps_decay
        if new_value > Config.eps_min:
            self.epsilon = new_value

    def learn(self):
        if self.memory.mem_cntr < Config.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_new, dones = self.sample_memory()
        indices = np.arange(Config.batch_size)

        Value_stream, Advantage_stream = self.q_eval.forward(states)
        Value_stream_new, Advantage_stream_new = self.q_next.forward(
            states_new)
        Value_stream_eval, Advantage_stream_eval = self.q_eval.forward(
            states_new)

        q_pred = T.add(Value_stream,
                       (Advantage_stream -
                        Advantage_stream.mean(dim=1, keepdim=True)))[indices,
                                                                     actions]
        q_next = T.add(Value_stream_new,
                       (Advantage_stream_new -
                        Advantage_stream_new.mean(dim=1, keepdim=True)))
        q_eval = T.add(Value_stream_eval,
                       (Advantage_stream_eval -
                        Advantage_stream_eval.mean(dim=1, keepdim=True)))

        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0
        q_target = rewards + Config.gamma * q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decay_epsilon()

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
Example #8
class DQNAgent():
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 chkpt_dir,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None):
        self.gamma = gamma  # 0.99
        self.epsilon = epsilon  # 1.0
        self.lr = lr  # 0.0001
        self.n_actions = n_actions  # 6
        self.input_dims = input_dims  # (4, 84, 84)
        self.batch_size = batch_size  # 32
        self.eps_min = eps_min  # 0.1
        self.eps_dec = eps_dec  # 1e-05
        self.replace_target_cnt = replace  # 1000
        self.algo = algo  # 'DQNAgent'
        self.env_name = env_name  #  'PongNoFrameskip-v4'
        self.chkpt_dir = chkpt_dir  #  .\\models\\
        self.action_space = [i for i in range(self.n_actions)]  # [0, 1, 2, 3, 4, 5]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)
        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)
        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            # load_state_dict and state_dict are inbuilt functions of torch
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()

        indices = np.arange(self.batch_size)
        # self.q_eval.forward(states).shape = (32, 6); q_pred.shape = (32,)
        q_pred = self.q_eval.forward(states)[indices, actions]
        # self.q_next.forward(states_).shape = (32, 6); q_next.shape = (32,)
        q_next = self.q_next.forward(states_).max(dim=1)[0]

        temp_dones = dones.bool()
        q_next[temp_dones] = 0.0  # the value of a terminal next state is 0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()
Example #9
class Agent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, eps_min=.01, eps_dec=5e-7,
                 replace_count=1000, algorithm=None, env_name=None, checkpoint_dir='/checkpoints'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_count = replace_count
        self.algorithm = algorithm
        self.env_name = env_name
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        print(type(self).__name__)
        self.q_eval = object
        self.q_policy = object

    def store_transition(self, current_state, action, reward, next_state, done):
        self.memory.store_transition(current_state, action, reward, next_state, done)

    def sample_memory(self):
        current_state, action, reward, next_state, done = self.memory.sample_buffer(self.batch_size)
        current_states = T.tensor(current_state).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        next_states = T.tensor(next_state).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)

        return current_states, actions, rewards, next_states, dones

    def update_policy_network(self):
        if(self.learn_step_counter % self.replace_count == 0):
            self.q_policy.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_policy.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_policy.load_checkpoint()

    def pre_learn(self):
        if self.memory.mem_idx < self.batch_size:
            return
        self.q_eval.optimizer.zero_grad()
        self.update_policy_network()
        current_states, actions, rewards, next_states, dones = self.sample_memory()
        indices = np.arange(self.batch_size)
        return current_states, actions, rewards, next_states, dones, indices

    def post_learn(self, q_target, q_pred):
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()

    def choose_action(self, observation, network):
        raise NotImplementedError

    def learn(self):
        raise NotImplementedError
Example #10
class DQNAgent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)

        #self.dim_bechir = self.q_eval.calculate_output_bechir(self.input_dims)

    def choose_action(self, observation):
        """
        Choose an action through an epsilon-greedy approach.
        :param observation: state features as provided by gym environment.
        :return: action
        """
        if np.random.random() > self.epsilon:
            # Convert state to Pytorch tensor and send to q_eval.device
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            # Get actions values from q_eval network
            actions = self.q_eval.forward(state)
            # Get action with highest value
            action = T.argmax(actions).item()
        else:
            # Select random action from action space
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        # If memory counter has not reached batch size simply return
        if self.memory.mem_cntr < self.batch_size:
            return

        # reset gradients for the main network's optimizer
        self.q_eval.optimizer.zero_grad()

        # Call function to update target network weights every n steps
        self.replace_target_network()

        # Sample environment transitions from the replay buffer
        states, actions, rewards, states_, dones = self.sample_memory()

        # Get Q(s,a) for the actions performed by the agent.
        # Because we processed a batch of states, we need to index the result of the forward function by the indices of
        # the states (from 0 to batch_size) followed by the index of the action performed by the agent.
        indices = np.arange(self.batch_size)
        q_pred = self.q_eval.forward(states)[indices, actions]

        # Get max Q(s', a') from target network
        q_next = self.q_next.forward(states_).max(dim=1)[0]

        # Set Q(s', a') to zero for terminal states
        q_next[dones] = 0.0
        # Compute the q_target as r + gamma * Q(s',a')
        q_target = rewards + self.gamma * q_next

        # Compute the loss tensor and move it to q_eval.device
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)

        # Backpropagate loss and optimize network parameters
        loss.backward()
        self.q_eval.optimizer.step()

        # Increment training counter
        self.learn_step_counter += 1

        # Decrement epsilon for epsilon-greedy action selection
        self.decrement_epsilon()
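To see the indexing and terminal-masking steps commented in learn() in isolation, here is a small sketch with made-up batch values (illustrative only, not part of the original agent):

import numpy as np
import torch as T

gamma = 0.99
q_all = T.tensor([[0.1, 0.5], [0.2, 0.4], [0.9, 0.3]])  # Q(s, .) for 3 states and 2 actions
actions = T.tensor([1, 0, 0])
indices = np.arange(3)

q_pred = q_all[indices, actions]      # tensor([0.5, 0.2, 0.9]): Q(s,a) of the taken actions
q_next = T.tensor([1.0, 2.0, 3.0])    # max_a' Q_target(s', a')
dones = T.tensor([False, True, False])
q_next[dones] = 0.0                   # terminal next states contribute no future value
rewards = T.tensor([1.0, 1.0, 1.0])
q_target = rewards + gamma * q_next   # tensor([1.9900, 1.0000, 3.9700])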
Example #11
class DDQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, chkpt_name, eps_min, eps_dec, replace,
                 logging_dir):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace

        self.chkpt_dir = chkpt_name
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = FullyConnectedNet(self.lr,
                                        self.n_actions,
                                        input_dims=self.input_dims,
                                        chkpt_name=self.chkpt_dir,
                                        name='q_eval',
                                        logging_dir=logging_dir)

        self.q_next = FullyConnectedNet(self.lr,
                                        self.n_actions,
                                        input_dims=self.input_dims,
                                        name='q_next',
                                        chkpt_name=self.chkpt_dir,
                                        logging_dir=logging_dir)

    def select_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self, freeze):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()
        #self.q_next.load_state_dict(self.q_eval.state_dict())

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_)
        q_eval = self.q_eval.forward(states_)

        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next[indices, max_actions]
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()

        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()

        if freeze:

            #self.q_eval.conv1.weight.requires_grad = False
            #self.q_eval.conv2.weight.requires_grad = False
            #self.q_eval.conv3.weight.requires_grad = False
            #print('conv nets are frozen')
            pass
Example #12
class DDQNAgent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DDQN(self.lr,
                           self.n_actions,
                           input_dims=self.input_dims,
                           name=self.env_name + '_' + self.algo + '_q_eval',
                           chkpt_dir=self.chkpt_dir)

        self.q_next = DDQN(self.lr,
                           self.n_actions,
                           input_dims=self.input_dims,
                           name=self.env_name + '_' + self.algo + '_q_next',
                           chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if tf.random.uniform([1]) > self.epsilon:

            actions = self.q_eval.call(tf.expand_dims(observation, axis=0))

            action = int(tf.argmax(actions, axis=1).numpy()[0])

        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memory.sample_buffer(
            self.batch_size)

        return states, actions, rewards, new_states, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.set_weights(self.q_eval.get_weights())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        optimizer = keras.optimizers.RMSprop(learning_rate=self.lr)
        indices = tf.range(self.batch_size)

        with tf.GradientTape() as tape:
            # predicted Q-values for the actions taken, sampled from memory
            q_pred = tf.gather_nd(self.q_eval.call(states),
                                  list(zip(indices, actions)))

            # forward pass of the next states through the target network
            q_next = self.q_next.call(states_)

            # forward pass of the next states through the online (policy) network
            q_eval = self.q_eval.call(states_)

            # DDQN action selection: pick the actions with the highest Q-values according to the online network
            max_actions = tf.math.argmax(q_eval, axis=1)

            # zero out the values of terminal next states
            q_next = q_next * tf.expand_dims(1.0 - tf.cast(dones, tf.float32), -1)

            # DDQN action evaluation: take the target network's Q-values for those selected actions
            gather = tf.gather_nd(
                q_next, list(zip(indices, tf.cast(max_actions,
                                                  dtype=tf.int32))))

            #q value update
            q_target = rewards + self.gamma * tf.cast(gather, dtype=tf.float32)

            #loss calculation
            loss = keras.losses.MSE(q_target, q_pred)
        gradient = tape.gradient(loss, self.q_eval.trainable_variables)
        optimizer.apply_gradients(
            zip(gradient, self.q_eval.trainable_variables))
        self.decrement_epsilon()
        self.learn_step_counter += 1
Example #13
class SACAgent():
    def __init__(self, alpha, beta, input_dims, tau, gamma=0.99, max_action=1.0, \
                    n_actions=2, max_size=1000000, layer1_size=400, \
                    layer2_size=300, batch_size=100, reward_scale=2, path_dir='model/sac'):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  layer1_size,
                                  layer2_size,
                                  n_actions=n_actions,
                                  name='_actor',
                                  max_action=max_action,
                                  chkpt_dir=path_dir)
        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      layer1_size,
                                      layer2_size,
                                      n_actions=n_actions,
                                      name='_critic_1',
                                      chkpt_dir=path_dir)
        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      layer1_size,
                                      layer2_size,
                                      n_actions=n_actions,
                                      name='_critic_2',
                                      chkpt_dir=path_dir)
        self.value = ValueNetwork(beta,
                                  input_dims,
                                  layer1_size,
                                  layer2_size,
                                  name='_value',
                                  chkpt_dir=path_dir)
        self.target_value = ValueNetwork(beta,
                                         input_dims,
                                         layer1_size,
                                         layer2_size,
                                         name='_target_value',
                                         chkpt_dir=path_dir)

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + (
                1 - tau) * target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done).to(self.critic_1.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device)
        state = T.tensor(state, dtype=T.float).to(self.critic_1.device)
        action = T.tensor(action, dtype=T.float).to(self.critic_1.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()

    def save_models(self, episode):
        self.actor.save_checkpoint(episode)
        self.value.save_checkpoint(episode)
        self.target_value.save_checkpoint(episode)
        self.critic_1.save_checkpoint(episode)
        self.critic_2.save_checkpoint(episode)

    def load_models(self, episode):
        self.actor.load_checkpoint(episode)
        self.value.load_checkpoint(episode)
        self.target_value.load_checkpoint(episode)
        self.critic_1.load_checkpoint(episode)
        self.critic_2.load_checkpoint(episode)
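update_network_parameters above performs a Polyak (soft) update of the target value network, blending tau of the online value parameters with (1 - tau) of the current target. A roughly equivalent sketch, assuming two torch.nn.Module value networks:

import torch as T

def soft_update(online, target, tau):
    # new_target = tau * online + (1 - tau) * target, applied parameter-wise
    with T.no_grad():
        for t_param, o_param in zip(target.parameters(), online.parameters()):
            t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)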
Example #14
class DQNAgent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)
        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)
        # return the done tensor (not the raw array) so learn() can index with it
        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()
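A minimal sketch of how this DQNAgent might be driven, assuming the classic Gym reset/step API and that the ReplayBuffer and DeepQNetwork classes it references are defined elsewhere in the project. The environment name and every hyperparameter below are placeholders; the chosen environment has to produce observations that match whatever input_dims DeepQNetwork expects.
import gym

# placeholder environment and hyperparameters, for illustration only
env = gym.make('CartPole-v1')
agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                 n_actions=env.action_space.n,
                 input_dims=env.observation_space.shape,
                 mem_size=50000, batch_size=32,
                 algo='DQNAgent', env_name='CartPole-v1')

for episode in range(500):
    observation = env.reset()
    done, score = False, 0.0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_
        score += reward
    print('episode', episode, 'score', score, 'epsilon', agent.epsilon)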
Example #15
class Agent():
    def __init__(self,
                 input_dims,
                 n_actions,
                 lr,
                 mem_size,
                 batch_size,
                 epsilon,
                 gamma=0.99,
                 eps_dec=5e-7,
                 eps_min=0.01,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 checkpoint_dir='tmp/dqn'):
        self.lr = lr
        self.batch_size = batch_size
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = eps_dec
        self.eps_min = eps_min
        self.replace = replace
        self.algo = algo
        self.env_name = env_name
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + " " + self.algo +
                                   "_q_eval",
                                   checkpoint_dir=self.checkpoint_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + " " + self.algo +
                                   "_q_next",
                                   checkpoint_dir=self.checkpoint_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            # wrap the observation in a list so the network receives a batch
            # dimension: the convolutional layers expect input of shape
            # (batch_size, *input_dims)
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            q_values = self.q_eval.forward(state)
            action = T.argmax(q_values).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, resulted_state, done):
        self.memory.store_transition(state, action, reward, resulted_state,
                                     done)

    def sample_memory(self):
        state, action, reward, resulted_state, done = self.memory.sample_buffer(
            self.batch_size)
        state = T.tensor(state).to(self.q_eval.device)
        reward = T.tensor(reward).to(self.q_eval.device)
        done = T.tensor(done).to(self.q_eval.device)
        action = T.tensor(action).to(self.q_eval.device)
        resulted_state = T.tensor(resulted_state).to(self.q_eval.device)

        return state, reward, done, action, resulted_state

    def replace_target_network(self):
        if self.learn_step_counter % self.replace == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon -= self.eps_dec
        else:
            self.epsilon = self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        state, reward, done, action, resulted_state = self.sample_memory()

        indexes = np.arange(self.batch_size, dtype=np.longlong)
        action = action.long()
        done = done.bool()

        # q_eval output has shape (batch_size, n_actions); indexing with
        # (indexes, action) keeps only the Q-value of the action taken
        prediction = self.q_eval.forward(state)[indexes, action]

        next_result = self.q_next.forward(resulted_state).max(dim=1)[0]
        next_result[done] = 0.0  # for terminal states, target should be reward
        target = reward + self.gamma * next_result

        loss = self.q_eval.loss(target, prediction).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()
Example #16
class DQNAgent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DQN(self.lr,
                          self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + self.algo + '_q_eval',
                          chkpt_dir=self.chkpt_dir)

        self.q_next = DQN(self.lr,
                          self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + self.algo + '_q_next',
                          chkpt_dir=self.chkpt_dir)

        # one optimizer instance, created once, so that RMSprop's running
        # gradient statistics persist across learn() calls
        self.optimizer = keras.optimizers.RMSprop(learning_rate=self.lr)

    def choose_action(self, observation):
        if tf.random.uniform(()) > self.epsilon:
            actions = self.q_eval.call(tf.expand_dims(observation, axis=0))
            # convert the greedy action to a plain int, matching the
            # exploratory branch below
            action = int(tf.argmax(actions, axis=1)[0])
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memory.sample_buffer(
            self.batch_size)

        return states, actions, rewards, new_states, dones

    def replace_target_network(self):
        # copy the online weights into the target network; aliasing the two
        # models (q_next = q_eval) would make the target track every update.
        # The copy is skipped until q_next has been built by a forward pass.
        if self.learn_step_counter % self.replace_target_cnt == 0 \
                and self.q_next.weights:
            self.q_next.set_weights(self.q_eval.get_weights())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        indices = tf.range(self.batch_size)
        with tf.GradientTape() as tape:
            # Q(s, a) for the actions actually taken in the batch
            q_pred = tf.gather_nd(
                self.q_eval.call(states),
                tf.stack([indices, tf.cast(actions, tf.int32)], axis=1))
            # bootstrap from the target network and zero out terminal states
            # (assuming the buffer stores done flags, 1 = terminal)
            q_next = tf.math.reduce_max(self.q_next.call(states_), axis=1)
            q_next = q_next * (1.0 - tf.cast(dones, tf.float32))
            q_target = tf.cast(rewards, tf.float32) + self.gamma * q_next
            loss = keras.losses.MSE(q_target, q_pred)
        gradient = tape.gradient(loss, self.q_eval.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradient, self.q_eval.trainable_variables))
        self.decrement_epsilon()
        self.learn_step_counter += 1
Example #17
class Agent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 checkpoint_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_counter = replace
        self.algo = algo
        self.env_name = env_name
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   "_q_eval",
                                   checkpoint_dir=self.checkpoint_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   "_q_next",
                                   checkpoint_dir=self.checkpoint_dir)

    def store_transition(self, state, action, reward, resulted_state, done):
        self.memory.store_transition(state, action, reward, resulted_state,
                                     done)

    def sample_memory(self):
        state, action, reward, resulted_state, done = self.memory.sample_buffer(
            self.batch_size)
        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        resulted_states = T.tensor(resulted_state).to(self.q_eval.device)

        return states, actions, rewards, resulted_states, dones

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            # wrap the observation in a list so the network receives a batch
            # dimension: the convolutional layers expect input of shape
            # (batch_size, *input_dims)
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            _, advantages = self.q_eval.forward(state)
            action = T.argmax(advantages).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def replace_target_network(self):
        if self.replace_target_counter is not None and \
            self.learn_step_counter % self.replace_target_counter == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon = self.epsilon - self.eps_dec
        else:
            self.epsilon = self.eps_min

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, resulted_states, dones = self.sample_memory()

        indexes = np.arange(self.batch_size)

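        # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a));
        # subtracting the mean advantage keeps V and A identifiable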
        V_states, A_states = self.q_eval.forward(states)
        q_pred = T.add(
            V_states, (A_states - A_states.mean(dim=1, keepdim=True)))[indexes,
                                                                       actions]

        V_resulted_states, A_resulted_states = self.q_next.forward(
            resulted_states)
        q_next = T.add(
            V_resulted_states,
            (A_resulted_states -
             A_resulted_states.mean(dim=1, keepdim=True))).max(dim=1)[0]
        q_next[dones] = 0.0

        target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
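Example #17 assumes a DeepQNetwork whose forward() returns a (value, advantage) pair. A minimal dueling-architecture sketch under that assumption follows; the hidden-layer size, the RMSprop optimizer, the MSE loss, and the checkpoint handling are hypothetical choices rather than code from the original project, and the sketch assumes flat 1-D observations rather than the convolutional stack the agent's comments hint at. Only the constructor keywords and the two-stream forward() mirror the agent code above.
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class DeepQNetwork(nn.Module):
    # hypothetical dueling network: forward() returns (V, A) as the agent expects
    def __init__(self, lr, n_actions, input_dims, name, checkpoint_dir):
        super().__init__()
        self.checkpoint_file = os.path.join(checkpoint_dir, name)

        self.fc1 = nn.Linear(*input_dims, 512)  # placeholder hidden size
        self.V = nn.Linear(512, 1)              # state-value stream
        self.A = nn.Linear(512, n_actions)      # advantage stream

        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        return self.V(x), self.A(x)  # the agent combines these as V + (A - mean A)

    def save_checkpoint(self):
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))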