Example #1
    def step(self, states, actions, rewards, next_states, dones, running_timestep):
    
        # Store experience to the replay buffer
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            # print('adding: ', state.shape, action.shape, reward, next_state.shape, done)
            self.memory.add(state, action, reward, next_state, done)
    
        # Once the memory holds at least BATCH_SIZE experiences and the step count is a
        # multiple of LEARNING_FREQUENCY, sample a batch and learn the parameters of the
        # local network; the target network weights are then hard- or soft-updated below.
        if (running_timestep % self.LEARNING_FREQUENCY) == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, self.GAMMA, running_timestep)

        if running_timestep > self.DATA_TO_BUFFER_BEFORE_LEARNING:
            if self.IS_HARD_UPDATE:
                if (running_timestep % self.HARD_UPDATE_FREQUENCY) == 0:
                    utils.hard_update(self.actor_local, self.actor_target)
    
            elif self.IS_SOFT_UPDATE:
                if (running_timestep % self.SOFT_UPDATE_FREQUENCY) == 0:
                    utils.soft_update(self.critic_local, self.critic_target, self.TAU)
                    utils.soft_update(self.actor_local, self.actor_target, self.TAU)
            else:
                raise ValueError('Exactly one of HARD_UPDATE and SOFT_UPDATE must be activated')
Example #2
    def update(self, running_time_step):
        """Sync the target networks with the local networks, either by a periodic
        hard copy or by a periodic soft (Polyak) update."""
        if self.IS_HARD_UPDATE:
            if (running_time_step % self.HARD_UPDATE_FREQUENCY) == 0:
                utils.hard_update(self.actor_local, self.actor_target)

        elif self.IS_SOFT_UPDATE:
            if (running_time_step % self.SOFT_UPDATE_FREQUENCY) == 0:
                utils.soft_update(self.critic_local, self.critic_target, self.TAU)
                utils.soft_update(self.actor_local, self.actor_target, self.TAU)
        else:
            raise ValueError('Exactly one of HARD_UPDATE and SOFT_UPDATE must be activated')
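The `utils.hard_update` and `utils.soft_update` helpers called in Examples #1 and #2 are not shown on this page. Below is a minimal sketch of what they typically look like, assuming the `(local_model, target_model)` / `(local_model, target_model, tau)` argument order used in Examples #1, #2, and #4 (note that Example #3 passes the target network first, so its `utils` module may use the opposite convention): a plain parameter copy for the hard update, and Polyak averaging for the soft update.

import torch

# Sketch only: the signatures are inferred from how the examples call these
# helpers, not taken from the original utils module.

def hard_update(local_model: torch.nn.Module, target_model: torch.nn.Module) -> None:
    """Copy every parameter of the local network into the target network."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(local_param.data)


def soft_update(local_model: torch.nn.Module, target_model: torch.nn.Module,
                tau: float) -> None:
    """Polyak averaging: theta_target <- tau * theta_local + (1 - tau) * theta_target."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)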
Example #3
    def optimize(self):
        """
        Samples a random batch from replay memory and performs optimization
        :return:
        """
        for i in range(self.num_agents):

            s, a, r, ns = self.memories[i].sample(self.batch_size)

            # Evaluate the target network on every next state and keep the
            # maximum predicted value (a DQN-style bootstrap target).
            reward_predict = []
            pre_acts = []
            for state in ns:
                next_value = torch.max(self.target_actors[i](
                    torch.from_numpy(np.array([state])).to(self.device))
                ).to('cpu').data.numpy()
                reward_predict.append(next_value)

            reward_predict = np.array(reward_predict)
            pre_acts = np.array(pre_acts, dtype=np.float32)  # never populated in this snippet

            s = Variable(torch.from_numpy(s).to(self.device))
            a = Variable(torch.from_numpy(a).to(self.device))
            r = Variable(torch.from_numpy(r).to(self.device))
            ns = Variable(torch.from_numpy(ns).to(self.device))
            pre_acts = Variable(torch.from_numpy(pre_acts).to(self.device))
            reward_predict = torch.squeeze(
                torch.from_numpy(reward_predict).to(self.device))
            ''' ---------------------- optimize ----------------------
            Build the bootstrap target from the target network's predictions:
            y_exp  = r + gamma * max_a Q'(s2, a)
            y_pred = max_a Q(s1, a)
            '''
            y_expected = r + self.gamma * reward_predict
            y_predicted = torch.squeeze(self.actors[i].forward(s))
            # print(y_predicted)

            y_predicted = torch.amax(y_predicted, dim=-1)
            # print(y_expected)
            # print(y_predicted)
            ''' compute the loss between prediction and target, and update the online (actor) network '''
            loss_actor = F.mse_loss(y_predicted, y_expected)
            # print(y_expected)
            self.actor_optimizers[i].zero_grad()
            loss_actor.backward()
            self.actor_optimizers[i].step()

            utils.soft_update(self.target_actors[i], self.actors[i], self.tau)
            self.actor_loss_value = loss_actor.to('cpu').data.numpy()
            # for param in self.actors[i].parameters():
            #     print(param.data)
        self.iter += 1
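The per-sample loop in Example #3 builds its bootstrap target one next state at a time. The same target can be computed in a single batched pass; the sketch below is a hypothetical, self-contained illustration of that idea (the `policy_net`/`target_net` networks, tensor shapes, and hyperparameters are assumptions for the sketch, not part of the original code).

import torch
import torch.nn.functional as F

# Hypothetical setup: a batch of 32 states with 8 features and 4 discrete actions.
batch_size, state_dim, n_actions, gamma = 32, 8, 4, 0.99
policy_net = torch.nn.Linear(state_dim, n_actions)   # stand-in for self.actors[i]
target_net = torch.nn.Linear(state_dim, n_actions)   # stand-in for self.target_actors[i]

states = torch.randn(batch_size, state_dim)
next_states = torch.randn(batch_size, state_dim)
rewards = torch.randn(batch_size)

# Batched equivalent of the per-state loop: max_a Q'(s', a) for every next state at once.
with torch.no_grad():
    next_values = target_net(next_states).amax(dim=-1)
y_expected = rewards + gamma * next_values            # y_exp  = r + gamma * max_a Q'(s', a)
y_predicted = policy_net(states).amax(dim=-1)         # y_pred = max_a Q(s, a)

loss = F.mse_loss(y_predicted, y_expected)
loss.backward()                                       # gradients flow only into policy_net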
Example #4
    def step(self, states, actions, rewards, next_states, dones,
             running_timestep):
        # Insert the experience tuple into the replay memory buffer
        self.memory.add(states, actions, rewards, next_states, dones)

        if (running_timestep % self.LEARNING_FREQUENCY) == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, running_timestep)

        if running_timestep > self.DATA_TO_BUFFER_BEFORE_LEARNING:
            if self.IS_HARD_UPDATE:
                if (running_timestep % self.HARD_UPDATE_FREQUENCY) == 0:
                    utils.hard_update(self.local_network, self.target_network)

            elif self.IS_SOFT_UPDATE:
                if (running_timestep % self.SOFT_UPDATE_FREQUENCY) == 0:
                    utils.soft_update(self.local_network, self.target_network,
                                      self.TAU)
            else:
                raise ValueError(
                    'Exactly one of HARD_UPDATE and SOFT_UPDATE must be activated'
                )
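Examples #1 and #4 also depend on a replay memory object exposing `add`, `sample`, and `__len__`, which is likewise not shown here. A minimal sketch of such a buffer is given below, under the assumption that `sample` draws a uniform random batch of stored tuples (real implementations usually also stack the fields into tensors before returning them).

import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'next_state', 'done'])


class ReplayBuffer:
    """Fixed-size buffer of experience tuples for off-policy learning (sketch)."""

    def __init__(self, buffer_size: int, batch_size: int, seed: int = 0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.rng = random.Random(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniform sampling without replacement; no prioritisation.
        return self.rng.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)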