u_value = critic(cur_state)
u_value_list = torch.cat([u_value_list, u_value])

# Update parameters of critic by TD(0)
# TODO: Use TD(lambda) here and compare the performance
# target = reward + gamma * critic(next_state)
# Using (1 - done) even in the target for the actor since the next state
# won't have any meaning when done == 1
# TODO: Remove this line if (1 - done) is a wrong concept in the actor
target = reward + gamma * (1 - done) * critic_old(next_state)
target_list = torch.cat([target_list, target])

replay_buffer.add(cur_state, action, next_state, reward, done)

# sample a minibatch of transitions from the replay buffer;
# the sampling is done every timestep and not every episode
sample_transitions = replay_buffer.sample_pytorch(sample_size=32)

# update the critic's value approximation using the sampled transitions
running_loss1_mean += update_critic(critic_old, **sample_transitions)

# This section was for actor experience replay, which to my dismay performed
# much worse than without replay.
# actor_replay_buffer.add(target, u_value, -log_prob)
# sample_objectives = actor_replay_buffer.sample(sample_size=32)
# actor_optimizer.zero_grad()
# # compute the gradient from the sampled log probability:
# # the log probability times the Q of the action that was just taken in that state
# """Important note"""
# # Reward scaling: this performs much better. In the general case this might
# # not be a good idea. If there are rare events with extremely high rewards
# # that only occur in some episodes, and the majority of episodes only
# # experience common events with lower-scale rewards, then this trick will
# # mess up training. In the CartPole environment this is not a concern,
# # since all the rewards are 1 anyway.
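# For reference, a minimal commented-out sketch of what update_critic might look
# like here. This is an assumption, not the actual implementation: the keyword
# names (cur_states, actions, next_states, rewards, dones) must match whatever
# replay_buffer.sample_pytorch() returns, and critic / critic_optimizer / gamma
# are taken from the enclosing scope. It fits a TD(0) regression of V(s) onto
# the bootstrapped target computed with the frozen target network critic_old.
#
# def update_critic(critic_old, cur_states, actions, next_states, rewards, dones):
#     with torch.no_grad():
#         # mask out the bootstrap term for terminal transitions
#         targets = rewards + gamma * (1 - dones) * critic_old(next_states).squeeze(-1)
#     values = critic(cur_states).squeeze(-1)
#     loss = torch.nn.functional.mse_loss(values, targets)
#     critic_optimizer.zero_grad()
#     loss.backward()
#     critic_optimizer.step()
#     return loss.item()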
if done:
    if episode_timestep <= 170:
        reward = -500
    else:
        reward = 50
else:
    reward = 20

u_value = critic(cur_state)
target = reward + gamma * (1 - done) * critic(next_state)

replay_buffer.add(cur_state, action, next_state, reward, done)

# sample a minibatch of transitions from the replay buffer;
# the sampling is done every timestep and not every episode
sample_transitions = replay_buffer.sample_pytorch()

# update the critic's value approximation using the sampled transitions
running_loss1_mean += update_critic(**sample_transitions)

target_list = torch.cat([target_list, target])
u_value_list = torch.cat([u_value_list, u_value])
log_prob_list = torch.cat([log_prob_list, log_prob.reshape(-1)])

episode_reward += reward
episode_timestep += 1
cur_state = next_state

# Update parameters of the actor by policy gradient
actor_optimizer.zero_grad()
# compute the gradient from the sampled log probability:
# the log probability times the Q of the action that was just taken in that state
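# A hedged sketch of the actor loss that the comment above describes, assuming
# the weight on each log probability is the TD target (an estimate of Q) with
# the state value subtracted as a baseline, i.e. the advantage. The detach()
# keeps the actor update from backpropagating into the critic. The variable
# name `advantage` is illustrative only.
#
# advantage = (target_list - u_value_list).detach()
# actor_loss = -(log_prob_list * advantage.reshape(-1)).mean()
# actor_loss.backward()
# actor_optimizer.step()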