                torch.tensor(np.array([done])).to(device).float()))
        state = next_state
        episode_rewards += reward
        if done:
            break

    rewards.append(episode_rewards)

    # Train the model if memory is sufficient
    if len(memory) + len(episode_transitions) >= min_buffer:
        for i in range(train_steps):
            loss = optimize(model, target, memory, episode_transitions, optimizer)
            losses.append(loss.item())
    memory.extend(episode_transitions)

    # Update target network every once in a while
    if episode % target_update == 0:
        target.load_state_dict(model.state_dict())
        target.eval()

    if episode % print_interval == 0 and episode > 0:
        print("[Episode {}] | avg rewards : {:.3f} | s.d. rewards: {:.3f} | avg loss : {:.10f} | buffer size : {} | epsilon : {:.1f}%".format(
            episode, np.mean(rewards), np.std(rewards), np.mean(losses), len(memory), epsilon * 100))
        rewards = []
        losses = []

    if episode % SAVE_INTERVAL == 0 and episode > 0:
        torch.save(model.state_dict(), SAVE_PATH)
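
# --- Sketch: a standard DQN optimize() step --------------------------------
# The loop above calls optimize(model, target, memory, episode_transitions,
# optimizer), whose body is not shown in this excerpt. Below is a minimal
# sketch consistent with that call signature. The (state, action, reward,
# next_state, done) transition layout, BATCH_SIZE, and GAMMA are assumptions,
# not taken from the source.
import random

import torch
import torch.nn.functional as F

BATCH_SIZE = 32   # assumed batch size
GAMMA = 0.99      # assumed discount factor


def optimize(model, target, memory, episode_transitions, optimizer):
    # Sample from the replay buffer plus the not-yet-stored current episode,
    # mirroring the len(memory) + len(episode_transitions) check above.
    batch = random.sample(list(memory) + list(episode_transitions), BATCH_SIZE)
    states, actions, rewards, next_states, dones = map(torch.stack, zip(*batch))
    rewards, dones = rewards.view(-1), dones.view(-1)

    # Q(s, a) for the actions that were actually taken.
    q_values = model(states).gather(1, actions.long().view(-1, 1)).squeeze(1)

    # Bootstrapped TD target from the frozen target network; the done flag
    # masks out the bootstrap term on terminal transitions.
    with torch.no_grad():
        next_q = target(next_states).max(dim=1)[0]
        td_target = rewards + GAMMA * next_q * (1 - dones)

    loss = F.smooth_l1_loss(q_values, td_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss
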
        episode_rewards += reward
        if done:
            break

    rewards.append(episode_rewards)

    # Train the model if both buffers are sufficient
    if len(memory_success) >= min_buffer and len(memory_failure) >= min_buffer:
        for i in range(train_steps):
            loss = optimize(model, target, memory_success, memory_failure,
                            episode_transitions, optimizer)
            losses.append(loss.item())

    # Update target network every once in a while
    if episode % target_update == 0:
        target.load_state_dict(model.state_dict())
        target.eval()

    # Route the episode's transitions into the success or failure buffer
    # based on its total return
    if episode_rewards > 0:
        memory_success.extend(episode_transitions)
    else:
        memory_failure.extend(episode_transitions)

    if episode % print_interval == 0 and episode > 0:
        print("[Episode {}] | avg rewards : {:.3f} | s.d. rewards: {:.3f} | avg loss : {:.10f} | succ. buffer : {} | fail. buffer : {}".format(
            episode, np.mean(rewards), np.std(rewards), np.mean(losses), len(memory_success), len(memory_failure)))
        rewards = []
        losses = []
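
# --- Sketch: balanced sampling for the two-buffer variant -------------------
# The second loop keeps separate success/failure replay buffers and passes
# both to optimize(). One plausible reading is that each batch is drawn
# half-and-half so rare successful episodes are not drowned out by failures;
# that split (and sample_balanced itself) is an assumption, not shown in the
# source. The TD update would then proceed exactly as in the sketch above.
import random


def sample_balanced(memory_success, memory_failure, batch_size=32):
    # Equal halves from each buffer; the min_buffer check in the loop above
    # guarantees both buffers hold enough transitions to sample from.
    half = batch_size // 2
    return (random.sample(list(memory_success), half)
            + random.sample(list(memory_failure), batch_size - half))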