print('loss Player2 %s' % all_losses2[-1])
print("---------------------------")

timer.update(time.time())
timediff = timer.getTimeDiff()
total_time = timer.getTotalTime()
loopstogo = (num_frames - i_update) / 100
estimatedtimetogo = timer.getTimeToGo(loopstogo)
logger.printDayFormat("runtime last epochs: ", timediff)
logger.printDayFormat("total runtime: ", total_time)
logger.printDayFormat("estimated time to run: ", estimatedtimetogo)
print("######## {0} ########".format(sys.argv[1]))

rollout1.after_update()  # player1
rollout2.after_update()  # player2

# snapshot of rewards, losses and weights for both players every 1000 epochs;
# swich_variable alternates between 0 and 1, so the two most recent snapshots are kept
if i_update % 1000 == 0 and i_update > 0:
    logger.log(all_rewards1, "Data/", "all_rewards_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))
    logger.log(all_losses1, "Data/", "all_losses_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))
    logger.log_state_dict(agent1.state_dict(), "Data/agents/agent1_{0}_{1}".format(sys.argv[1], swich_variable))
    logger.log(all_rewards2, "Data/", "all_rewards_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))
    logger.log(all_losses2, "Data/", "all_losses_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))
    logger.log_state_dict(agent2.state_dict(), "Data/agents/agent2_{0}_{1}".format(sys.argv[1], swich_variable))
    swich_variable += 1
    swich_variable %= 2

# final save
logger.log(all_rewards1, "Data/", "all_rewards_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))
logger.log(all_losses1, "Data/", "all_losses_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))
logger.log_state_dict(agent1.state_dict(), "Data/agents/agent1_{0}_{1}".format(sys.argv[1], swich_variable))
logger.log(all_rewards2, "Data/", "all_rewards_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))
logger.log(all_losses2, "Data/", "all_losses_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))
logger.log_state_dict(agent2.state_dict(), "Data/agents/agent2_{0}_{1}".format(sys.argv[1], swich_variable))
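The timing calls above (timer.update, getTimeDiff, getTotalTime, getTimeToGo) and logger.printDayFormat come from small helper utilities in the project whose implementation is not shown here. The following is a minimal, hypothetical sketch of what such a Timer could look like, assuming update() records wall-clock timestamps and getTimeToGo() extrapolates from the most recent logging interval; the project's real helper may differ.

import time

class Timer:
    # Hypothetical sketch of the timing helper used above.
    def __init__(self, start):
        self.start = start      # wall-clock time when training began
        self.last = start       # time of the previous update() call
        self.diff = 0.0         # seconds between the two most recent updates

    def update(self, now):
        # record the current timestamp and the gap since the last update
        self.diff = now - self.last
        self.last = now

    def getTimeDiff(self):
        # seconds spent on the last logging interval (here: 100 epochs)
        return self.diff

    def getTotalTime(self):
        # total seconds elapsed since the timer was created
        return self.last - self.start

    def getTimeToGo(self, loopstogo):
        # naive estimate: remaining intervals times the last interval's duration
        return loopstogo * self.diff

# usage: timer = Timer(time.time()) before the training loop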
value_loss = advantages.pow(2).mean()
action_loss = -(advantages.data * action_log_probs).mean()

optimizer.zero_grad()
loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
loss.backward()
nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
optimizer.step()

if i_update % 100 == 0:
    all_rewards.append(final_rewards.mean())
    all_losses.append(loss.item())
    timer.update(time.time())
    loopstogo = (num_frames - i_update) / 100
    estimatedtimetogo = timer.getTimeToGo(loopstogo)
    print('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:])))
    print('loss %s' % all_losses[-1])
    logger.printDayFormat("estimated time to run: ", estimatedtimetogo)
    print("######## AC_Pacman_{0} ########".format(mode))

rollout.after_update()

logger.log(all_rewards, "Data/", "all_rewards_{0}.txt".format(mode))
logger.log(all_losses, "Data/", "all_losses_{0}.txt".format(mode))
logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_{0}".format(mode))
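The update above assumes that advantages, action_log_probs and entropy were computed from the rollout earlier in the loop. As a self-contained illustration of the same actor-critic update, here is a sketch with stand-in data and a toy policy/value head; all sizes, coefficients and names below are placeholders rather than values from the original script.

import torch
import torch.nn as nn
from torch.distributions import Categorical

value_loss_coef, entropy_coef, max_grad_norm = 0.5, 0.01, 0.5   # placeholder coefficients

obs = torch.randn(8, 4)                 # fake batch of observations
returns = torch.randn(8, 1)             # fake discounted returns
actions = torch.randint(0, 2, (8,))     # fake actions that were taken

actor = nn.Linear(4, 2)                 # toy policy head (2 actions)
critic = nn.Linear(4, 1)                # toy value head
params = list(actor.parameters()) + list(critic.parameters())
optimizer = torch.optim.Adam(params, lr=1e-3)

values = critic(obs)
dist = Categorical(logits=actor(obs))
action_log_probs = dist.log_prob(actions).unsqueeze(1)
entropy = dist.entropy().mean()

advantages = returns - values
value_loss = advantages.pow(2).mean()
# .detach() plays the same role as .data above: no policy gradient through the critic
action_loss = -(advantages.detach() * action_log_probs).mean()

optimizer.zero_grad()
loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
loss.backward()
nn.utils.clip_grad_norm_(params, max_grad_norm)
optimizer.step()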
print("---------------------------") timer.update(time.time()) timediff = timer.getTimeDiff() total_time = timer.getTotalTime() loopstogo = (num_frames - i_update) / 100 estimatedtimetogo = timer.getTimeToGo(loopstogo) logger.printDayFormat("runntime last epochs: ", timediff) logger.printDayFormat("total runtime: ", total_time) logger.printDayFormat("estimated time to run: ", estimatedtimetogo) print("######## AC_KeyCollect ########") rollout.after_update() # snapshot of weights, data and optimzer every 1000 epochs if i_update % 1000 == 0 and i_update > 0: logger.log(all_rewards, "Data/", "all_rewards_KeyCollect.txt") logger.log(all_losses, "Data/", "all_losses_KeyCollect.txt") logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_KeyCollect") logger.log_state_dict(optimizer.state_dict(), "Data/actor_critic_optimizer_KeyCollect") # final save logger.log(all_rewards, "Data/", "all_rewards_KeyCollect.txt") logger.log(all_losses, "Data/", "all_losses_KeyCollect.txt") logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_KeyCollect") logger.log_state_dict(optimizer.state_dict(), "Data/actor_critic_optimizer_KeyCollect")
print('loss %s' % all_losses[-1])
print("---------------------------")

timer.update(time.time())
timediff = timer.getTimeDiff()
total_time = timer.getTotalTime()
loopstogo = (num_frames - i_update) / 100
estimatedtimetogo = timer.getTimeToGo(loopstogo)
logger.printDayFormat("runtime last epochs: ", timediff)
logger.printDayFormat("total runtime: ", total_time)
logger.printDayFormat("estimated time to run: ", estimatedtimetogo)
print("######## {0} ########".format(sys.argv[1]))

rollout.after_update()

# snapshot of rewards, losses and weights every 5000 epochs
if i_update % 5000 == 0 and i_update > 0:
    logger.log(all_rewards, "Data/", "all_rewards_{0}_{1}.txt".format(sys.argv[1], mode))
    logger.log(all_losses, "Data/", "all_losses_{0}_{1}.txt".format(sys.argv[1], mode))
    logger.log_state_dict(
        actor_critic.state_dict(),
        "Data/agents/actor_critic_{0}_{1}".format(sys.argv[1], mode))

# final save
logger.log(all_rewards, "Data/", "all_rewards_{0}_{1}.txt".format(sys.argv[1], mode))
logger.log(all_losses, "Data/", "all_losses_{0}_{1}.txt".format(sys.argv[1], mode))
logger.log_state_dict(
    actor_critic.state_dict(),
    "Data/agents/actor_critic_{0}_{1}".format(sys.argv[1], mode))