Example #1
            print('loss Player2 %s' % all_losses2[-1])
            print("---------------------------")
            
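            # update timing statistics and estimate the remaining runtime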
            timer.update(time.time())            
            timediff = timer.getTimeDiff()
            total_time = timer.getTotalTime()
            loopstogo = (num_frames - i_update) / 100
            estimatedtimetogo = timer.getTimeToGo(loopstogo)
            logger.printDayFormat("runtime of last epochs: ", timediff)
            logger.printDayFormat("total runtime: ", total_time)
            logger.printDayFormat("estimated time to run: ", estimatedtimetogo)           
            print("######## {0} ########".format(sys.argv[1]))
        rollout1.after_update() # player1
        rollout2.after_update() # player2

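        # snapshot of rewards, losses and both agents' weights every 1000 epochs, alternating between two file suffixes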
        if i_update % 1000 == 0 and i_update > 0:
            logger.log(all_rewards1, "Data/", "all_rewards_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
            logger.log(all_losses1, "Data/", "all_losses_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
            logger.log_state_dict(agent1.state_dict(), "Data/agents/agent1_{0}_{1}".format(sys.argv[1], swich_variable))    
            logger.log(all_rewards2, "Data/", "all_rewards_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
            logger.log(all_losses2, "Data/", "all_losses_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
            logger.log_state_dict(agent2.state_dict(), "Data/agents/agent2_{0}_{1}".format(sys.argv[1], swich_variable))
            swich_variable += 1
            swich_variable %= 2

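    # final save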
    logger.log(all_rewards1, "Data/", "all_rewards_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
    logger.log(all_losses1, "Data/", "all_losses_p1_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
    logger.log_state_dict(agent1.state_dict(), "Data/agents/agent1_{0}_{1}".format(sys.argv[1], swich_variable))    
    logger.log(all_rewards2, "Data/", "all_rewards_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))  
    logger.log(all_losses2, "Data/", "all_losses_p2_{0}_{1}.txt".format(sys.argv[1], swich_variable))      
    logger.log_state_dict(agent2.state_dict(), "Data/agents/agent2_{0}_{1}".format(sys.argv[1], swich_variable))
Example #2

        # critic loss: mean squared error between returns and value estimates
        value_loss = advantages.pow(2).mean()
        # actor loss: policy gradient term with advantages detached from the graph
        action_loss = -(advantages.data * action_log_probs).mean()

        optimizer.zero_grad()
        # combined A2C objective with entropy regularization
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        
        loss.backward()
        nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
        optimizer.step()
    
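        # record rewards/losses and print progress every 100 epochs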
        if i_update % 100 == 0:            
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())
            timer.update(time.time())
            loopstogo = (num_frames - i_update) / 100
            estimatedtimetogo = timer.getTimeToGo(loopstogo)
            
            print('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:])))            
            print('loss %s' % all_losses[-1])
            logger.printDayFormat("estimated time to run: ", estimatedtimetogo)
            print("######## AC_Pacman_{0} ########".format(mode))                        
        rollout.after_update()
        
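    # final save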
    logger.log(all_rewards, "Data/", "all_rewards_{0}.txt".format(mode))  
    logger.log(all_losses, "Data/", "all_losses_{0}.txt".format(mode))      
    logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_{0}".format(mode))    

    

Example #3
            print("---------------------------")
            
            timer.update(time.time())            
            timediff = timer.getTimeDiff()
            total_time = timer.getTotalTime()
            loopstogo = (num_frames - i_update) / 100
            estimatedtimetogo = timer.getTimeToGo(loopstogo)
            logger.printDayFormat("runtime of last epochs: ", timediff)
            logger.printDayFormat("total runtime: ", total_time)
            logger.printDayFormat("estimated time to run: ", estimatedtimetogo)                       
            print("######## AC_KeyCollect ########")
        
        rollout.after_update()
        
        # snapshot of weights, data and optimizer state every 1000 epochs
        if i_update % 1000 == 0 and i_update > 0:
            logger.log(all_rewards, "Data/", "all_rewards_KeyCollect.txt")            
            logger.log(all_losses, "Data/", "all_losses_KeyCollect.txt")                        
            logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_KeyCollect")
            logger.log_state_dict(optimizer.state_dict(), "Data/actor_critic_optimizer_KeyCollect")            

    # final save        
    logger.log(all_rewards, "Data/", "all_rewards_KeyCollect.txt")    
    logger.log(all_losses, "Data/", "all_losses_KeyCollect.txt")        
    logger.log_state_dict(actor_critic.state_dict(), "Data/actor_critic_KeyCollect")
    logger.log_state_dict(optimizer.state_dict(), "Data/actor_critic_optimizer_KeyCollect")            




Example #4

            print('loss %s' % all_losses[-1])
            print("---------------------------")

            timer.update(time.time())
            timediff = timer.getTimeDiff()
            total_time = timer.getTotalTime()
            loopstogo = (num_frames - i_update) / 100
            estimatedtimetogo = timer.getTimeToGo(loopstogo)
            logger.printDayFormat("runtime of last epochs: ", timediff)
            logger.printDayFormat("total runtime: ", total_time)
            logger.printDayFormat("estimated time to run: ", estimatedtimetogo)
            print("######## {0} ########".format(sys.argv[1]))
        rollout.after_update()

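        # snapshot of rewards, losses and weights every 5000 epochs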
        if i_update % 5000 == 0 and i_update > 0:
            logger.log(all_rewards, "Data/",
                       "all_rewards_{0}_{1}.txt".format(sys.argv[1], mode))
            logger.log(all_losses, "Data/",
                       "all_losses_{0}_{1}.txt".format(sys.argv[1], mode))
            logger.log_state_dict(
                actor_critic.state_dict(),
                "Data/agents/actor_critic_{0}_{1}".format(sys.argv[1], mode))

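    # final save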
    logger.log(all_rewards, "Data/",
               "all_rewards_{0}_{1}.txt".format(sys.argv[1], mode))
    logger.log(all_losses, "Data/",
               "all_losses_{0}_{1}.txt".format(sys.argv[1], mode))
    logger.log_state_dict(
        actor_critic.state_dict(),
        "Data/agents/actor_critic_{0}_{1}".format(sys.argv[1], mode))