Example #1
    def test_rollout_dict_space(self):
        register_env("nested", lambda _: NestedDictEnv())
        agent = PGTrainer(env="nested", config={"framework": "tf"})
        agent.train()
        path = agent.save()
        agent.stop()

        # Test train works on restore
        agent2 = PGTrainer(env="nested", config={"framework": "tf"})
        agent2.restore(path)
        agent2.train()

        # Test rollout works on restore
        rollout(agent2, "nested", 100)
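Both tests register an environment named "nested" whose observations live in a nested gym Dict space. The actual NestedDictEnv class is part of RLlib's test suite; the version below is only a minimal sketch, assuming a one-step Gym environment with a two-level Dict observation space, to illustrate what register_env expects.

import gym
from gym.spaces import Box, Dict, Discrete


class NestedDictEnv(gym.Env):
    # Minimal stand-in: single-step episodes with a nested Dict observation.
    def __init__(self):
        self.observation_space = Dict({
            "sensors": Dict({
                "position": Box(low=-100, high=100, shape=(3,)),
                "velocity": Box(low=-1, high=1, shape=(3,)),
            }),
        })
        self.action_space = Discrete(2)

    def reset(self):
        return self.observation_space.sample()

    def step(self, action):
        # Episode ends immediately; constant reward is enough for a smoke test.
        return self.observation_space.sample(), 1.0, True, {}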
Example #2
    def testRolloutDictSpace(self):
        register_env("nested", lambda _: NestedDictEnv())
        agent = PGTrainer(env="nested")
        agent.train()
        path = agent.save()
        agent.stop()

        # Test train works on restore
        agent2 = PGTrainer(env="nested")
        agent2.restore(path)
        agent2.train()

        # Test rollout works on restore
        rollout(agent2, "nested", 100)
Example #3
    # 20-iteration moving averages of episode length and reward
    len_moving_average = np.convolve(episode_len_mean,
                                     np.ones((20, )) / 20,
                                     mode='valid')
    reward_moving_average = np.convolve(episode_reward_mean,
                                        np.ones((20, )) / 20,
                                        mode='valid')
    print(f"Current     ::: Len:: Mean: {episode_len_mean[-1]}"
          f"; Reward:: Mean: {episode_reward_mean[-1]}"
          f", Max: {episode_reward_max[-1]}, Min: {episode_reward_min[-1]}")
    print(f"mAverage20  ::: Len:: Mean: {np.round(len_moving_average[-1], 1)}"
          f"; Reward:: Mean: {np.round(reward_moving_average[-1], 1)}")

    # Every 50 training iterations, checkpoint the trainer and dump the
    # accumulated statistics into the trainer's log directory.
    if result['training_iteration'] % 50 == 0:
        checkpoint = PG_trainer.save()
        print("checkpoint saved at", checkpoint)
        output = {
            'episode_len_mean': episode_len_mean,
            'episode_reward_mean': episode_reward_mean,
            'episode_reward_max': episode_reward_max,
            'episode_reward_min': episode_reward_min,
            'num_steps_trained': num_steps_trained,
            'clock_time': clock_time,
            'training_iteration': training_iteration,
            'len_moving_average': len_moving_average,
            'reward_moving_average': reward_moving_average
        }
        output_path = PG_trainer._logdir + '/_running_results.pkl'
        with open(output_path, 'wb') as handle:
            pickle.dump(output, handle, protocol=pickle.HIGHEST_PROTOCOL)
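The pickled _running_results.pkl can be loaded back in a later session, e.g. to inspect or plot the accumulated curves. A minimal sketch, assuming the lists were saved exactly as in the dictionary above and that output_path points at the same file:

import pickle

with open(output_path, 'rb') as handle:
    results = pickle.load(handle)

# Print the mean episode reward recorded at each training iteration.
for it, rew in zip(results['training_iteration'],
                   results['episode_reward_mean']):
    print(it, rew)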
Example #4
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # "multiagent": {
            #     # "grouping":
            #     #     grouping,
            #     "policies": {
            #         # the first tuple value is None -> uses default policy
            #         "function_1": (None, obs_space_1, action_space_1, {}),
            #         "function_2": (None, obs_space_2, action_space_2, {})
            #     },
            #     "policy_mapping_fn":
            #         # tune.function(lambda agent_id: "agent_{}".format(agent_id+1)),
            #         tune.function(lambda agent_id: "function_1" if agent_id == "group_1" else "function_2"),
            # },
        })

    # Attempt to restore from checkpoint if possible.
    # if os.path.exists(CHECKPOINT_FILE):
    #     checkpoint_path = open(CHECKPOINT_FILE).read()
    #     print("Restoring from checkpoint path", checkpoint_path)
    #     dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
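Because only the checkpoint path is written to CHECKPOINT_FILE, a later process has to rebuild the trainer with the same environment and config before restoring. A minimal sketch of that restore step, assuming the trainer above is a DQNTrainer and using "CartPole-v0" as a stand-in for the actual registered environment:

import ray
from ray.rllib.agents.dqn import DQNTrainer
from ray.tune.logger import pretty_print

CHECKPOINT_FILE = "last_checkpoint.txt"  # hypothetical name; use the file written by the loop above

ray.init()
# Rebuild the trainer with the same env name and config used for training.
dqn = DQNTrainer(env="CartPole-v0", config={"num_workers": 0})

with open(CHECKPOINT_FILE) as f:
    dqn.restore(f.read())

# Training (or serving) then continues from the restored state.
print(pretty_print(dqn.train()))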