Example #1
    def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
        policy = NNPolicy(
            self.seed,
            brain_parameters,
            self.trainer_parameters,
            self.is_training,
            self.load,
            tanh_squash=True,
            reparameterize=True,
            create_tf_graph=False,
        )
        for _reward_signal in policy.reward_signals.keys():
            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

        # Load the replay buffer if resuming from a checkpoint
        if self.load and self.checkpoint_replay_buffer:
            try:
                self.load_replay_buffer()
            except (AttributeError, FileNotFoundError):
                logger.warning(
                    "Replay buffer was unable to load, starting from scratch.")
            logger.debug("Loaded update buffer with {} sequences".format(
                self.update_buffer.num_experiences))

        return policy
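This overload appears to come from the SAC trainer: the policy is created with tanh_squash=True and reparameterize=True, a reward tally is initialised for every reward signal, and, when resuming a run with checkpoint_replay_buffer enabled, the previously saved replay buffer is restored (falling back to an empty buffer if no checkpoint is found).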
Example #2
def create_optimizer_mock(trainer_config, reward_signal_config, use_rnn,
                          use_discrete, use_visual):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_parameters = trainer_config
    model_path = "testpath"
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    trainer_parameters["reward_signals"].update(reward_signal_config)
    trainer_parameters["use_recurrent"] = use_rnn
    policy = NNPolicy(0,
                      mock_brain,
                      trainer_parameters,
                      False,
                      False,
                      create_tf_graph=False)
    if trainer_parameters["trainer"] == "sac":
        optimizer = SACOptimizer(policy, trainer_parameters)
    else:
        optimizer = PPOOptimizer(policy, trainer_parameters)
    return optimizer
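A hedged usage sketch for the helper above; ppo_config and the reward-signal settings are illustrative placeholders rather than values taken from these examples:

# Hypothetical call: assumes ppo_config is a trainer-config dict that already
# carries "trainer", "learning_rate", "batch_size" and "reward_signals" entries.
optimizer = create_optimizer_mock(
    trainer_config=ppo_config,
    reward_signal_config={"extrinsic": {"strength": 1.0, "gamma": 0.99}},
    use_rnn=False,
    use_discrete=True,
    use_visual=False,
)
# With ppo_config["trainer"] set to anything other than "sac", the helper
# returns a PPOOptimizer wrapping the NNPolicy built on the mocked brain.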
Example #3
def create_bc_module(mock_brain, trainer_config, use_rnn, demo_file, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config["model_path"] = "testpath"
    trainer_config["keep_checkpoints"] = 3
    trainer_config["use_recurrent"] = use_rnn
    trainer_config["behavioral_cloning"]["demo_path"] = (
        os.path.dirname(os.path.abspath(__file__)) + "/" + demo_file
    )

    policy = NNPolicy(
        0, mock_brain, trainer_config, False, False, tanhresample, tanhresample
    )
    with policy.graph.as_default():
        bc_module = BCModule(
            policy,
            policy_learning_rate=trainer_config["learning_rate"],
            default_batch_size=trainer_config["batch_size"],
            default_num_epoch=3,
            **trainer_config["behavioral_cloning"],
        )
    policy.initialize_or_load()  # Normally the optimizer calls this after the BCModule is created
    return bc_module
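The two positional tanhresample arguments here appear to fill the tanh_squash and reparameterize slots that Example #1 passes by keyword, so a single flag toggles both behaviours. policy.initialize_or_load() is then called explicitly because, as the trailing comment notes, no optimizer is constructed to do it.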
Example #4
def test_ppo_get_value_estimates(mock_communicator, mock_launcher,
                                 dummy_config):
    tf.reset_default_graph()

    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    policy = NNPolicy(0,
                      brain_params,
                      dummy_config,
                      False,
                      False,
                      create_tf_graph=False)
    optimizer = PPOOptimizer(policy, dummy_config)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert len(val) == 15

    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=True)
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    optimizer.reward_signals["extrinsic"].use_terminal_states = False
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False)
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val != 0.0
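The assertions spell out the contract being tested: run_out maps each reward-signal name to one value estimate per step of the 15-step trajectory, while final_value_out holds the bootstrap value for the state after the last step. That bootstrap value is zeroed per reward signal when the trajectory ends with done=True, and stays non-zero once the extrinsic signal's use_terminal_states flag is switched off.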
Example #5
def create_policy_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_parameters = dummy_config
    model_path = "testmodel"
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    trainer_parameters["use_recurrent"] = use_rnn
    policy = NNPolicy(0, mock_brain, trainer_parameters, False, False)
    return policy
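Unlike Example #2, this helper returns the bare NNPolicy without wrapping it in an optimizer. A minimal usage sketch, assuming dummy_config is a trainer-config dict like the one used in Example #2 and with illustrative argument values:

policy = create_policy_mock(dummy_config, use_rnn=False, use_discrete=True, use_visual=False)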
Example #6
    def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
        """
        Creates a PPO policy to trainers list of policies.
        :param brain_parameters: specifications for policy construction
        :return policy
        """
        policy = NNPolicy(
            self.seed,
            brain_parameters,
            self.trainer_parameters,
            self.is_training,
            self.load,
            condition_sigma_on_obs=False,  # Faster training for PPO
            create_tf_graph=False,  # We will create the TF graph in the Optimizer
        )

        return policy
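Contrast this with Example #1: the PPO variant disables condition_sigma_on_obs for faster training and leaves the tanh_squash/reparameterize flags at their defaults, while both snippets pass create_tf_graph=False, deferring TF graph construction (here, as the comment notes, to the Optimizer).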
Example #7
def test_normalization(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"

    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = NNPolicy(0, brain_params, dummy_config, False, False)

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run([
        policy.normalization_steps, policy.running_mean,
        policy.running_variance
    ])

    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run([
        policy.normalization_steps, policy.running_mean,
        policy.running_variance
    ])

    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
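Working through the expected statistics: the first update sees three 0s and three 1s, so the mean is 3/6 = 0.5 and the variance is 0.25 (every observation is exactly 0.5 from the mean). The second, all-1s trajectory brings the totals to thirteen 1s out of 16 observations, giving a mean of 13/16 = 0.8125 and a variance of 13/16 - (13/16)^2 ≈ 0.152, which matches the pytest.approx tolerance. The "- 1" and the division by steps in the assertions undo the normalizer's bookkeeping, since, per the in-code comment, the running variance is kept un-normalised and initialised to 1 to avoid a divide-by-zero.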