Example #1
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
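The block above mocks the reward-signal outputs by copying the environment rewards into every key the optimizer reads (advantages, extrinsic/gail returns and value estimates). Later examples (#2, #10) do the same through a copy_buffer_fields test helper; a minimal sketch of such a helper, assuming only the dict-style buffer access already used above, could be:

def copy_buffer_fields(buffer, src_key, dst_keys):
    # Copy one buffer field into several destination fields, mirroring the
    # repeated update_buffer[...] = update_buffer["environment_rewards"]
    # assignments used to mock out reward-signal evaluation.
    for dst_key in dst_keys:
        buffer[dst_key] = buffer[src_key]

With it, the five assignments above collapse into a single call such as copy_buffer_fields(update_buffer, "environment_rewards", ["advantages", "extrinsic_returns", "extrinsic_value_estimates", "gail_returns", "gail_value_estimates"]).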
Example #2
def test_poca_optimizer_update_gail(gail_dummy_config,
                                    dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = poca_dummy_config()
    optimizer = create_test_poca_optimizer(config,
                                           use_rnn=False,
                                           use_discrete=False,
                                           use_visual=False)
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
            RewardSignalUtil.baseline_estimates_key("gail"),
        ],
    )

    update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
        update_buffer[BufferKey.CONTINUOUS_ACTION])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
            RewardSignalUtil.baseline_estimates_key("gail"),
        ],
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
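Example #2 swaps the literal string keys of Example #1 ("gail_returns", "gail_value_estimates", ...) for BufferKey constants and RewardSignalUtil key helpers. As a rough mental model only, those helpers can be thought of as building the same name-prefixed keys seen in the string-keyed examples; the sketch below is hypothetical and returns plain strings, whereas the real ML-Agents helpers return typed buffer keys:

class RewardSignalUtil:
    # Hypothetical stand-ins for the key helpers used above.
    @staticmethod
    def rewards_key(name):
        return f"{name}_rewards"

    @staticmethod
    def returns_key(name):
        return f"{name}_returns"

    @staticmethod
    def value_estimates_key(name):
        return f"{name}_value_estimates"

    @staticmethod
    def baseline_estimates_key(name):
        return f"{name}_baseline_estimates"

Under that reading, RewardSignalUtil.returns_key("gail") corresponds to the "gail_returns" field written directly in Example #1.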
Example #3
def test_ppo_optimizer_update_gail(gail_dummy_config,
                                   dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
        use_rnn=False,
        use_discrete=False,
        use_visual=False,
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    n_agents = len(update_buffer["continuous_log_probs"])
    update_buffer["continuous_log_probs"] = np.ones(
        (n_agents, behavior_spec.action_spec.continuous_size),
        dtype=np.float32)
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #4
def reward_signal_update(optimizer, reward_signal_name):
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
    feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
        optimizer.policy, buffer.make_mini_batch(0, 10), 2)
    out = optimizer.policy._execute_model(
        feed_dict, optimizer.reward_signals[reward_signal_name].update_dict)
    assert type(out) is dict
Example #5
def test_trainer_update_policy(mock_env, dummy_config, use_discrete):
    env, mock_brain, _ = mb.setup_mock_env_and_brains(
        mock_env,
        use_discrete,
        False,
        num_agents=NUM_AGENTS,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True

    trainer = PPOTrainer(mock_brain, 0, trainer_params, True, False, 0, "0",
                         False)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
    # Mock out reward signal eval
    buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
    buffer.update_buffer["extrinsic_returns"] = buffer.update_buffer["rewards"]
    buffer.update_buffer["extrinsic_value_estimates"] = buffer.update_buffer[
        "rewards"]
    trainer.training_buffer = buffer
    trainer.update_policy()
    # Make batch length a larger multiple of sequence length
    trainer.trainer_parameters["batch_size"] = 128
    trainer.update_policy()
    # Make batch length a larger non-multiple of sequence length
    trainer.trainer_parameters["batch_size"] = 100
    trainer.update_policy()
Example #6
def reward_signal_eval(policy, reward_signal_name):
    buffer = mb.simulate_rollout(BATCH_SIZE, policy.brain)
    # Test evaluate
    rsig_result = policy.reward_signals[reward_signal_name].evaluate_batch(
        buffer)
    assert rsig_result.scaled_reward.shape == (BATCH_SIZE, )
    assert rsig_result.unscaled_reward.shape == (BATCH_SIZE, )
Example #7
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()
Example #8
def test_evaluate_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
    agent_action = AgentAction.from_buffer(buffer)
    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    log_probs, entropy, values = policy.evaluate_actions(
        tensor_obs,
        masks=act_masks,
        actions=agent_action,
        memories=memories,
        seq_len=policy.sequence_length,
    )
    if discrete:
        _size = policy.behavior_spec.action_spec.discrete_size
    else:
        _size = policy.behavior_spec.action_spec.continuous_size

    assert log_probs.flatten().shape == (64, _size)
    assert entropy.shape == (64,)
    for val in values.values():
        assert val.shape == (64,)
Example #9
def test_sample_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])

    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    (sampled_actions, log_probs, entropies, memories) = policy.sample_actions(
        tensor_obs, masks=act_masks, memories=memories, seq_len=policy.sequence_length
    )
    if discrete:
        assert log_probs.all_discrete_tensor.shape == (
            64,
            sum(policy.behavior_spec.action_spec.discrete_branches),
        )
    else:
        assert log_probs.continuous_tensor.shape == (
            64,
            policy.behavior_spec.action_spec.continuous_size,
        )
    assert entropies.shape == (64,)

    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
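Both tests above build the recurrent memories input by keeping one stored memory vector per sequence: the list comprehension walks buffer[BufferKey.MEMORY] in strides of policy.sequence_length, and torch.stack(...).unsqueeze(0) turns the result into a tensor of shape (1, num_sequences, m_size). A small self-contained sketch of that reshaping, using hypothetical sizes picked to mirror the tests (64 experiences, sequence length 16, memory size 8):

import torch

num_experiences = 64   # rollout length used in the tests above
sequence_length = 16   # hypothetical policy.sequence_length
m_size = 8             # hypothetical memory size

# One memory vector is stored per experience step.
stored_memories = [torch.zeros(m_size) for _ in range(num_experiences)]

# Keep only the memory at the start of each sequence.
memories = [
    stored_memories[i] for i in range(0, num_experiences, sequence_length)
]
memories = torch.stack(memories).unsqueeze(0)

# 64 experiences / 16 steps per sequence -> 4 sequences.
assert memories.shape == (1, num_experiences // sequence_length, m_size)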
Example #10
def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    # Test evaluate
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )
    # Copy memories to critic memories
    copy_buffer_fields(update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY])

    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
Example #11
def test_sac_save_load_buffer(tmpdir, dummy_config):
    env, mock_brain, _ = mb.setup_mock_env_and_brains(
        mock.Mock(),
        False,
        False,
        num_agents=NUM_AGENTS,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_params = dummy_config
    trainer_params["summary_path"] = str(tmpdir)
    trainer_params["model_path"] = str(tmpdir)
    trainer_params["save_replay_buffer"] = True
    trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False,
                         0, 0)
    policy = trainer.create_policy(mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)

    trainer.update_buffer = mb.simulate_rollout(env, trainer.policy,
                                                BUFFER_INIT_SAMPLES)
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model(mock_brain.brain_name)

    # Wipe Trainer and try to load
    trainer2 = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, True,
                          0, 0)

    policy = trainer2.create_policy(mock_brain)
    trainer2.add_policy(mock_brain.brain_name, policy)
    assert trainer2.update_buffer.num_experiences == buffer_len
Example #12
def test_sac_rnn_policy(dummy_config):
    # Test evaluate
    tf.reset_default_graph()
    policy = create_sac_policy_mock(dummy_config,
                                    use_rnn=True,
                                    use_discrete=True,
                                    use_visual=False)
    step = mb.create_batchedstep_from_brainparams(policy.brain,
                                                  num_agents=NUM_AGENTS)
    run_out = policy.evaluate(step, list(step.agent_id))
    assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))

    # Test update
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                 policy.brain,
                                 memory_size=8)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    update_buffer = AgentBuffer()
    buffer.resequence_and_append(update_buffer,
                                 training_length=policy.sequence_length)
    run_out = policy.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // policy.sequence_length,
    )
Example #13
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = _create_ppo_optimizer_ops_mock(dummy_config,
                                               use_rnn=rnn,
                                               use_discrete=discrete,
                                               use_visual=visual)
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size),
            dtype=np.float32)

    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
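The rnn, visual, and discrete arguments used by tests like the one above are normally injected by pytest parametrization, which these excerpts omit. A hedged sketch of the kind of decorator stack that could drive such a test (the exact markers and ids in the ML-Agents suite may differ):

import pytest

# Each flag is toggled independently, giving 2**3 = 8 combinations per test.
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    ...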
Example #14
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]

    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch it is saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
Example #15
def test_sac_save_load_buffer(tmpdir, dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.hyperparameters.save_replay_buffer = True
    trainer = SACTrainer("test", 1, trainer_params, True, False, 0, "testdir")
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    policy = trainer.create_policy(behavior_id, mock_specs)
    trainer.add_policy(behavior_id, policy)

    trainer.update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                                policy.behavior_spec)
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model()

    # Wipe Trainer and try to load
    trainer2 = SACTrainer("test", 1, trainer_params, True, True, 0, "testdir")

    policy = trainer2.create_policy(behavior_id, mock_specs)
    trainer2.add_policy(behavior_id, policy)
    assert trainer2.update_buffer.num_experiences == buffer_len
Example #16
def reward_signal_eval(optimizer, reward_signal_name):
    buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.behavior_spec)
    # Test evaluate
    rsig_result = optimizer.reward_signals[reward_signal_name].evaluate_batch(
        buffer)
    assert rsig_result.scaled_reward.shape == (BATCH_SIZE, )
    assert rsig_result.unscaled_reward.shape == (BATCH_SIZE, )
Example #17
def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch it is saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
Example #18
def test_sac_update_reward_signals(
        dummy_config,
        curiosity_dummy_config,
        discrete  # noqa: F811
):
    # Add a Curiosity module
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_sac_optimizer_mock(dummy_config,
                                          use_rnn=False,
                                          use_discrete=discrete,
                                          use_visual=False)

    # Test update, while removing PPO-specific buffer elements.
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.behavior_spec)

    # Mock out reward signal eval
    update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = update_buffer[
        BufferKey.ENVIRONMENT_REWARDS]
    update_buffer[RewardSignalUtil.rewards_key("curiosity")] = update_buffer[
        BufferKey.ENVIRONMENT_REWARDS]
    return_stats = optimizer.update_reward_signals(
        {"curiosity": update_buffer},
        num_sequences=update_buffer.num_experiences)
    required_stats = [
        "Losses/Curiosity Forward Loss", "Losses/Curiosity Inverse Loss"
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
Example #19
def create_bc_trainer(dummy_config, is_discrete=False):
    mock_env = mock.Mock()
    if is_discrete:
        mock_brain = mb.create_mock_pushblock_brain()
        mock_braininfo = mb.create_mock_braininfo(num_agents=12,
                                                  num_vector_observations=70)
    else:
        mock_brain = mb.create_mock_3dball_brain()
        mock_braininfo = mb.create_mock_braininfo(num_agents=12,
                                                  num_vector_observations=8)
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()

    trainer_parameters = dummy_config
    trainer_parameters["summary_path"] = "tmp"
    trainer_parameters["model_path"] = "tmp"
    trainer_parameters["demo_path"] = (
        os.path.dirname(os.path.abspath(__file__)) + "/test.demo")
    trainer = BCTrainer(mock_brain,
                        trainer_parameters,
                        training=True,
                        load=False,
                        seed=0,
                        run_id=0)
    trainer.demonstration_buffer = mb.simulate_rollout(env, trainer.policy,
                                                       100)
    return trainer, env
Example #20
def test_ppo_optimizer_update_curiosity(
        dummy_config,
        curiosity_dummy_config,
        rnn,
        visual,
        discrete  # noqa: F811
):
    # Test evaluate
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(dummy_config,
                                          use_rnn=rnn,
                                          use_discrete=discrete,
                                          use_visual=visual)
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #21
def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
    torch.manual_seed(0)
    # Test evaluate
    optimizer = create_sac_optimizer_mock(dummy_config,
                                          use_rnn=rnn,
                                          use_discrete=discrete,
                                          use_visual=visual)
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.behavior_spec,
                                        memory_size=12)
    # Mock out reward signal eval
    update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = update_buffer[
        BufferKey.ENVIRONMENT_REWARDS]
    # Mock out value memories
    update_buffer[BufferKey.CRITIC_MEMORY] = update_buffer[BufferKey.MEMORY]
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Losses/Q1 Loss",
        "Losses/Q2 Loss",
        "Policy/Continuous Entropy Coeff",
        "Policy/Discrete Entropy Coeff",
        "Policy/Learning Rate",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
Example #22
def test_trainer_update_policy(dummy_config, use_discrete):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True

    # Test curiosity reward signal
    trainer_params["reward_signals"]["curiosity"] = {}
    trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0
    trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
    trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128

    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False,
                         0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()
Example #23
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    optimizer = create_test_ppo_optimizer(dummy_config,
                                          use_rnn=rnn,
                                          use_discrete=discrete,
                                          use_visual=visual)
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]

    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
Example #24
def test_sac_save_load_buffer(tmpdir, dummy_config):
    mock_brain = mb.setup_mock_brain(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.hyperparameters.save_replay_buffer = True
    trainer = SACTrainer(
        mock_brain.brain_name, 1, trainer_params, True, False, 0, "testdir"
    )
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)

    trainer.update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model(mock_brain.brain_name)

    # Wipe Trainer and try to load
    trainer2 = SACTrainer(
        mock_brain.brain_name, 1, trainer_params, True, True, 0, "testdir"
    )

    policy = trainer2.create_policy(mock_brain.brain_name, mock_brain)
    trainer2.add_policy(mock_brain.brain_name, policy)
    assert trainer2.update_buffer.num_experiences == buffer_len
Example #25
def test_ppo_optimizer_update_curiosity(
        curiosity_dummy_config,
        dummy_config,
        rnn,
        visual,
        discrete  # noqa: F811
):
    # Test evaluate
    tf.reset_default_graph()
    dummy_config["reward_signals"].update(curiosity_dummy_config)
    optimizer = _create_ppo_optimizer_ops_mock(dummy_config,
                                               use_rnn=rnn,
                                               use_discrete=discrete,
                                               use_visual=visual)
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.brain)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #26
def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
    torch.manual_seed(0)
    # Test evaluate
    optimizer = create_sac_optimizer_mock(dummy_config,
                                          use_rnn=rnn,
                                          use_discrete=discrete,
                                          use_visual=visual)
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.behavior_spec,
                                        memory_size=24)
    # Mock out reward signal eval
    update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Losses/Q1 Loss",
        "Losses/Q2 Loss",
        "Policy/Entropy Coeff",
        "Policy/Learning Rate",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
Example #27
def reward_signal_update(env, policy, reward_signal_name):
    buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
    feed_dict = policy.reward_signals[reward_signal_name].prepare_update(
        policy.model, buffer.update_buffer.make_mini_batch(0, 10), 2)
    out = policy._execute_model(
        feed_dict, policy.reward_signals[reward_signal_name].update_dict)
    assert type(out) is dict
Example #28
def test_sac_update_reward_signals(mock_env, dummy_config, discrete):
    # Test evaluate
    tf.reset_default_graph()
    # Add a Curiosity module
    dummy_config["reward_signals"]["curiosity"] = {}
    dummy_config["reward_signals"]["curiosity"]["strength"] = 1.0
    dummy_config["reward_signals"]["curiosity"]["gamma"] = 0.99
    dummy_config["reward_signals"]["curiosity"]["encoding_size"] = 128
    env, policy = create_sac_policy_mock(mock_env,
                                         dummy_config,
                                         use_rnn=False,
                                         use_discrete=discrete,
                                         use_visual=False)

    # Test update, while removing PPO-specific buffer elements.
    update_buffer = mb.simulate_rollout(
        env,
        policy,
        BUFFER_INIT_SAMPLES,
        exclude_key_list=["advantages", "actions_pre"])

    # Mock out reward signal eval
    update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
    update_buffer["curiosity_rewards"] = update_buffer["rewards"]
    policy.update_reward_signals({"curiosity": update_buffer},
                                 num_sequences=update_buffer.num_experiences)
    env.close()
Example #29
def test_ppo_optimizer_update_gail(gail_dummy_config,
                                   dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
    optimizer = create_test_ppo_optimizer(config,
                                          use_rnn=False,
                                          use_discrete=False,
                                          use_visual=False)
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch it is saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #30
def test_ppo_optimizer_update_gail(gail_dummy_config,
                                   dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = ppo_dummy_config()
    optimizer = create_test_ppo_optimizer(config,
                                          use_rnn=False,
                                          use_discrete=False,
                                          use_visual=False)
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["continuous_log_probs"] = np.ones_like(
        update_buffer["continuous_action"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )