def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    """TF PPO + GAIL: optimizer.update() runs on a normal buffer and an over-sized one."""
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Second pass (3000 samples) checks behavior when the buffer size is too big.
    for num_samples in (BUFFER_INIT_SAMPLES, 3000):
        update_buffer = mb.simulate_rollout(
            num_samples, optimizer.policy.behavior_spec
        )
        # Mock out reward signal eval by reusing the raw environment rewards.
        for key in (
            "advantages",
            "extrinsic_returns",
            "extrinsic_value_estimates",
            "gail_returns",
            "gail_value_estimates",
        ):
            update_buffer[key] = update_buffer["environment_rewards"]
        optimizer.update(
            update_buffer,
            num_sequences=update_buffer.num_experiences
            // optimizer.policy.sequence_length,
        )
def test_poca_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    """POCA + GAIL: optimizer.update() runs on a normal buffer and an over-sized one."""
    dummy_config.reward_signals = gail_dummy_config
    optimizer = create_test_poca_optimizer(
        poca_dummy_config(), use_rnn=False, use_discrete=False, use_visual=False
    )
    # Keys whose values are mocked out with the raw environment rewards.
    mocked_keys = [
        BufferKey.ADVANTAGES,
        RewardSignalUtil.returns_key("extrinsic"),
        RewardSignalUtil.value_estimates_key("extrinsic"),
        RewardSignalUtil.baseline_estimates_key("extrinsic"),
        RewardSignalUtil.returns_key("gail"),
        RewardSignalUtil.value_estimates_key("gail"),
        RewardSignalUtil.baseline_estimates_key("gail"),
    ]
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer, src_key=BufferKey.ENVIRONMENT_REWARDS, dst_keys=mocked_keys
    )
    update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
        update_buffer[BufferKey.CONTINUOUS_ACTION]
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer, src_key=BufferKey.ENVIRONMENT_REWARDS, dst_keys=mocked_keys
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    """TF-framework PPO + GAIL: update() on a normal and an over-sized buffer."""
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
        use_rnn=False,
        use_discrete=False,
        use_visual=False,
    )
    mocked_keys = (
        "advantages",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "gail_returns",
        "gail_value_estimates",
    )
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    for key in mocked_keys:
        update_buffer[key] = update_buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    n_agents = len(update_buffer["continuous_log_probs"])
    update_buffer["continuous_log_probs"] = np.ones(
        (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    for key in mocked_keys:
        update_buffer[key] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def reward_signal_update(optimizer, reward_signal_name):
    """Run one update step of the named reward signal and check it yields a dict."""
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
    signal = optimizer.reward_signals[reward_signal_name]
    feed_dict = signal.prepare_update(
        optimizer.policy, buffer.make_mini_batch(0, 10), 2
    )
    out = optimizer.policy._execute_model(feed_dict, signal.update_dict)
    assert type(out) is dict
def test_trainer_update_policy(mock_env, dummy_config, use_discrete):
    """Recurrent PPO trainer: update_policy works across several batch sizes."""
    env, mock_brain, _ = mb.setup_mock_env_and_brains(
        mock_env,
        use_discrete,
        False,
        num_agents=NUM_AGENTS,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True
    trainer = PPOTrainer(mock_brain, 0, trainer_params, True, False, 0, "0", False)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
    # Mock out reward signal eval
    for key in ("extrinsic_rewards", "extrinsic_returns", "extrinsic_value_estimates"):
        buffer.update_buffer[key] = buffer.update_buffer["rewards"]
    trainer.training_buffer = buffer
    trainer.update_policy()
    # Batch lengths: a larger multiple, then a larger non-multiple, of sequence length.
    for batch_size in (128, 100):
        trainer.trainer_parameters["batch_size"] = batch_size
        trainer.update_policy()
def reward_signal_eval(policy, reward_signal_name):
    """Evaluate the named reward signal on a rollout and verify the reward shapes."""
    buffer = mb.simulate_rollout(BATCH_SIZE, policy.brain)
    # Test evaluate
    result = policy.reward_signals[reward_signal_name].evaluate_batch(buffer)
    expected_shape = (BATCH_SIZE,)
    assert result.scaled_reward.shape == expected_shape
    assert result.unscaled_reward.shape == expected_shape
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    """Recurrent PPO trainer with a curiosity reward signal: _update_policy runs."""
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )
    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval by copying the environment rewards everywhere.
    for key in (
        "extrinsic_rewards",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "curiosity_rewards",
        "curiosity_returns",
        "curiosity_value_estimates",
        "advantages",
    ):
        buffer[key] = buffer["environment_rewards"]
    trainer.update_buffer = buffer
    trainer._update_policy()
def test_evaluate_actions(rnn, visual, discrete):
    """Check output shapes of policy.evaluate_actions on a simulated rollout."""
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
    agent_action = AgentAction.from_buffer(buffer)
    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
    # One memory entry per sequence start.
    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if memories:
        memories = torch.stack(memories).unsqueeze(0)
    log_probs, entropy, values = policy.evaluate_actions(
        tensor_obs,
        masks=act_masks,
        actions=agent_action,
        memories=memories,
        seq_len=policy.sequence_length,
    )
    action_spec = policy.behavior_spec.action_spec
    _size = action_spec.discrete_size if discrete else action_spec.continuous_size
    assert log_probs.flatten().shape == (64, _size)
    assert entropy.shape == (64,)
    for val in values.values():
        assert val.shape == (64,)
def test_sample_actions(rnn, visual, discrete):
    """Check shapes of log-probs, entropies and memories from policy.sample_actions."""
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
    # One memory entry per sequence start.
    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if memories:
        memories = torch.stack(memories).unsqueeze(0)
    (sampled_actions, log_probs, entropies, memories) = policy.sample_actions(
        tensor_obs, masks=act_masks, memories=memories, seq_len=policy.sequence_length
    )
    action_spec = policy.behavior_spec.action_spec
    if discrete:
        assert log_probs.all_discrete_tensor.shape == (
            64,
            sum(action_spec.discrete_branches),
        )
    else:
        assert log_probs.continuous_tensor.shape == (64, action_spec.continuous_size)
    assert entropies.shape == (64,)
    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    """PPO + curiosity: optimizer.update() runs on a mocked rollout buffer."""
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval by duplicating the environment rewards.
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )
    # Copy memories to critic memories
    copy_buffer_fields(update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_sac_save_load_buffer(tmpdir, dummy_config):
    """SAC with save_replay_buffer: buffer contents survive a save/reload cycle."""
    env, mock_brain, _ = mb.setup_mock_env_and_brains(
        mock.Mock(),
        False,
        False,
        num_agents=NUM_AGENTS,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_params = dummy_config
    trainer_params["summary_path"] = str(tmpdir)
    trainer_params["model_path"] = str(tmpdir)
    trainer_params["save_replay_buffer"] = True
    trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0)
    trainer.add_policy(mock_brain.brain_name, trainer.create_policy(mock_brain))
    trainer.update_buffer = mb.simulate_rollout(
        env, trainer.policy, BUFFER_INIT_SAMPLES
    )
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model(mock_brain.brain_name)
    # Wipe Trainer and try to load
    trainer2 = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, True, 0, 0)
    trainer2.add_policy(mock_brain.brain_name, trainer2.create_policy(mock_brain))
    assert trainer2.update_buffer.num_experiences == buffer_len
def test_sac_rnn_policy(dummy_config):
    """Recurrent SAC policy: evaluate a batched step, then run one update."""
    tf.reset_default_graph()
    policy = create_sac_policy_mock(
        dummy_config, use_rnn=True, use_discrete=True, use_visual=False
    )
    # Test evaluate
    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
    run_out = policy.evaluate(step, list(step.agent_id))
    assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
    # Test update
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain, memory_size=8)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    update_buffer = AgentBuffer()
    buffer.resequence_and_append(update_buffer, training_length=policy.sequence_length)
    run_out = policy.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // policy.sequence_length,
    )
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    """TF PPO optimizer: update() runs after mocking reward-signal outputs."""
    tf.reset_default_graph()
    optimizer = _create_ppo_optimizer_ops_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    for key in ("advantages", "extrinsic_returns", "extrinsic_value_estimates"):
        update_buffer[key] = update_buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
        )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    """PPO optimizer update returns all expected loss/policy stats."""
    tf.reset_default_graph()
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    for key in ("advantages", "extrinsic_returns", "extrinsic_value_estimates"):
        update_buffer[key] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch it is saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    for stat in (
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ):
        assert stat in return_stats.keys()
def test_sac_save_load_buffer(tmpdir, dummy_config):
    """SAC persists the replay buffer across save_model and trainer reload."""
    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.hyperparameters.save_replay_buffer = True
    trainer = SACTrainer("test", 1, trainer_params, True, False, 0, "testdir")
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    policy = trainer.create_policy(behavior_id, mock_specs)
    trainer.add_policy(behavior_id, policy)
    trainer.update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, policy.behavior_spec
    )
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model()
    # Wipe Trainer and try to load
    trainer2 = SACTrainer("test", 1, trainer_params, True, True, 0, "testdir")
    policy2 = trainer2.create_policy(behavior_id, mock_specs)
    trainer2.add_policy(behavior_id, policy2)
    assert trainer2.update_buffer.num_experiences == buffer_len
def reward_signal_eval(optimizer, reward_signal_name):
    """Evaluate the named reward signal on a batch and verify the reward shapes."""
    buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.behavior_spec)
    # Test evaluate
    result = optimizer.reward_signals[reward_signal_name].evaluate_batch(buffer)
    expected_shape = (BATCH_SIZE,)
    assert result.scaled_reward.shape == expected_shape
    assert result.unscaled_reward.shape == expected_shape
def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    """TF PPO + curiosity: optimizer.update() runs on a mocked rollout buffer."""
    tf.reset_default_graph()
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    for key in (
        "advantages",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "curiosity_returns",
        "curiosity_value_estimates",
    ):
        update_buffer[key] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch it is saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_sac_update_reward_signals(
    dummy_config, curiosity_dummy_config, discrete  # noqa: F811
):
    """SAC reward-signal update with a curiosity module reports curiosity losses."""
    # Add a Curiosity module
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_sac_optimizer_mock(
        dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False
    )
    # Test update, while removing PPO-specific buffer elements.
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval with the raw environment rewards.
    env_rewards = update_buffer[BufferKey.ENVIRONMENT_REWARDS]
    update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = env_rewards
    update_buffer[RewardSignalUtil.rewards_key("curiosity")] = env_rewards
    return_stats = optimizer.update_reward_signals(
        {"curiosity": update_buffer}, num_sequences=update_buffer.num_experiences
    )
    for stat in ("Losses/Curiosity Forward Loss", "Losses/Curiosity Inverse Loss"):
        assert stat in return_stats.keys()
def create_bc_trainer(dummy_config, is_discrete=False):
    """Build a BC trainer (plus its mock env) seeded with a demonstration rollout."""
    mock_env = mock.Mock()
    if is_discrete:
        mock_brain = mb.create_mock_pushblock_brain()
        obs_size = 70
    else:
        mock_brain = mb.create_mock_3dball_brain()
        obs_size = 8
    mock_braininfo = mb.create_mock_braininfo(
        num_agents=12, num_vector_observations=obs_size
    )
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()
    trainer_parameters = dummy_config
    trainer_parameters["summary_path"] = "tmp"
    trainer_parameters["model_path"] = "tmp"
    trainer_parameters["demo_path"] = (
        os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
    )
    trainer = BCTrainer(
        mock_brain, trainer_parameters, training=True, load=False, seed=0, run_id=0
    )
    trainer.demonstration_buffer = mb.simulate_rollout(env, trainer.policy, 100)
    return trainer, env
def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    """PPO + curiosity: optimizer.update() runs after mocking reward-signal outputs."""
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    for key in (
        "advantages",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "curiosity_returns",
        "curiosity_value_estimates",
    ):
        update_buffer[key] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
    """SAC optimizer update emits every expected loss and entropy-coefficient stat."""
    torch.manual_seed(0)
    optimizer = create_sac_optimizer_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec, memory_size=12
    )
    # Mock out reward signal eval
    update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = update_buffer[
        BufferKey.ENVIRONMENT_REWARDS
    ]
    # Mock out value memories
    update_buffer[BufferKey.CRITIC_MEMORY] = update_buffer[BufferKey.MEMORY]
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    for stat in (
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Losses/Q1 Loss",
        "Losses/Q2 Loss",
        "Policy/Continuous Entropy Coeff",
        "Policy/Discrete Entropy Coeff",
        "Policy/Learning Rate",
    ):
        assert stat in return_stats.keys()
def test_trainer_update_policy(dummy_config, use_discrete):
    """Recurrent PPO trainer with a dict-configured curiosity signal: update runs."""
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True
    # Test curiosity reward signal
    trainer_params["reward_signals"]["curiosity"] = {
        "strength": 1.0,
        "gamma": 0.99,
        "encoding_size": 128,
    }
    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    for key in (
        "extrinsic_rewards",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "curiosity_rewards",
        "curiosity_returns",
        "curiosity_value_estimates",
        "advantages",
    ):
        buffer[key] = buffer["environment_rewards"]
    trainer.update_buffer = buffer
    trainer._update_policy()
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    """PPO optimizer: update() returns the expected stat keys."""
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    for key in ("advantages", "extrinsic_returns", "extrinsic_value_estimates"):
        update_buffer[key] = update_buffer["environment_rewards"]
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    for stat in (
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ):
        assert stat in return_stats.keys()
def test_sac_save_load_buffer(tmpdir, dummy_config):
    """SAC replay buffer is restored after save_model and a fresh trainer load."""
    mock_brain = mb.setup_mock_brain(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.hyperparameters.save_replay_buffer = True
    trainer = SACTrainer(
        mock_brain.brain_name, 1, trainer_params, True, False, 0, "testdir"
    )
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    trainer.update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model(mock_brain.brain_name)
    # Wipe Trainer and try to load
    trainer2 = SACTrainer(
        mock_brain.brain_name, 1, trainer_params, True, True, 0, "testdir"
    )
    policy2 = trainer2.create_policy(mock_brain.brain_name, mock_brain)
    trainer2.add_policy(mock_brain.brain_name, policy2)
    assert trainer2.update_buffer.num_experiences == buffer_len
def test_ppo_optimizer_update_curiosity(
    curiosity_dummy_config, dummy_config, rnn, visual, discrete  # noqa: F811
):
    """TF PPO + curiosity (dict config): optimizer.update() runs on a mocked buffer."""
    tf.reset_default_graph()
    dummy_config["reward_signals"].update(curiosity_dummy_config)
    optimizer = _create_ppo_optimizer_ops_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
    # Mock out reward signal eval
    for key in (
        "advantages",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "curiosity_returns",
        "curiosity_value_estimates",
    ):
        update_buffer[key] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
    """SAC optimizer update emits the expected loss and policy stats."""
    torch.manual_seed(0)
    optimizer = create_sac_optimizer_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec, memory_size=24
    )
    # Mock out reward signal eval
    update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    for stat in (
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Losses/Q1 Loss",
        "Losses/Q2 Loss",
        "Policy/Entropy Coeff",
        "Policy/Learning Rate",
    ):
        assert stat in return_stats.keys()
def reward_signal_update(env, policy, reward_signal_name):
    """Run one update step of the named reward signal and verify it yields a dict."""
    buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
    signal = policy.reward_signals[reward_signal_name]
    feed_dict = signal.prepare_update(
        policy.model, buffer.update_buffer.make_mini_batch(0, 10), 2
    )
    out = policy._execute_model(feed_dict, signal.update_dict)
    assert type(out) is dict
def test_sac_update_reward_signals(mock_env, dummy_config, discrete):
    """TF SAC reward-signal update with a curiosity module added to the config."""
    tf.reset_default_graph()
    # Add a Curiosity module
    dummy_config["reward_signals"]["curiosity"] = {
        "strength": 1.0,
        "gamma": 0.99,
        "encoding_size": 128,
    }
    env, policy = create_sac_policy_mock(
        mock_env, dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False
    )
    # Test update, while removing PPO-specific buffer elements.
    update_buffer = mb.simulate_rollout(
        env, policy, BUFFER_INIT_SAMPLES, exclude_key_list=["advantages", "actions_pre"]
    )
    # Mock out reward signal eval
    update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
    update_buffer["curiosity_rewards"] = update_buffer["rewards"]
    policy.update_reward_signals(
        {"curiosity": update_buffer}, num_sequences=update_buffer.num_experiences
    )
    env.close()
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    """PyTorch PPO + GAIL: update() on a normal buffer and an over-sized one."""
    dummy_config.reward_signals = gail_dummy_config
    config = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
    optimizer = create_test_ppo_optimizer(
        config, use_rnn=False, use_discrete=False, use_visual=False
    )
    mocked_keys = (
        "advantages",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "gail_returns",
        "gail_value_estimates",
    )
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    for key in mocked_keys:
        update_buffer[key] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    for key in mocked_keys:
        update_buffer[key] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch it is saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    """PPO + GAIL: update() on a normal rollout buffer and an over-sized one."""
    dummy_config.reward_signals = gail_dummy_config
    optimizer = create_test_ppo_optimizer(
        ppo_dummy_config(), use_rnn=False, use_discrete=False, use_visual=False
    )
    mocked_keys = (
        "advantages",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "gail_returns",
        "gail_value_estimates",
    )
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    for key in mocked_keys:
        update_buffer[key] = update_buffer["environment_rewards"]
    update_buffer["continuous_log_probs"] = np.ones_like(
        update_buffer["continuous_action"]
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    for key in mocked_keys:
        update_buffer[key] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )