Example #1
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1)
    vec_vis_obs, masks = policy1._split_decision_step(decision_step)
    vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
    vis_obs = [
        torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
    ]
    memories = torch.as_tensor(
        policy1.retrieve_memories(list(decision_step.agent_id))).unsqueeze(0)

    with torch.no_grad():
        _, log_probs1, _, _, _ = policy1.sample_actions(vec_obs,
                                                        vis_obs,
                                                        masks=masks,
                                                        memories=memories,
                                                        all_log_probs=True)
        _, log_probs2, _, _, _ = policy2.sample_actions(vec_obs,
                                                        vis_obs,
                                                        masks=masks,
                                                        memories=memories,
                                                        all_log_probs=True)

    np.testing.assert_array_equal(log_probs1, log_probs2)
Example #2
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1)
    np_obs = decision_step.obs
    masks = policy1._extract_masks(decision_step)
    memories = torch.as_tensor(
        policy1.retrieve_memories(list(decision_step.agent_id))).unsqueeze(0)
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    with torch.no_grad():
        _, log_probs1, _, _ = policy1.sample_actions(tensor_obs,
                                                     masks=masks,
                                                     memories=memories)
        _, log_probs2, _, _ = policy2.sample_actions(tensor_obs,
                                                     masks=masks,
                                                     memories=memories)
    np.testing.assert_array_equal(log_probs1.all_discrete_tensor,
                                  log_probs2.all_discrete_tensor)
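Examples #1 and #2 are the same helper written against two revisions of the TorchPolicy API: the first splits vector and visual observations and requests all_log_probs, the second passes a single observation list and compares the all_discrete_tensor of the returned log-probabilities object. A minimal, hedged usage sketch (assuming create_policy_mock from Example #9 below is importable from the same test module and that TrainerSettings() builds a default config; comparing a discrete-action policy against itself should trivially pass, since the full log-probability tensor is deterministic for a fixed input):

policy = create_policy_mock(TrainerSettings())
_compare_two_policies(policy, policy)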
Example #3
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=12) if use_rnn else None)
    policy = TorchPolicy(0, mock_brain, trainer_settings)
    optimizer = TorchSACOptimizer(policy, trainer_settings)
    return optimizer
Example #4
def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPPOOptimizer(policy, trainer_settings)
    return optimizer
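A hedged usage sketch for the optimizer factory above (dummy_config is assumed to be a pytest fixture yielding PPO TrainerSettings, and the policy attribute is assumed to be the reference the optimizer keeps to the policy it was built from):

optimizer = create_test_ppo_optimizer(dummy_config, use_rnn=False, use_discrete=True, use_visual=False)
assert optimizer.policy is not None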
Example #5
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (NetworkSettings.MemorySettings()
                                              if use_rnn else None)
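    # NOTE: tanhresample is deliberately passed twice below; it appears to toggle both
    # the tanh-squash and reparameterized-sampling flags of TorchPolicy (assumed).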
    policy = TorchPolicy(0, mock_behavior_specs, trainer_config, tanhresample,
                         tanhresample)
    bc_module = BCModule(
        policy,
        settings=bc_settings,
        policy_learning_rate=trainer_config.hyperparameters.learning_rate,
        default_batch_size=trainer_config.hyperparameters.batch_size,
        default_num_epoch=3,
    )
    return bc_module
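A hedged usage sketch (mock_behavior_specs is assumed to be a behavior-spec fixture, and bc_settings is assumed to be built from mlagents' BehavioralCloningSettings with a placeholder demo path):

bc_settings = BehavioralCloningSettings(demo_path="demos/Test.demo")
bc_module = create_bc_module(mock_behavior_specs, bc_settings, use_rnn=False, tanhresample=False)
stats = bc_module.update()  # returns a dict of behavioral-cloning loss statistics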
Example #6
def create_torch_policy(self, parsed_behavior_id: BehaviorIdentifiers,
                        behavior_spec: BehaviorSpec) -> TorchPolicy:
    """
    Creates a policy with a PyTorch backend and PPO hyperparameters
    :param parsed_behavior_id:
    :param behavior_spec: specifications for policy construction
    :return: policy
    """
    policy = TorchPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        condition_sigma_on_obs=False,  # Faster training for PPO
        separate_critic=behavior_spec.is_action_continuous(),
    )
    return policy
Example #7
def create_torch_policy(self, parsed_behavior_id: BehaviorIdentifiers,
                        behavior_spec: BehaviorSpec) -> TorchPolicy:
    """
    Creates a policy with a PyTorch backend and POCA hyperparameters
    :param parsed_behavior_id:
    :param behavior_spec: specifications for policy construction
    :return: policy
    """
    policy = TorchPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        condition_sigma_on_obs=False,  # Faster training for POCA
        separate_critic=True,  # Match network architecture with TF
    )
    return policy
Example #8
File: trainer.py Project: terite/HexChess
def create_torch_policy(self, parsed_behavior_id: BehaviorIdentifiers,
                        behavior_spec: BehaviorSpec) -> TorchPolicy:
    """
    Creates a policy with a PyTorch backend and SAC hyperparameters
    :param parsed_behavior_id:
    :param behavior_spec: specifications for policy construction
    :return: policy
    """
    policy = TorchPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        condition_sigma_on_obs=True,
        tanh_squash=True,
        separate_critic=True,
    )
    self.maybe_load_replay_buffer()
    return policy
Example #9
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    seed: int = 0,
) -> TorchPolicy:
    mock_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None)
    policy = TorchPolicy(seed, mock_spec, trainer_settings)
    return policy
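A hedged sketch of driving create_policy_mock from pytest (the dummy_config fixture and the parametrize ids are assumptions about the surrounding test module):

import pytest

@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "ff"])
def test_create_policy_mock(dummy_config, rnn, discrete):
    policy = create_policy_mock(dummy_config, use_rnn=rnn, use_discrete=discrete)
    assert isinstance(policy, TorchPolicy)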
Example #10
def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete,
                               use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.reward_signals = {
        RewardSignalType.EXTRINSIC: RewardSignalSettings(strength=1.0,
                                                         gamma=0.99)
    }

    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=8, memory_size=10) if use_rnn else None)
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPOCAOptimizer(policy, trainer_settings)
    return optimizer