# Variant of _compare_two_policies built on the split vector/visual observation
# interface and the all_log_probs output of sample_actions.
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    vec_vis_obs, masks = policy1._split_decision_step(decision_step)
    vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
    vis_obs = [
        torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
    ]
    memories = torch.as_tensor(
        policy1.retrieve_memories(list(decision_step.agent_id))
    ).unsqueeze(0)

    with torch.no_grad():
        _, log_probs1, _, _, _ = policy1.sample_actions(
            vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
        )
        _, log_probs2, _, _, _ = policy2.sample_actions(
            vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
        )

    np.testing.assert_array_equal(log_probs1, log_probs2)
# Variant of _compare_two_policies built on the flat observation-list interface
# and the ActionLogProbs output (all_discrete_tensor) of sample_actions.
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    np_obs = decision_step.obs
    masks = policy1._extract_masks(decision_step)
    memories = torch.as_tensor(
        policy1.retrieve_memories(list(decision_step.agent_id))
    ).unsqueeze(0)
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    with torch.no_grad():
        _, log_probs1, _, _ = policy1.sample_actions(
            tensor_obs, masks=masks, memories=memories
        )
        _, log_probs2, _, _ = policy2.sample_actions(
            tensor_obs, masks=masks, memories=memories
        )

    np.testing.assert_array_equal(
        log_probs1.all_discrete_tensor, log_probs2.all_discrete_tensor
    )
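# Minimal usage sketch (not part of the original suite): assuming the
# create_policy_mock helper shown further down and default TrainerSettings,
# comparing a discrete-action policy against itself just exercises the
# helper's call shape; real tests would compare a policy against its
# checkpoint-restored copy.
def _example_compare_policy_with_itself():
    policy = create_policy_mock(TrainerSettings(), use_discrete=True)
    _compare_two_policies(policy, policy)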
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=12)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_brain, trainer_settings)
    optimizer = TorchSACOptimizer(policy, trainer_settings)
    return optimizer
def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPPOOptimizer(policy, trainer_settings)
    return optimizer
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TorchPolicy(
        0, mock_behavior_specs, trainer_config, tanhresample, tanhresample
    )
    bc_module = BCModule(
        policy,
        settings=bc_settings,
        policy_learning_rate=trainer_config.hyperparameters.learning_rate,
        default_batch_size=trainer_config.hyperparameters.batch_size,
        default_num_epoch=3,
    )
    return bc_module
def create_torch_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
    """
    Creates a policy with a PyTorch backend and PPO hyperparameters
    :param parsed_behavior_id:
    :param behavior_spec: specifications for policy construction
    :return policy
    """
    policy = TorchPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        condition_sigma_on_obs=False,  # Faster training for PPO
        separate_critic=behavior_spec.is_action_continuous(),
    )
    return policy
def create_torch_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
    """
    Creates a policy with a PyTorch backend and POCA hyperparameters
    :param parsed_behavior_id:
    :param behavior_spec: specifications for policy construction
    :return policy
    """
    policy = TorchPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        condition_sigma_on_obs=False,  # Faster training for POCA
        separate_critic=True,  # Match network architecture with TF
    )
    return policy
def create_torch_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
    """
    Creates a policy with a PyTorch backend and SAC hyperparameters
    :param parsed_behavior_id:
    :param behavior_spec: specifications for policy construction
    :return policy
    """
    policy = TorchPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        condition_sigma_on_obs=True,
        tanh_squash=True,
        separate_critic=True,
    )
    self.maybe_load_replay_buffer()
    return policy
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    seed: int = 0,
) -> TorchPolicy:
    mock_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TorchPolicy(seed, mock_spec, trainer_settings)
    return policy
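# Minimal usage sketch (hypothetical, not from the original file): assuming
# pytest is imported, the factory is typically driven by parametrizing the
# RNN / discrete / visual axes; the assertion only checks that construction
# succeeds for each combination.
@pytest.mark.parametrize("use_rnn", [False, True])
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("use_visual", [False, True])
def test_create_policy_mock_example(use_rnn, use_discrete, use_visual):
    policy = create_policy_mock(
        TrainerSettings(),
        use_rnn=use_rnn,
        use_discrete=use_discrete,
        use_visual=use_visual,
    )
    assert isinstance(policy, TorchPolicy)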
def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.reward_signals = {
        RewardSignalType.EXTRINSIC: RewardSignalSettings(strength=1.0, gamma=0.99)
    }
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=8, memory_size=10)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPOCAOptimizer(policy, trainer_settings)
    return optimizer
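# Hypothetical usage sketch for the optimizer factories above: assuming a
# pytest fixture `dummy_config` that supplies TrainerSettings with
# POCA-compatible hyperparameters, construction is exercised directly; the
# attribute checked (`optimizer.policy`) is assumed to be set by the shared
# torch optimizer base class.
def test_create_poca_optimizer_example(dummy_config):
    optimizer = create_test_poca_optimizer(
        dummy_config, use_rnn=False, use_discrete=True, use_visual=False
    )
    assert optimizer.policy is not None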