def create_optimizer_mock(
    trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
    """
    Build an optimizer wrapped around a freshly created NNPolicy for tests.

    The passed-in ``trainer_config`` is modified in place: its reward signals
    are replaced by ``reward_signal_config`` and its network memory settings
    are set (or cleared) according to ``use_rnn``.

    :param trainer_config: Trainer settings object used for policy and optimizer.
    :param reward_signal_config: Value assigned to ``reward_signals``.
    :param use_rnn: When truthy, memory settings are attached to the network.
    :param use_discrete: Selects the discrete action space for the test specs.
    :param use_visual: When truthy, the vector observation space is set to 0.
    :return: A SACOptimizer if the trainer type is SAC, otherwise a PPOOptimizer.
    """
    if use_discrete:
        action_space = DISCRETE_ACTION_SPACE
    else:
        action_space = VECTOR_ACTION_SPACE
    obs_space = 0 if use_visual else VECTOR_OBS_SPACE
    behavior_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=action_space,
        vector_obs_space=obs_space,
    )

    settings = trainer_config
    settings.reward_signals = reward_signal_config
    memory = None
    if use_rnn:
        memory = NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
    settings.network_settings.memory = memory

    policy = NNPolicy(
        0, behavior_specs, settings, False, "test", False, create_tf_graph=False
    )
    # Only the optimizer class differs between the two trainer types.
    optimizer_cls = (
        SACOptimizer if settings.trainer_type == TrainerType.SAC else PPOOptimizer
    )
    return optimizer_cls(policy, settings)
def create_optimizer_mock(
    trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
    """
    Build an optimizer wrapped around a freshly created NNPolicy for tests.

    The passed-in ``trainer_config`` dictionary is modified in place: the
    model path, checkpoint count and recurrent flag are overwritten and the
    existing ``reward_signals`` entry is updated with ``reward_signal_config``.

    :param trainer_config: Trainer parameter dictionary.
    :param reward_signal_config: Entries merged into ``reward_signals``.
    :param use_rnn: Value stored under ``use_recurrent``.
    :param use_discrete: Forwarded to the mock brain setup.
    :param use_visual: Forwarded to the mock brain setup.
    :return: A SACOptimizer if ``trainer`` is ``"sac"``, otherwise a PPOOptimizer.
    """
    brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    params = trainer_config
    params["model_path"] = "testpath"
    params["keep_checkpoints"] = 3
    params["reward_signals"].update(reward_signal_config)
    params["use_recurrent"] = use_rnn

    policy = NNPolicy(0, brain, params, False, False, create_tf_graph=False)
    # Only the optimizer class differs between the two trainer types.
    optimizer_cls = SACOptimizer if params["trainer"] == "sac" else PPOOptimizer
    return optimizer_cls(policy, params)
def add_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
    """
    Adds policy to trainer.

    Stores the policy on the trainer and in ``self.policies``, creates a
    SACOptimizer for it, creates one reward collector per optimizer reward
    signal, and sets the step counters from the policy's current step.

    :param parsed_behavior_id: Identifiers of the behavior; its
        ``behavior_id`` is the key under which the policy is stored.
    :param policy: The policy to train.
    """
    if self.policy:
        # A policy is already set; warn, then replace it below.
        # The message is built from adjacent literals so that no stray
        # backslash or continuation whitespace ends up in the log output.
        message = (
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial games."
        )
        logger.warning(message.format(self.__class__.__name__))
    self.policy = policy
    self.policies[parsed_behavior_id.behavior_id] = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    # Assume steps were updated at the correct ratio before
    self.update_steps = int(max(1, self.step / self.steps_per_update))
    self.reward_signal_update_steps = int(
        max(1, self.step / self.reward_signal_steps_per_update)
    )
def create_sac_optimizer(self) -> SACOptimizer:
    """
    Create the SAC optimizer matching this trainer's framework.

    :return: A TorchSACOptimizer when ``self.framework`` is
        ``FrameworkType.PYTORCH``, otherwise a SACOptimizer. Both are built
        from ``self.policy`` and ``self.trainer_settings``.
    """
    if self.framework == FrameworkType.PYTORCH:
        torch_policy = cast(TorchPolicy, self.policy)
        return TorchSACOptimizer(  # type: ignore
            torch_policy, self.trainer_settings  # type: ignore
        )  # type: ignore
    tf_policy = cast(TFPolicy, self.policy)
    return SACOptimizer(  # type: ignore
        tf_policy, self.trainer_settings  # type: ignore
    )  # type: ignore
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.

    :param name_behavior_id: Behavior name of the policy; not read by this method.
    :param policy: The policy to train; must be an NNPolicy instance.
    :raises RuntimeError: If ``policy`` is not an NNPolicy.
    """
    if self.policy:
        # A policy is already set; warn, then replace it below.
        trainer_name = self.__class__.__name__
        logger.warning(
            "add_policy has been called twice. {} is not a multi-agent trainer".format(
                trainer_name
            )
        )
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")

    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
    # One reward collector per reward signal known to the optimizer.
    for signal_name in self.optimizer.reward_signals.keys():
        self.collected_rewards[signal_name] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    """
    Build a SACOptimizer wrapped around a freshly created NNPolicy for tests.

    The passed-in ``dummy_config`` is modified in place: its network memory
    settings are set (or cleared) according to ``use_rnn``.

    :param dummy_config: Trainer settings object used for policy and optimizer.
    :param use_rnn: When truthy, memory settings are attached to the network.
    :param use_discrete: Forwarded to the mock brain setup.
    :param use_visual: Forwarded to the mock brain setup.
    :return: The SACOptimizer created for the policy.
    """
    brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    settings = dummy_config
    memory = None
    if use_rnn:
        memory = NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
    settings.network_settings.memory = memory

    policy = NNPolicy(
        0, brain, settings, False, "test", False, create_tf_graph=False
    )
    return SACOptimizer(policy, settings)
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.

    :param name_behavior_id: Behavior name of the policy; not read by this method.
    :param policy: The policy to train; must be an NNPolicy instance.
    :raises RuntimeError: If ``policy`` is not an NNPolicy.
    """
    if self.policy:
        # A policy is already set; warn, then replace it below.
        # The message is built from adjacent literals so that no stray
        # backslash or continuation whitespace ends up in the log output.
        message = (
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial games."
        )
        logger.warning(message.format(self.__class__.__name__))
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    """
    Build a SACOptimizer wrapped around a freshly created NNPolicy for tests.

    The passed-in ``dummy_config`` dictionary is modified in place: the model
    path, checkpoint count and recurrent flag are overwritten.

    :param dummy_config: Trainer parameter dictionary.
    :param use_rnn: Value stored under ``use_recurrent``.
    :param use_discrete: Forwarded to the mock brain setup.
    :param use_visual: Forwarded to the mock brain setup.
    :return: The SACOptimizer created for the policy.
    """
    brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    params = dummy_config
    params["model_path"] = "testmodel"
    params["keep_checkpoints"] = 3
    params["use_recurrent"] = use_rnn

    policy = NNPolicy(0, brain, params, False, False, create_tf_graph=False)
    return SACOptimizer(policy, params)
def add_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
    """
    Adds policy to trainer.

    :param parsed_behavior_id: Identifiers of the behavior the policy belongs
        to; not read by this method.
    :param policy: The policy to train; must be an NNPolicy instance.
    :raises RuntimeError: If ``policy`` is not an NNPolicy.
    """
    if self.policy:
        # A policy is already set; warn, then replace it below.
        # The message is built from adjacent literals so that no stray
        # backslash or continuation whitespace ends up in the log output.
        message = (
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial games."
        )
        logger.warning(message.format(self.__class__.__name__))
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    # Assume steps were updated at the correct ratio before
    self.update_steps = int(max(1, self.step / self.steps_per_update))
    self.reward_signal_update_steps = int(
        max(1, self.step / self.reward_signal_steps_per_update)
    )
def create_sac_optimizer(self) -> SACOptimizer:
    """
    Create the SAC optimizer for this trainer.

    :return: A SACOptimizer built from ``self.policy`` (cast to TFPolicy for
        the type checker) and ``self.trainer_settings``.
    """
    tf_policy = cast(TFPolicy, self.policy)
    optimizer = SACOptimizer(tf_policy, self.trainer_settings)
    return optimizer