예제 #1
0
def create_optimizer_mock(trainer_config, reward_signal_config, use_rnn,
                          use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = trainer_config
    trainer_settings.reward_signals = reward_signal_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = NNPolicy(0,
                      mock_specs,
                      trainer_settings,
                      False,
                      "test",
                      False,
                      create_tf_graph=False)
    if trainer_settings.trainer_type == TrainerType.SAC:
        optimizer = SACOptimizer(policy, trainer_settings)
    else:
        optimizer = PPOOptimizer(policy, trainer_settings)
    return optimizer
def create_optimizer_mock(trainer_config, reward_signal_config, use_rnn,
                          use_discrete, use_visual):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_parameters = trainer_config
    model_path = "testpath"
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    trainer_parameters["reward_signals"].update(reward_signal_config)
    trainer_parameters["use_recurrent"] = use_rnn
    policy = NNPolicy(0,
                      mock_brain,
                      trainer_parameters,
                      False,
                      False,
                      create_tf_graph=False)
    if trainer_parameters["trainer"] == "sac":
        optimizer = SACOptimizer(policy, trainer_parameters)
    else:
        optimizer = PPOOptimizer(policy, trainer_parameters)
    return optimizer
예제 #3
0
파일: trainer.py 프로젝트: yy404/ml-agents
 def add_policy(
     self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
 ) -> None:
     """
     Adds policy to trainer.
     """
     if self.policy:
         logger.warning(
             "Your environment contains multiple teams, but {} doesn't support adversarial games. Enable self-play to \
                 train adversarial games.".format(
                 self.__class__.__name__
             )
         )
     self.policy = policy
     self.policies[parsed_behavior_id.behavior_id] = policy
     self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     # Assume steps were updated at the correct ratio before
     self.update_steps = int(max(1, self.step / self.steps_per_update))
     self.reward_signal_update_steps = int(
         max(1, self.step / self.reward_signal_steps_per_update)
     )
예제 #4
0
 def create_sac_optimizer(self) -> SACOptimizer:
     if self.framework == FrameworkType.PYTORCH:
         return TorchSACOptimizer(  # type: ignore
             cast(TorchPolicy, self.policy),
             self.trainer_settings  # type: ignore
         )  # type: ignore
     else:
         return SACOptimizer(  # type: ignore
             cast(TFPolicy, self.policy),
             self.trainer_settings  # type: ignore
         )  # type: ignore
예제 #5
0
 def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
     :param brain_parameters: specifications for policy construction
     """
     if self.policy:
         logger.warning(
             "add_policy has been called twice. {} is not a multi-agent trainer"
             .format(self.__class__.__name__))
     if not isinstance(policy, NNPolicy):
         raise RuntimeError(
             "Non-SACPolicy passed to SACTrainer.add_policy()")
     self.policy = policy
     self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     self.next_summary_step = self._get_next_summary_step()
예제 #6
0
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = NNPolicy(
        0, mock_brain, trainer_settings, False, "test", False, create_tf_graph=False
    )
    optimizer = SACOptimizer(policy, trainer_settings)
    return optimizer
예제 #7
0
 def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
     :param brain_parameters: specifications for policy construction
     """
     if self.policy:
         logger.warning(
             "Your environment contains multiple teams, but {} doesn't support adversarial games. Enable self-play to \
                 train adversarial games.".format(self.__class__.__name__))
     if not isinstance(policy, NNPolicy):
         raise RuntimeError(
             "Non-SACPolicy passed to SACTrainer.add_policy()")
     self.policy = policy
     self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     self.next_summary_step = self._get_next_summary_step()
예제 #8
0
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_parameters = dummy_config
    model_path = "testmodel"
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    trainer_parameters["use_recurrent"] = use_rnn
    policy = NNPolicy(
        0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
    )
    optimizer = SACOptimizer(policy, trainer_parameters)
    return optimizer
예제 #9
0
 def add_policy(self, parsed_behavior_id: BehaviorIdentifiers,
                policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
     :param brain_parameters: specifications for policy construction
     """
     if self.policy:
         logger.warning(
             "Your environment contains multiple teams, but {} doesn't support adversarial games. Enable self-play to \
                 train adversarial games.".format(self.__class__.__name__))
     if not isinstance(policy, NNPolicy):
         raise RuntimeError(
             "Non-SACPolicy passed to SACTrainer.add_policy()")
     self.policy = policy
     self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     # Assume steps were updated at the correct ratio before
     self.update_steps = int(max(1, self.step / self.steps_per_update))
     self.reward_signal_update_steps = int(
         max(1, self.step / self.reward_signal_steps_per_update))
예제 #10
0
 def create_sac_optimizer(self) -> SACOptimizer:
     return SACOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)