def add_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
    """
    Adds policy to trainer.
    :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    if self.policy:
        logger.warning(
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial games.".format(
                self.__class__.__name__
            )
        )
    self.policy = policy
    self.policies[parsed_behavior_id.behavior_id] = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    # Assume steps were updated at the correct ratio before
    self.update_steps = int(max(1, self.step / self.steps_per_update))
    self.reward_signal_update_steps = int(
        max(1, self.step / self.reward_signal_steps_per_update)
    )
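
# A minimal standalone sketch of the resume arithmetic above (hypothetical numbers,
# not from a real run): when training resumes, the update counters are rebuilt from
# the restored step count so the steps-per-update ratio stays consistent with the
# run being resumed.
step = 10_000                          # as restored by policy.get_current_step()
steps_per_update = 4
reward_signal_steps_per_update = 8
update_steps = int(max(1, step / steps_per_update))
reward_signal_update_steps = int(max(1, step / reward_signal_steps_per_update))
assert (update_steps, reward_signal_update_steps) == (2500, 1250)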
def test_step_overflow():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        create_tf_graph=False,
    )
    policy.create_input_placeholders()
    policy.initialize()
    policy.set_step(2**31 - 1)  # 2147483647, the largest signed int32
    assert policy.get_current_step() == 2**31 - 1
    policy.increment_step(3)
    # The step count must survive crossing the int32 boundary without wrapping.
    assert policy.get_current_step() == 2**31 + 2
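
# A short illustration of why the test pins the step at 2**31 - 1: that is the
# largest signed 32-bit integer, so incrementing by 3 crosses the int32 boundary.
# Python ints are arbitrary-precision, but a counter stored in a raw int32 would
# wrap to a negative value, which is what the test guards against.
import ctypes

INT32_MAX = 2**31 - 1                                       # 2147483647
assert INT32_MAX + 3 == 2**31 + 2                           # value the test expects
assert ctypes.c_int32(INT32_MAX + 3).value == -2147483646   # what a raw int32 yields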
def add_policy(self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.
    :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    if self.policy:
        logger.warning(
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial games.".format(
                self.__class__.__name__
            )
        )
    self.policy = policy
    self.policies[parsed_behavior_id.behavior_id] = policy
    self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.
    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    if self.policy:
        logger.warning(
            "add_policy has been called twice. {} is not a multi-agent trainer".format(
                self.__class__.__name__
            )
        )
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-NNPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.
    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    if self.policy:
        logger.warning(
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial games.".format(
                self.__class__.__name__
            )
        )
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-NNPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.
    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    if self.policy:
        logger.warning(
            "add_policy has been called twice. {} is not a multi-agent trainer".format(
                self.__class__.__name__
            )
        )
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
    self.policy = policy
    self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def add_policy(self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.
    :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    if self.policy:
        logger.warning(
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial games.".format(
                self.__class__.__name__
            )
        )
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-NNPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    # Assume steps were updated at the correct ratio before
    self.update_steps = int(max(1, self.step / self.steps_per_update))
    self.reward_signal_update_steps = int(
        max(1, self.step / self.reward_signal_steps_per_update)
    )
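
# A condensed, framework-agnostic sketch of the pattern the add_policy variants above
# share: warn on a second add, type-check the policy, rebuild the optimizer, and
# restore counters from the policy's saved step. All names here are hypothetical
# stand-ins, not the real ML-Agents API.
import logging
from collections import defaultdict

logger = logging.getLogger(__name__)

class SingleAgentTrainer:
    def __init__(self, make_optimizer, steps_per_update: int = 1):
        self.policy = None
        self.make_optimizer = make_optimizer
        self.steps_per_update = steps_per_update
        self.collected_rewards: dict = {}

    def add_policy(self, policy) -> None:
        if self.policy:
            logger.warning(
                "add_policy has been called twice. %s is not a multi-agent trainer",
                self.__class__.__name__,
            )
        self.policy = policy
        self.optimizer = self.make_optimizer(policy)
        # One running-reward accumulator per reward signal, keyed by agent id.
        for name in self.optimizer.reward_signals:
            self.collected_rewards[name] = defaultdict(lambda: 0)
        # Restore the step counter so resumed runs keep the correct update ratio.
        self.step = policy.get_current_step()
        self.update_steps = int(max(1, self.step / self.steps_per_update))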