Example #1
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    brain_info_with_agents = BrainInfo([], [], [], agents=["an-agent-id"])
    result = policy.get_action(brain_info_with_agents)
    assert result == ActionInfo(None, None, None, None, {})
Example #2
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    # Doesn't really matter what this is
    dummy_groupspec = AgentGroupSpec([(1, )], "continuous", 1)
    no_agent_step = BatchedStepResult.empty(dummy_groupspec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()
Example #3
    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
        # for saving/swapping snapshots
        policy.init_load_weights()
        self.policies[name_behavior_id] = policy

        # First policy encountered
        if not self.learning_behavior_name:
            weights = policy.get_weights()
            self.current_policy_snapshot = weights
            self._save_snapshot(policy)
            self.trainer.add_policy(name_behavior_id, policy)
            self.learning_behavior_name = name_behavior_id
Example #4
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
    step_with_agents = BatchedStepResult(
        [],
        np.array([], dtype=np.float32),
        np.array([False], dtype=np.bool),
        np.array([], dtype=np.bool),
        np.array([0]),
        None,
    )
    result = policy.get_action(step_with_agents, worker_id=0)
    assert result == ActionInfo(None, None, {}, [0])
Example #5
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    brain_info_with_agents = BrainInfo([], [], [],
                                       agents=["an-agent-id"],
                                       local_done=[False])
    result = policy.get_action(brain_info_with_agents)
    expected = ActionInfo(policy_eval_out["action"], policy_eval_out["value"],
                          policy_eval_out)
    assert result == expected
Example #6
 def _save_snapshot(self, policy: TFPolicy) -> None:
     weights = policy.get_weights()
     try:
         self.policy_snapshots[self.snapshot_counter] = weights
     except IndexError:
         self.policy_snapshots.append(weights)
     self.policy_elos[self.snapshot_counter] = self.current_elo
     self.snapshot_counter = (self.snapshot_counter + 1) % self.window
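A minimal sketch of the ring-buffer behaviour in Example #6, with local stand-ins for the trainer's window, policy_snapshots, and snapshot_counter attributes (the weight strings are placeholders):

window = 3
policy_snapshots, snapshot_counter = [], 0
for weights in ["w0", "w1", "w2", "w3", "w4"]:
    try:
        # Overwrite the slot once the buffer has filled up to `window` entries.
        policy_snapshots[snapshot_counter] = weights
    except IndexError:
        # Still filling the window: grow the list instead.
        policy_snapshots.append(weights)
    snapshot_counter = (snapshot_counter + 1) % window
# policy_snapshots is now ["w3", "w4", "w2"]: the oldest snapshots are overwritten first.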
Example #7
    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
        """
        Adds policy to trainer. For the first policy added, add a trainer
        to the policy and set the learning behavior name to name_behavior_id.
        :param name_behavior_id: Behavior ID that the policy should belong to.
        :param policy: Policy to associate with name_behavior_id.
        """
        self.policies[name_behavior_id] = policy
        policy.create_tf_graph()

        # First policy encountered
        if not self.learning_behavior_name:
            weights = policy.get_weights()
            self.current_policy_snapshot = weights
            self.trainer.add_policy(name_behavior_id, policy)
            self._save_snapshot(policy)  # Need to save after trainer initializes policy
            self.learning_behavior_name = name_behavior_id
        else:
            # for saving/swapping snapshots
            policy.init_load_weights()
Example #8
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    step_with_agents = BatchedStepResult(
        [],
        np.array([], dtype=np.float32),
        np.array([False], dtype=np.bool),
        np.array([], dtype=np.bool),
        np.array([0]),
        None,
    )
    result = policy.get_action(step_with_agents)
    expected = ActionInfo(policy_eval_out["action"], policy_eval_out["value"],
                          policy_eval_out, [0])
    assert result == expected
Example #9
 def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
     :param name_behavior_id: Behavior ID that the policy should belong to.
     :param policy: Policy to associate with name_behavior_id.
     """
     if self.policy:
         logger.warning(
             "add_policy has been called twice. {} is not a multi-agent trainer"
             .format(self.__class__.__name__))
     if not isinstance(policy, SACPolicy):
         raise RuntimeError(
             "Non-SACPolicy passed to SACTrainer.add_policy()")
     self.policy = policy
     self.step = policy.get_current_step()
Example #10
 def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
     :param name_behavior_id: Behavior ID that the policy should belong to.
     :param policy: Policy to associate with name_behavior_id.
     """
     if self.policy:
         logger.warning(
             "add_policy has been called twice. {} is not a multi-agent trainer"
             .format(self.__class__.__name__))
     if not isinstance(policy, NNPolicy):
         raise RuntimeError(
             "Non-SACPolicy passed to SACTrainer.add_policy()")
     self.policy = policy
     self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     self.next_summary_step = self._get_next_summary_step()
Example #11
 def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
     :param name_behavior_id: Behavior ID that the policy should belong to.
     :param policy: Policy to associate with name_behavior_id.
     """
     if self.policy:
         logger.warning(
             "add_policy has been called twice. {} is not a multi-agent trainer"
             .format(self.__class__.__name__))
     if not isinstance(policy, NNPolicy):
         raise RuntimeError(
             "Non-NNPolicy passed to PPOTrainer.add_policy()")
     self.policy = policy
     self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     self.next_summary_step = self._get_next_summary_step()
Example #12
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope("optimizer/"):
                super().__init__(policy, trainer_params)

                lr = float(trainer_params["learning_rate"])
                lr_schedule = LearningRateSchedule(
                    trainer_params.get("learning_rate_schedule", "linear"))
                h_size = int(trainer_params["hidden_units"])
                epsilon = float(trainer_params["epsilon"])
                beta = float(trainer_params["beta"])
                max_step = float(trainer_params["max_steps"])
                num_layers = int(trainer_params["num_layers"])
                vis_encode_type = EncoderType(
                    trainer_params.get("vis_encode_type", "simple"))
                self.burn_in_ratio = float(
                    trainer_params.get("burn_in_ratio", 0.0))

                self.stream_names = list(self.reward_signals.keys())

                self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
                self.grads = None
                self.update_batch: Optional[tf.Operation] = None

                self.stats_name_to_update_name = {
                    "Losses/Value Loss": "value_loss",
                    "Losses/Policy Loss": "policy_loss",
                    "Policy/Learning Rate": "learning_rate",
                }
                if self.policy.use_recurrent:
                    self.m_size = self.policy.m_size
                    self.memory_in = tf.placeholder(
                        shape=[None, self.m_size],
                        dtype=tf.float32,
                        name="recurrent_value_in",
                    )

                if num_layers < 1:
                    num_layers = 1
                if policy.use_continuous_act:
                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
                else:
                    self._create_dc_critic(h_size, num_layers, vis_encode_type)

                self.learning_rate = ModelUtils.create_learning_rate(
                    lr_schedule, lr, self.policy.global_step, int(max_step))
                self._create_losses(
                    self.policy.log_probs,
                    self.old_log_probs,
                    self.value_heads,
                    self.policy.entropy,
                    beta,
                    epsilon,
                    lr,
                    max_step,
                )
                self._create_ppo_optimizer_ops()

            self.update_dict.update({
                "value_loss": self.value_loss,
                "policy_loss": self.abs_policy_loss,
                "update_batch": self.update_batch,
                "learning_rate": self.learning_rate,
            })

            self.policy.initialize_or_load()
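Example #12 reads a number of hyperparameter keys from trainer_params. A hedged sketch of a matching dictionary follows; the keys mirror what the constructor reads, while the concrete values and the extrinsic reward-signal entry are illustrative assumptions:

# Hypothetical hyperparameter dict; only the keys mirror what the constructor above reads.
trainer_params = {
    "learning_rate": 3.0e-4,
    "learning_rate_schedule": "linear",  # optional; defaults to "linear"
    "hidden_units": 128,
    "epsilon": 0.2,
    "beta": 5.0e-3,
    "max_steps": 5.0e5,
    "num_layers": 2,
    "vis_encode_type": "simple",         # optional; defaults to "simple"
    "burn_in_ratio": 0.0,                # optional; defaults to 0.0
    # Assumed to be consumed by the base optimizer to populate self.reward_signals.
    "reward_signals": {"extrinsic": {"gamma": 0.99, "strength": 1.0}},
}
# With an already-constructed TFPolicy, Example #11 suggests the call is simply:
# optimizer = PPOOptimizer(policy, trainer_params)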
Example #13
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    no_agent_brain_info = BrainInfo([], [], [], agents=[])
    result = policy.get_action(no_agent_brain_info)
    assert result == ActionInfo([], [], None)
Example #14
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate PPO agent model for the environment.
        :param brain: Brain parameters used to generate specific network graph.
        :param lr: Learning rate.
        :param lr_schedule: Learning rate decay schedule.
        :param h_size: Size of hidden layers
        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
            set higher to explore more.
        :return: a sub-class of PPOAgent tailored to the environment.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between encoded input and policy & value layers
        :param tau: Strength of soft-Q update.
        :param m_size: Size of brain memory.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope(""):
                super().__init__(policy, trainer_params)
                lr = float(trainer_params["learning_rate"])
                lr_schedule = LearningRateSchedule(
                    trainer_params.get("learning_rate_schedule", "constant"))
                self.policy = policy
                self.act_size = self.policy.act_size
                h_size = int(trainer_params["hidden_units"])
                max_step = float(trainer_params["max_steps"])
                num_layers = int(trainer_params["num_layers"])
                vis_encode_type = EncoderType(
                    trainer_params.get("vis_encode_type", "simple"))
                self.tau = trainer_params.get("tau", 0.005)
                self.burn_in_ratio = float(
                    trainer_params.get("burn_in_ratio", 0.0))

                # Non-exposed SAC parameters
                self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
                self.continuous_target_entropy_scale = 1.0

                self.init_entcoef = trainer_params.get("init_entcoef", 1.0)
                stream_names = list(self.reward_signals.keys())
                # Use to reduce "survivor bonus" when using Curiosity or GAIL.
                self.gammas = [
                    _val["gamma"]
                    for _val in trainer_params["reward_signals"].values()
                ]
                self.use_dones_in_backup = {
                    name: tf.Variable(1.0)
                    for name in stream_names
                }
                self.disable_use_dones = {
                    name: self.use_dones_in_backup[name].assign(0.0)
                    for name in stream_names
                }

                if num_layers < 1:
                    num_layers = 1

                self.target_init_op: List[tf.Tensor] = []
                self.target_update_op: List[tf.Tensor] = []
                self.update_batch_policy: Optional[tf.Operation] = None
                self.update_batch_value: Optional[tf.Operation] = None
                self.update_batch_entropy: Optional[tf.Operation] = None

                self.policy_network = SACPolicyNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 3x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                self.target_network = SACTargetNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 1x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
                self.m_size = 3 * self.policy.m_size
                self._create_inputs_and_outputs()
                self.learning_rate = ModelUtils.create_learning_rate(
                    lr_schedule, lr, self.policy.global_step, int(max_step))
                self._create_losses(
                    self.policy_network.q1_heads,
                    self.policy_network.q2_heads,
                    lr,
                    int(max_step),
                    stream_names,
                    discrete=not self.policy.use_continuous_act,
                )
                self._create_sac_optimizer_ops()

                # For GAIL and other reward signals
                self.selected_actions = self.policy.selected_actions
                if self.policy.normalize:
                    target_update_norm = self.target_network.copy_normalization(
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                    # Update the normalization of the optimizer when the policy does.
                    self.policy.update_normalization_op = tf.group([
                        self.policy.update_normalization_op, target_update_norm
                    ])

                self.policy.initialize_or_load()

        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
            "Losses/Q1 Loss": "q1_loss",
            "Losses/Q2 Loss": "q2_loss",
            "Policy/Entropy Coeff": "entropy_coef",
            "Policy/Learning Rate": "learning_rate",
        }

        self.update_dict = {
            "value_loss": self.total_value_loss,
            "policy_loss": self.policy_loss,
            "q1_loss": self.q1_loss,
            "q2_loss": self.q2_loss,
            "entropy_coef": self.ent_coef,
            "entropy": self.policy.entropy,
            "update_batch": self.update_batch_policy,
            "update_value": self.update_batch_value,
            "update_entropy": self.update_batch_entropy,
            "learning_rate": self.learning_rate,
        }