Example No. 1
def __init__(
    self, m_size, normalize, use_recurrent, brain, seed, stream_names=None
):
    tf.set_random_seed(seed)
    self.brain = brain
    self.vector_in = None
    self.global_step, self.increment_step, self.steps_to_increment = (
        self.create_global_steps()
    )
    self.visual_in = []
    self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name="batch_size")
    self.sequence_length = tf.placeholder(
        shape=None, dtype=tf.int32, name="sequence_length"
    )
    self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name="masks")
    self.mask = tf.cast(self.mask_input, tf.int32)
    self.stream_names = stream_names or []
    self.use_recurrent = use_recurrent
    if self.use_recurrent:
        self.m_size = m_size
    else:
        self.m_size = 0
    self.normalize = normalize
    self.act_size = brain.vector_action_space_size
    self.vec_obs_size = brain.vector_observation_space_size
    self.vis_obs_size = brain.number_visual_observations
    tf.Variable(
        int(brain.vector_action_space_type == "continuous"),
        name="is_continuous_control",
        trainable=False,
        dtype=tf.int32,
    )
    tf.Variable(
        self._version_number_,
        name="version_number",
        trainable=False,
        dtype=tf.int32,
    )
    tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
    if brain.vector_action_space_type == "continuous":
        tf.Variable(
            self.act_size[0],
            name="action_output_shape",
            trainable=False,
            dtype=tf.int32,
        )
    else:
        tf.Variable(
            sum(self.act_size),
            name="action_output_shape",
            trainable=False,
            dtype=tf.int32,
        )
    self.value_heads: Dict[str, tf.Tensor] = {}
    self.normalization_steps: Optional[tf.Variable] = None
    self.running_mean: Optional[tf.Variable] = None
    self.running_variance: Optional[tf.Variable] = None
    self.update_normalization: Optional[tf.Operation] = None
    self.value: Optional[tf.Tensor] = None
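Note: the non-trainable tf.Variable calls above (is_continuous_control, version_number, memory_size, action_output_shape) exist only to embed model metadata in the graph so it can be read back after export. A minimal sketch of reading one of them back, assuming the constructor above has already run in the default graph and TensorFlow 1.x is imported as tf:

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # "memory_size:0" is the output tensor of the tf.Variable named "memory_size" above
    memory_size = tf.get_default_graph().get_tensor_by_name("memory_size:0")
    print(sess.run(memory_size))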
Example No. 2
def test_average_gradients(mock_get_devices, dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = [
        "/device:GPU:0",
        "/device:GPU:1",
        "/device:GPU:2",
        "/device:GPU:3",
    ]

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    with tf.Session() as sess:
        policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
        var = tf.Variable(0)
        tower_grads = [
            [(tf.constant(0.1), var)],
            [(tf.constant(0.2), var)],
            [(tf.constant(0.3), var)],
            [(tf.constant(0.4), var)],
        ]
        avg_grads = policy.average_gradients(tower_grads)

        init = tf.global_variables_initializer()
        sess.run(init)
        run_out = sess.run(avg_grads)
    assert run_out == [(0.25, 0)]
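For context, average_gradients is expected to average each variable's gradient across the GPU towers, which is why four gradients of 0.1 through 0.4 come back as a single (0.25, 0) pair. A minimal sketch of that reduction (an illustrative stand-in, not the actual MultiGpuPPOPolicy implementation):

def average_gradients_sketch(tower_grads):
    # tower_grads: one list of (gradient, variable) pairs per tower
    averaged = []
    for grad_and_vars in zip(*tower_grads):  # group the same variable across towers
        grads = tf.stack([g for g, _ in grad_and_vars], axis=0)
        mean_grad = tf.reduce_mean(grads, axis=0)  # e.g. (0.1 + 0.2 + 0.3 + 0.4) / 4 == 0.25
        averaged.append((mean_grad, grad_and_vars[0][1]))
    return averaged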
Example No. 3
def test_tanh_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[0, 0]],
                             trainable=True,
                             dtype=tf.float32)
        distribution = GaussianDistribution(logits,
                                            act_size=VECTOR_ACTION_SPACE,
                                            reparameterize=False,
                                            tanh_squash=True)
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                output = sess.run(
                    [distribution.sample, distribution.log_probs])
                for out in output:
                    assert out.shape[1] == VECTOR_ACTION_SPACE[0]
                # Assert action never exceeds [-1,1]
                action = output[0][0]
                for act in action:
                    assert act >= -1 and act <= 1
                output = sess.run([distribution.total_log_probs])
                assert output[0].shape[0] == 1
Example No. 4
def create_schedule(
    schedule: ScheduleType,
    parameter: float,
    global_step: tf.Tensor,
    max_step: int,
    min_value: float,
) -> tf.Tensor:
    """
    Create a tensor whose value follows the given schedule (e.g. a learning rate).
    :param schedule: Type of schedule (constant or linear decay).
    :param parameter: Base value of the scheduled parameter.
    :param global_step: A TF Tensor representing the total global step.
    :param max_step: The maximum number of steps in the training run.
    :param min_value: Value the parameter decays to when the schedule is linear.
    :return: A Tensor containing the scheduled parameter.
    """
    if schedule == ScheduleType.CONSTANT:
        parameter_rate = tf.Variable(parameter, trainable=False)
    elif schedule == ScheduleType.LINEAR:
        parameter_rate = tf.train.polynomial_decay(
            parameter, global_step, max_step, min_value, power=1.0
        )
    else:
        raise UnityTrainerException(f"The schedule {schedule} is invalid.")
    return parameter_rate
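A hypothetical usage sketch of create_schedule (the 3e-4 base value and 500000 max_step are illustrative only): linearly decay a coefficient from its base value down to min_value over the training run.

global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
beta = ModelUtils.create_schedule(
    ScheduleType.LINEAR, 3e-4, global_step, max_step=500000, min_value=1e-10
)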
Example No. 5
def test_gaussian_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[1, 1]],
                             trainable=True,
                             dtype=tf.float32)
        distribution = GaussianDistribution(
            logits,
            act_size=VECTOR_ACTION_SPACE,
            reparameterize=False,
            tanh_squash=False,
        )
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                output = sess.run(
                    [distribution.sample, distribution.log_probs])
                for out in output:
                    assert out.shape[1] == VECTOR_ACTION_SPACE[0]
                output = sess.run([distribution.total_log_probs])
                assert output[0].shape[0] == 1
            # Test entropy is correct
            log_std_tensor = tf.get_default_graph().get_tensor_by_name(
                "log_std/BiasAdd:0")
            feed_dict = {log_std_tensor: [[1.0, 1.0]]}
            entropy = sess.run([distribution.entropy], feed_dict=feed_dict)
            # Entropy with log_std of 1.0 should be 2.42
            assert pytest.approx(entropy[0], 0.01) == 2.42
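Where the 2.42 in the comment above comes from (assuming a diagonal Gaussian): the per-dimension entropy is 0.5 * ln(2 * pi * e) + log_std, so with log_std = 1.0:

import math
print(0.5 * math.log(2 * math.pi * math.e) + 1.0)  # ~2.4189, matching the 2.42 check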
Example No. 6
    def create_loss(self, learning_rate: float, anneal_steps: int) -> None:
        """
        Creates the loss and update nodes for the BC module
        :param learning_rate: The learning rate for the optimizer
        :param anneal_steps: Number of steps over which to anneal the learning_rate
        """
        selected_action = self.policy.output
        if self.policy.use_continuous_act:
            self.loss = tf.reduce_mean(
                tf.squared_difference(selected_action, self.expert_action))
        else:
            log_probs = self.policy.all_log_probs
            self.loss = tf.reduce_mean(
                -tf.log(tf.nn.softmax(log_probs) + 1e-7) * self.expert_action)

        if anneal_steps > 0:
            self.annealed_learning_rate = tf.train.polynomial_decay(
                learning_rate,
                self.policy.global_step,
                anneal_steps,
                0.0,
                power=1.0)
        else:
            self.annealed_learning_rate = tf.Variable(learning_rate)

        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.annealed_learning_rate, name="bc_adam")
        self.update_batch = optimizer.minimize(self.loss)
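As a quick, self-contained illustration of the discrete branch of the loss above (dummy constants standing in for the policy outputs, assuming TensorFlow 1.x): softmax the logits and penalize only the branch the expert actually chose.

import tensorflow as tf

log_probs = tf.constant([[2.0, 0.5, -1.0]])     # stand-in for policy.all_log_probs
expert_action = tf.constant([[0.0, 1.0, 0.0]])  # one-hot expert action
loss = tf.reduce_mean(-tf.log(tf.nn.softmax(log_probs) + 1e-7) * expert_action)
with tf.Session() as sess:
    print(sess.run(loss))  # only the non-zero entry of expert_action contributes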
Example No. 7
def create_learning_rate(
    lr_schedule: LearningRateSchedule,
    lr: float,
    global_step: tf.Tensor,
    max_step: int,
) -> tf.Tensor:
    """
    Create a learning rate tensor.
    :param lr_schedule: Type of learning rate schedule.
    :param lr: Base learning rate.
    :param global_step: A TF Tensor representing the total global step.
    :param max_step: The maximum number of steps in the training run.
    :return: A Tensor containing the learning rate.
    """
    if lr_schedule == LearningRateSchedule.CONSTANT:
        learning_rate = tf.Variable(lr)
    elif lr_schedule == LearningRateSchedule.LINEAR:
        learning_rate = tf.train.polynomial_decay(
            lr, global_step, max_step, 1e-10, power=1.0
        )
    else:
        raise UnityTrainerException(
            "The learning rate schedule {} is invalid.".format(lr_schedule)
        )
    return learning_rate
Example No. 8
def test_multicategorical_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[0, 0]],
                             trainable=True,
                             dtype=tf.float32)
        action_masks = tf.Variable(
            initial_value=[[1 for _ in range(sum(DISCRETE_ACTION_SPACE))]],
            trainable=True,
            dtype=tf.float32,
        )
        distribution = MultiCategoricalDistribution(
            logits, act_size=DISCRETE_ACTION_SPACE, action_masks=action_masks)
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                sample, log_probs, entropy = sess.run([
                    distribution.sample, distribution.log_probs,
                    distribution.entropy
                ])
                assert len(log_probs[0]) == sum(DISCRETE_ACTION_SPACE)
                # Assert each sampled action stays within its branch's valid range
                assert len(sample[0]) == len(DISCRETE_ACTION_SPACE)
                for i, act in enumerate(sample[0]):
                    assert act >= 0 and act <= DISCRETE_ACTION_SPACE[i]
                output = sess.run([distribution.total_log_probs])
                assert output[0].shape[0] == 1
                # Make sure entropy is correct
                assert entropy[0] > 3.8

            # Test masks
            mask = []
            for space in DISCRETE_ACTION_SPACE:
                mask.append(1)
                for _action_space in range(1, space):
                    mask.append(0)
            for _ in range(10):
                sample, log_probs = sess.run(
                    [distribution.sample, distribution.log_probs],
                    feed_dict={action_masks: [mask]},
                )
                for act in sample[0]:
                    assert act >= 0 and act <= 1
                output = sess.run([distribution.total_log_probs])
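On the entropy check earlier in this test: with uniform logits, the total entropy of independent categorical branches is the sum of ln(branch_size). For a hypothetical DISCRETE_ACTION_SPACE of [3, 3, 3, 2] (the actual value is defined elsewhere in the test module):

import math
print(sum(math.log(n) for n in [3, 3, 3, 2]))  # ~3.99, comfortably above the 3.8 bound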
Example No. 9
def create_global_steps():
    """Creates TF ops to track and increment global training step."""
    global_step = tf.Variable(
        0, name="global_step", trainable=False, dtype=tf.int32
    )
    steps_to_increment = tf.placeholder(
        shape=[], dtype=tf.int32, name="steps_to_increment"
    )
    increment_step = tf.assign(global_step, tf.add(global_step, steps_to_increment))
    return global_step, increment_step, steps_to_increment
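A minimal usage sketch of create_global_steps, using the TF 1.x session API as in the other examples (assumes tensorflow is imported as tf):

global_step, increment_step, steps_to_increment = create_global_steps()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(increment_step, feed_dict={steps_to_increment: 64})
    print(sess.run(global_step))  # 64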
Example No. 10
def create_learning_rate(
    lr_schedule: LearningRateSchedule,
    lr: float,
    global_step: tf.Tensor,
    max_step: int,
) -> tf.Tensor:
    if lr_schedule == LearningRateSchedule.CONSTANT:
        learning_rate = tf.Variable(lr)
    elif lr_schedule == LearningRateSchedule.LINEAR:
        learning_rate = tf.train.polynomial_decay(
            lr, global_step, max_step, 1e-10, power=1.0
        )
    else:
        raise UnityTrainerException(
            "The learning rate schedule {} is invalid.".format(lr_schedule)
        )
    return learning_rate
Example No. 11
    def create_input_placeholders(self):
        with self.graph.as_default():
            (
                self.global_step,
                self.increment_step_op,
                self.steps_to_increment,
            ) = ModelUtils.create_global_steps()
            self.vector_in, self.visual_in = ModelUtils.create_input_placeholders(
                self.behavior_spec.observation_shapes)
            if self.normalize:
                self.first_normalization_update = True
                normalization_tensors = ModelUtils.create_normalizer(
                    self.vector_in)
                self.update_normalization_op = normalization_tensors.update_op
                self.init_normalization_op = normalization_tensors.init_op
                self.normalization_steps = normalization_tensors.steps
                self.running_mean = normalization_tensors.running_mean
                self.running_variance = normalization_tensors.running_variance
                self.processed_vector_in = ModelUtils.normalize_vector_obs(
                    self.vector_in,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )
            else:
                self.processed_vector_in = self.vector_in
                self.update_normalization_op = None

            self.batch_size_ph = tf.placeholder(shape=None,
                                                dtype=tf.int32,
                                                name="batch_size")
            self.sequence_length_ph = tf.placeholder(shape=None,
                                                     dtype=tf.int32,
                                                     name="sequence_length")
            self.mask_input = tf.placeholder(shape=[None],
                                             dtype=tf.float32,
                                             name="masks")
            # Only needed for PPO, but also required by the BC module
            self.epsilon = tf.placeholder(shape=[None, self.act_size[0]],
                                          dtype=tf.float32,
                                          name="epsilon")
            self.mask = tf.cast(self.mask_input, tf.int32)

            tf.Variable(
                int(self.behavior_spec.is_action_continuous()),
                name="is_continuous_control",
                trainable=False,
                dtype=tf.int32,
            )
            int_version = TFPolicy._convert_version_string(__version__)
            major_ver_t = tf.Variable(
                int_version[0],
                name="trainer_major_version",
                trainable=False,
                dtype=tf.int32,
            )
            minor_ver_t = tf.Variable(
                int_version[1],
                name="trainer_minor_version",
                trainable=False,
                dtype=tf.int32,
            )
            patch_ver_t = tf.Variable(
                int_version[2],
                name="trainer_patch_version",
                trainable=False,
                dtype=tf.int32,
            )
            self.version_tensors = (major_ver_t, minor_ver_t, patch_ver_t)
            tf.Variable(
                MODEL_FORMAT_VERSION,
                name="version_number",
                trainable=False,
                dtype=tf.int32,
            )
            tf.Variable(self.m_size,
                        name="memory_size",
                        trainable=False,
                        dtype=tf.int32)
            if self.behavior_spec.is_action_continuous():
                tf.Variable(
                    self.act_size[0],
                    name="action_output_shape",
                    trainable=False,
                    dtype=tf.int32,
                )
            else:
                tf.Variable(
                    sum(self.act_size),
                    name="action_output_shape",
                    trainable=False,
                    dtype=tf.int32,
                )
Example No. 12
    def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate PPO agent model for the environment.
        :param brain: Brain parameters used to generate specific network graph.
        :param lr: Learning rate.
        :param lr_schedule: Learning rate decay schedule.
        :param h_size: Size of hidden layers
        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
            set higher to explore more.
        :return: a sub-class of PPOAgent tailored to the environment.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between encoded input and policy & value layers
        :param tau: Strength of soft-Q update.
        :param m_size: Size of brain memory.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope(""):
                super().__init__(policy, trainer_params)
                hyperparameters: SACSettings = cast(
                    SACSettings, trainer_params.hyperparameters)
                lr = hyperparameters.learning_rate
                lr_schedule = hyperparameters.learning_rate_schedule
                max_step = trainer_params.max_steps
                self.tau = hyperparameters.tau
                self.init_entcoef = hyperparameters.init_entcoef

                self.policy = policy
                self.act_size = policy.act_size
                policy_network_settings = policy.network_settings
                h_size = policy_network_settings.hidden_units
                num_layers = policy_network_settings.num_layers
                vis_encode_type = policy_network_settings.vis_encode_type

                self.tau = hyperparameters.tau
                self.burn_in_ratio = 0.0

                # Non-exposed SAC parameters
                self.discrete_target_entropy_scale = (
                    0.2  # Roughly equal to e-greedy 0.05
                )
                self.continuous_target_entropy_scale = 1.0

                stream_names = list(self.reward_signals.keys())
                # Used to reduce "survivor bonus" when using Curiosity or GAIL.
                self.gammas = [
                    _val.gamma
                    for _val in trainer_params.reward_signals.values()
                ]
                self.use_dones_in_backup = {
                    name: tf.Variable(1.0)
                    for name in stream_names
                }
                self.disable_use_dones = {
                    name: self.use_dones_in_backup[name].assign(0.0)
                    for name in stream_names
                }

                if num_layers < 1:
                    num_layers = 1

                self.target_init_op: List[tf.Tensor] = []
                self.target_update_op: List[tf.Tensor] = []
                self.update_batch_policy: Optional[tf.Operation] = None
                self.update_batch_value: Optional[tf.Operation] = None
                self.update_batch_entropy: Optional[tf.Operation] = None

                self.policy_network = SACPolicyNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 3x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                self.target_network = SACTargetNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 1x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
                self.m_size = 3 * self.policy.m_size
                self._create_inputs_and_outputs()
                self.learning_rate = ModelUtils.create_schedule(
                    lr_schedule,
                    lr,
                    self.policy.global_step,
                    int(max_step),
                    min_value=1e-10,
                )
                self._create_losses(
                    self.policy_network.q1_heads,
                    self.policy_network.q2_heads,
                    lr,
                    int(max_step),
                    stream_names,
                    discrete=not self.policy.use_continuous_act,
                )
                self._create_sac_optimizer_ops()

                self.selected_actions = (self.policy.selected_actions
                                         )  # For GAIL and other reward signals
                if self.policy.normalize:
                    target_update_norm = self.target_network.copy_normalization(
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                    # Update the normalization of the optimizer when the policy does.
                    self.policy.update_normalization_op = tf.group([
                        self.policy.update_normalization_op, target_update_norm
                    ])

        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
            "Losses/Q1 Loss": "q1_loss",
            "Losses/Q2 Loss": "q2_loss",
            "Policy/Entropy Coeff": "entropy_coef",
            "Policy/Learning Rate": "learning_rate",
        }

        self.update_dict = {
            "value_loss": self.total_value_loss,
            "policy_loss": self.policy_loss,
            "q1_loss": self.q1_loss,
            "q2_loss": self.q2_loss,
            "entropy_coef": self.ent_coef,
            "update_batch": self.update_batch_policy,
            "update_value": self.update_batch_value,
            "update_entropy": self.update_batch_entropy,
            "learning_rate": self.learning_rate,
        }
Example No. 13
    def create_input_placeholders(self):
        with self.graph.as_default():
            (
                self.global_step,
                self.increment_step_op,
                self.steps_to_increment,
            ) = ModelUtils.create_global_steps()
            self.visual_in = ModelUtils.create_visual_input_placeholders(
                self.brain.camera_resolutions
            )
            self.vector_in = ModelUtils.create_vector_input(self.vec_obs_size)
            if self.normalize:
                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
                self.update_normalization_op = normalization_tensors.update_op
                self.normalization_steps = normalization_tensors.steps
                self.running_mean = normalization_tensors.running_mean
                self.running_variance = normalization_tensors.running_variance
                self.processed_vector_in = ModelUtils.normalize_vector_obs(
                    self.vector_in,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )
            else:
                self.processed_vector_in = self.vector_in
                self.update_normalization_op = None

            self.batch_size_ph = tf.placeholder(
                shape=None, dtype=tf.int32, name="batch_size"
            )
            self.sequence_length_ph = tf.placeholder(
                shape=None, dtype=tf.int32, name="sequence_length"
            )
            self.mask_input = tf.placeholder(
                shape=[None], dtype=tf.float32, name="masks"
            )
            # Only needed for PPO, but also required by the BC module
            self.epsilon = tf.placeholder(
                shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
            )
            self.mask = tf.cast(self.mask_input, tf.int32)

            tf.Variable(
                int(self.brain.vector_action_space_type == "continuous"),
                name="is_continuous_control",
                trainable=False,
                dtype=tf.int32,
            )
            tf.Variable(
                self._version_number_,
                name="version_number",
                trainable=False,
                dtype=tf.int32,
            )
            tf.Variable(
                self.m_size, name="memory_size", trainable=False, dtype=tf.int32
            )
            if self.brain.vector_action_space_type == "continuous":
                tf.Variable(
                    self.act_size[0],
                    name="action_output_shape",
                    trainable=False,
                    dtype=tf.int32,
                )
            else:
                tf.Variable(
                    sum(self.act_size),
                    name="action_output_shape",
                    trainable=False,
                    dtype=tf.int32,
                )
Example No. 14
    def __init__(
        self,
        brain,
        lr=1e-4,
        lr_schedule=LearningRateSchedule.CONSTANT,
        h_size=128,
        init_entcoef=0.1,
        max_step=5e6,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        m_size=None,
        seed=0,
        stream_names=None,
        tau=0.005,
        gammas=None,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate PPO agent model for the environment.
        :param brain: BrainInfo used to generate specific network graph.
        :param lr: Learning rate.
        :param lr_schedule: Learning rate decay schedule.
        :param h_size: Size of hidden layers
        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
            set higher to explore more.
        :return: a sub-class of PPOAgent tailored to the environment.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between encoded input and policy & value layers
        :param tau: Strength of soft-Q update.
        :param m_size: Size of brain memory.
        """
        self.tau = tau
        self.gammas = gammas
        self.brain = brain
        self.init_entcoef = init_entcoef
        if stream_names is None:
            stream_names = []
        # Used to reduce "survivor bonus" when using Curiosity or GAIL.
        self.use_dones_in_backup = {
            name: tf.Variable(1.0)
            for name in stream_names
        }
        self.disable_use_dones = {
            name: self.use_dones_in_backup[name].assign(0.0)
            for name in stream_names
        }
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain,
                               seed, stream_names)
        if num_layers < 1:
            num_layers = 1

        self.target_init_op: List[tf.Tensor] = []
        self.target_update_op: List[tf.Tensor] = []
        self.update_batch_policy: Optional[tf.Operation] = None
        self.update_batch_value: Optional[tf.Operation] = None
        self.update_batch_entropy: Optional[tf.Operation] = None

        self.policy_network = SACPolicyNetwork(
            brain=brain,
            m_size=m_size,
            h_size=h_size,
            normalize=normalize,
            use_recurrent=use_recurrent,
            num_layers=num_layers,
            seed=seed,
            stream_names=stream_names,
            vis_encode_type=vis_encode_type,
        )
        self.target_network = SACTargetNetwork(
            brain=brain,
            m_size=m_size // 4 if m_size else None,
            h_size=h_size,
            normalize=normalize,
            use_recurrent=use_recurrent,
            num_layers=num_layers,
            seed=seed,
            stream_names=stream_names,
            vis_encode_type=vis_encode_type,
        )
        self.create_inputs_and_outputs()
        self.learning_rate = self.create_learning_rate(lr_schedule, lr,
                                                       self.global_step,
                                                       max_step)
        self.create_losses(
            self.policy_network.q1_heads,
            self.policy_network.q2_heads,
            lr,
            max_step,
            stream_names,
            discrete=self.brain.vector_action_space_type == "discrete",
        )

        self.selected_actions = (self.policy_network.selected_actions
                                 )  # For GAIL and other reward signals
        if normalize:
            target_update_norm = self.target_network.copy_normalization(
                self.policy_network.running_mean,
                self.policy_network.running_variance,
                self.policy_network.normalization_steps,
            )
            self.update_normalization = tf.group(
                [self.policy_network.update_normalization, target_update_norm])
            self.running_mean = self.policy_network.running_mean
            self.running_variance = self.policy_network.running_variance
            self.normalization_steps = self.policy_network.normalization_steps
Example No. 15
    def __init__(
        self,
        brain,
        h_size=128,
        lr=1e-4,
        n_layers=2,
        m_size=128,
        normalize=False,
        use_recurrent=False,
        seed=0,
    ):
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
        num_streams = 1
        hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(
            dtype=tf.float32, shape=[], name="dropout_rate"
        )
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            tf.Variable(
                self.m_size, name="memory_size", trainable=False, dtype=tf.int32
            )
            self.memory_in = tf.placeholder(
                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
            )
            hidden_reg, self.memory_out = self.create_recurrent_encoder(
                hidden_reg, self.memory_in, self.sequence_length
            )
            self.memory_out = tf.identity(self.memory_out, name="recurrent_out")

        if brain.vector_action_space_type == "discrete":
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_reg,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(0.01),
                    )
                )
            self.action_probs = tf.concat(
                [tf.nn.softmax(branch) for branch in policy_branches],
                axis=1,
                name="action_probs",
            )
            self.action_masks = tf.placeholder(
                shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
            )
            self.sample_action_float, _, normalized_logits = self.create_discrete_action_masking_layer(
                tf.concat(policy_branches, axis=1), self.action_masks, self.act_size
            )
            tf.identity(normalized_logits, name="action")
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            self.true_action = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="teacher_action",
            )
            self.action_oh = tf.concat(
                [
                    tf.one_hot(self.true_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            self.loss = tf.reduce_sum(
                -tf.log(self.action_probs + 1e-10) * self.action_oh
            )
            self.action_percent = tf.reduce_mean(
                tf.cast(
                    tf.equal(
                        tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                        self.sample_action,
                    ),
                    tf.float32,
                )
            )
        else:
            self.policy = tf.layers.dense(
                hidden_reg,
                self.act_size[0],
                activation=None,
                use_bias=False,
                name="pre_action",
                kernel_initializer=tf.initializers.variance_scaling(0.01),
            )
            self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
            self.sample_action = tf.identity(self.clipped_sample_action, name="action")
            self.true_action = tf.placeholder(
                shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action"
            )
            self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
            self.loss = tf.reduce_sum(
                tf.squared_difference(self.clipped_true_action, self.sample_action)
            )

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)