def __init__(
     self,
     brain,
     lr=1e-4,
     lr_schedule=LearningRateSchedule.LINEAR,
     h_size=128,
     epsilon=0.2,
     beta=1e-3,
     max_step=5e6,
     normalize=False,
     use_recurrent=False,
     num_layers=2,
     m_size=None,
     seed=0,
     stream_names=None,
     vis_encode_type=EncoderType.SIMPLE,
 ):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate PPO agent model for the environment.
     :param brain: BrainInfo used to generate specific network graph.
     :param lr: Learning rate.
     :param lr_schedule: Learning rate decay schedule.
     :param h_size: Size of hidden layers
     :param epsilon: Value for policy-divergence threshold.
     :param beta: Strength of entropy regularization.
     :param max_step: Total number of training steps.
     :param normalize: Whether to normalize vector observation input.
     :param use_recurrent: Whether to use an LSTM layer in the network.
      :param num_layers: Number of hidden layers between encoded input and policy & value layers.
     :param m_size: Size of brain memory.
     :param seed: Seed to use for initialization of model.
     :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
     :return: a sub-class of PPOAgent tailored to the environment.
     """
     LearningModel.__init__(self, m_size, normalize, use_recurrent, brain,
                            seed, stream_names)
     if num_layers < 1:
         num_layers = 1
     if brain.vector_action_space_type == "continuous":
         self.create_cc_actor_critic(h_size, num_layers, vis_encode_type)
         self.entropy = tf.ones_like(tf.reshape(self.value,
                                                [-1])) * self.entropy
     else:
         self.create_dc_actor_critic(h_size, num_layers, vis_encode_type)
     self.learning_rate = self.create_learning_rate(lr_schedule, lr,
                                                    self.global_step,
                                                    max_step)
     self.create_losses(
         self.log_probs,
         self.old_log_probs,
         self.value_heads,
         self.entropy,
         beta,
         epsilon,
         lr,
         max_step,
     )
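
A minimal NumPy sketch of the clipped surrogate objective that create_losses is expected to build from log_probs, old_log_probs and epsilon; the names clipped_ppo_objective and advantages below are illustrative, not attributes of the class above.

import numpy as np

def clipped_ppo_objective(log_probs, old_log_probs, advantages, epsilon=0.2):
    # r_t = pi(a|s) / pi_old(a|s), computed in log space for numerical stability.
    ratio = np.exp(log_probs - old_log_probs)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    # PPO maximizes the element-wise minimum, so the loss is its negated mean.
    return -np.mean(np.minimum(unclipped, clipped))

# Toy batch: one positive-advantage and one negative-advantage sample.
print(clipped_ppo_objective(
    log_probs=np.array([-0.9, -1.2]),
    old_log_probs=np.array([-1.0, -1.0]),
    advantages=np.array([1.5, -0.5]),
))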
Example #2
    def __init__(
        self,
        brain,
        m_size=None,
        h_size=128,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        stream_names=None,
        seed=0,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        LearningModel.__init__(
            self, m_size, normalize, use_recurrent, brain, seed, stream_names
        )
        self.normalize = normalize
        self.use_recurrent = use_recurrent
        self.num_layers = num_layers
        self.stream_names = stream_names
        self.h_size = h_size
        self.activ_fn = self.swish

        self.policy_memory_in: Optional[tf.Tensor] = None
        self.policy_memory_out: Optional[tf.Tensor] = None
        self.value_memory_in: Optional[tf.Tensor] = None
        self.value_memory_out: Optional[tf.Tensor] = None
        self.q1: Optional[tf.Tensor] = None
        self.q2: Optional[tf.Tensor] = None
        self.q1_p: Optional[tf.Tensor] = None
        self.q2_p: Optional[tf.Tensor] = None
        self.q1_memory_in: Optional[tf.Tensor] = None
        self.q2_memory_in: Optional[tf.Tensor] = None
        self.q1_memory_out: Optional[tf.Tensor] = None
        self.q2_memory_out: Optional[tf.Tensor] = None
        self.action_holder: Optional[tf.Tensor] = None
        self.prev_action: Optional[tf.Tensor] = None
        self.action_masks: Optional[tf.Tensor] = None
        self.external_action_in: Optional[tf.Tensor] = None
        self.log_sigma_sq: Optional[tf.Tensor] = None
        self.entropy: Optional[tf.Tensor] = None
        self.deterministic_output: Optional[tf.Tensor] = None
        self.all_log_probs: Optional[tf.Tensor] = None
        self.normalized_logprobs: Optional[tf.Tensor] = None
        self.action_probs: Optional[tf.Tensor] = None
        self.selected_actions: Optional[tf.Tensor] = None
        self.output: Optional[tf.Tensor] = None
        self.output_oh: Optional[tf.Tensor] = None
        self.output_pre: Optional[tf.Tensor] = None

        self.value_vars = None
        self.q_vars = None
        self.critic_vars = None
        self.policy_vars = None

        self.q1_heads: Optional[Dict[str, tf.Tensor]] = None
        self.q2_heads: Optional[Dict[str, tf.Tensor]] = None
        self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None
        self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None
Example #3
 def __init__(self,
              brain,
              lr=1e-4,
              h_size=128,
              epsilon=0.2,
              beta=1e-3,
              max_step=5e6,
              normalize=False,
              use_recurrent=False,
              num_layers=2,
              m_size=None,
              use_curiosity=False,
              curiosity_strength=0.01,
              curiosity_enc_size=128,
              scope='Model',
              seed=0):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate PPO agent model for the environment.
     :param brain: BrainInfo used to generate specific network graph.
     :param lr: Learning rate.
     :param h_size: Size of hidden layers
     :param epsilon: Value for policy-divergence threshold.
     :param beta: Strength of entropy regularization.
      :param max_step: Total number of training steps.
      :param normalize: Whether to normalize vector observation input.
      :param use_recurrent: Whether to use an LSTM layer in the network.
      :param num_layers: Number of hidden layers between encoded input and policy & value layers.
      :param m_size: Size of brain memory.
      :return: a sub-class of PPOAgent tailored to the environment.
      """
     with tf.variable_scope(scope):
         LearningModel.__init__(self, m_size, normalize, use_recurrent,
                                brain, seed)
         self.use_curiosity = use_curiosity
         if num_layers < 1:
             num_layers = 1
          self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
         if brain.vector_action_space_type == "continuous":
             self.create_cc_actor_critic(h_size, num_layers)
             self.entropy = tf.ones_like(tf.reshape(self.value,
                                                    [-1])) * self.entropy
         else:
             self.create_dc_actor_critic(h_size, num_layers)
         if self.use_curiosity:
             self.curiosity_enc_size = curiosity_enc_size
             self.curiosity_strength = curiosity_strength
              encoded_state, encoded_next_state = self.create_curiosity_encoders()
             self.create_inverse_model(encoded_state, encoded_next_state)
             self.create_forward_model(encoded_state, encoded_next_state)
         self.create_ppo_optimizer(self.log_probs, self.old_log_probs,
                                   self.value, self.entropy, beta, epsilon,
                                   lr, max_step)
Example #4
def test_min_visual_size():
    # Make sure each EncoderType has an entry in MIN_RESOLUTION_FOR_ENCODER
    assert set(
        LearningModel.MIN_RESOLUTION_FOR_ENCODER.keys()) == set(EncoderType)

    for encoder_type in EncoderType:
        with tf.Graph().as_default():
            good_size = LearningModel.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
            good_res = CameraResolution(width=good_size,
                                        height=good_size,
                                        num_channels=3)
            LearningModel._check_resolution_for_encoder(good_res, encoder_type)
            vis_input = LearningModel.create_visual_input(
                good_res, "test_min_visual_size")
            enc_func = LearningModel.get_encoder_for_type(encoder_type)
            enc_func(vis_input, 32, LearningModel.swish, 1, "test", False)

        # Anything under the min size should raise an exception. If not, decrease the min size!
        with pytest.raises(Exception):
            with tf.Graph().as_default():
                bad_size = LearningModel.MIN_RESOLUTION_FOR_ENCODER[
                    encoder_type] - 1
                bad_res = CameraResolution(width=bad_size,
                                           height=bad_size,
                                           num_channels=3)

                with pytest.raises(UnityTrainerException):
                    # Make sure we'd hit a friendly error during model setup time.
                    LearningModel._check_resolution_for_encoder(
                        bad_res, encoder_type)

                vis_input = LearningModel.create_visual_input(
                    bad_res, "test_min_visual_size")
                enc_func = LearningModel.get_encoder_for_type(encoder_type)
                enc_func(vis_input, 32, LearningModel.swish, 1, "test", False)
Example #5
    def __init__(self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128,
                 normalize=False, use_recurrent=False, seed=0):
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
        num_streams = 1
        hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
            self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name='recurrent_in')
            hidden_reg, self.memory_out = self.create_recurrent_encoder(hidden_reg, self.memory_in,
                                                                        self.sequence_length)
            self.memory_out = tf.identity(self.memory_out, name='recurrent_out')

        if brain.vector_action_space_type == "discrete":
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01)))
            self.action_probs = tf.concat(
                [tf.nn.softmax(branch) for branch in policy_branches], axis=1, name="action_probs")
            self.action_masks = tf.placeholder(shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks")
            self.sample_action_float, normalized_logits = self.create_discrete_action_masking_layer(
                tf.concat(policy_branches, axis=1), self.action_masks, self.act_size)
            tf.identity(normalized_logits, name='action')
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            self.true_action = tf.placeholder(shape=[None, len(policy_branches)], dtype=tf.int32, name="teacher_action")
            self.action_oh = tf.concat([
                tf.one_hot(self.true_action[:, i], self.act_size[i]) for i in range(len(self.act_size))], axis=1)
            self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) * self.action_oh)
            self.action_percent = tf.reduce_mean(tf.cast(
                tf.equal(tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32), self.sample_action), tf.float32))
        else:
            self.policy = tf.layers.dense(hidden_reg, self.act_size[0], activation=None, use_bias=False, name='pre_action',
                                          kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
            self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
            self.sample_action = tf.identity(self.clipped_sample_action, name="action")
            self.true_action = tf.placeholder(shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action")
            self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
            self.loss = tf.reduce_sum(tf.squared_difference(self.clipped_true_action, self.sample_action))

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
Example #6
    def create_encoder(
        self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """
        Creates the encoder for the discriminator
        :param state_in: The encoded observation input
        :param action_in: The action input
        :param done_in: The done flags input
        :param reuse: If true, the weights will be shared with the previous encoder created
        """
        with tf.variable_scope("GAIL_model"):
            if self.use_actions:
                concat_input = tf.concat([state_in, action_in, done_in], axis=1)
            else:
                concat_input = state_in

            hidden_1 = tf.layers.dense(
                concat_input,
                self.h_size,
                activation=LearningModel.swish,
                name="gail_d_hidden_1",
                reuse=reuse,
            )

            hidden_2 = tf.layers.dense(
                hidden_1,
                self.h_size,
                activation=LearningModel.swish,
                name="gail_d_hidden_2",
                reuse=reuse,
            )

            z_mean = None
            if self.use_vail:
                # Latent representation
                z_mean = tf.layers.dense(
                    hidden_2,
                    self.z_size,
                    reuse=reuse,
                    name="gail_z_mean",
                    kernel_initializer=LearningModel.scaled_init(0.01),
                )

                self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32)

                # Sampled latent code
                self.z = z_mean + self.z_sigma * self.noise * self.use_noise
                estimate_input = self.z
            else:
                estimate_input = hidden_2

            estimate = tf.layers.dense(
                estimate_input,
                1,
                activation=tf.nn.sigmoid,
                name="gail_d_estimate",
                reuse=reuse,
            )
            return estimate, z_mean, concat_input
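
A hedged standalone sketch of the discriminator loss the sigmoid estimate above typically feeds in GAIL: expert outputs are pushed toward 1 and policy outputs toward 0. The function name and inputs are illustrative, not part of the class above.

import numpy as np

def gail_discriminator_loss(expert_estimate, policy_estimate, eps=1e-7):
    # Binary cross-entropy with expert samples labeled 1 and policy samples labeled 0,
    # applied to the discriminator's sigmoid outputs.
    return -np.mean(np.log(expert_estimate + eps)
                    + np.log(1.0 - policy_estimate + eps))

# Toy usage with a batch of three discriminator outputs in (0, 1).
print(gail_discriminator_loss(np.array([0.9, 0.8, 0.95]),
                              np.array([0.2, 0.1, 0.3])))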
Example #7
 def __init__(
     self,
     brain,
     m_size=None,
     h_size=128,
     normalize=False,
     use_recurrent=False,
     num_layers=2,
     stream_names=None,
     seed=0,
     vis_encode_type=EncoderType.SIMPLE,
 ):
     LearningModel.__init__(self, m_size, normalize, use_recurrent, brain,
                            seed, stream_names)
     self.normalize = normalize
     self.use_recurrent = use_recurrent
     self.num_layers = num_layers
     self.stream_names = stream_names
     self.h_size = h_size
     self.activ_fn = self.swish
Example #8
    def __init__(
        self,
        brain,
        lr=1e-4,
        lr_schedule=LearningRateSchedule.CONSTANT,
        h_size=128,
        init_entcoef=0.1,
        max_step=5e6,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        m_size=None,
        seed=0,
        stream_names=None,
        tau=0.005,
        gammas=None,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate SAC model for the environment.
        :param brain: BrainInfo used to generate specific network graph.
        :param lr: Learning rate.
        :param lr_schedule: Learning rate decay schedule.
        :param h_size: Size of hidden layers
        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
            set higher to explore more.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between encoded input and policy & value layers.
        :param tau: Strength of soft-Q update.
        :param m_size: Size of brain memory.
        :return: a SAC model tailored to the environment.
        """
        self.tau = tau
        self.gammas = gammas
        self.brain = brain
        self.init_entcoef = init_entcoef
        if stream_names is None:
            stream_names = []
        # Use to reduce "survivor bonus" when using Curiosity or GAIL.
        self.use_dones_in_backup = {
            name: tf.Variable(1.0)
            for name in stream_names
        }
        self.disable_use_dones = {
            name: self.use_dones_in_backup[name].assign(0.0)
            for name in stream_names
        }
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain,
                               seed, stream_names)
        if num_layers < 1:
            num_layers = 1

        self.target_init_op: List[tf.Tensor] = []
        self.target_update_op: List[tf.Tensor] = []
        self.update_batch_policy: Optional[tf.Operation] = None
        self.update_batch_value: Optional[tf.Operation] = None
        self.update_batch_entropy: Optional[tf.Operation] = None

        self.policy_network = SACPolicyNetwork(
            brain=brain,
            m_size=m_size,
            h_size=h_size,
            normalize=normalize,
            use_recurrent=use_recurrent,
            num_layers=num_layers,
            seed=seed,
            stream_names=stream_names,
            vis_encode_type=vis_encode_type,
        )
        self.target_network = SACTargetNetwork(
            brain=brain,
            m_size=m_size // 4 if m_size else None,
            h_size=h_size,
            normalize=normalize,
            use_recurrent=use_recurrent,
            num_layers=num_layers,
            seed=seed,
            stream_names=stream_names,
            vis_encode_type=vis_encode_type,
        )
        self.create_inputs_and_outputs()
        self.learning_rate = self.create_learning_rate(lr_schedule, lr,
                                                       self.global_step,
                                                       max_step)
        self.create_losses(
            self.policy_network.q1_heads,
            self.policy_network.q2_heads,
            lr,
            max_step,
            stream_names,
            discrete=self.brain.vector_action_space_type == "discrete",
        )

        # For GAIL and other reward signals
        self.selected_actions = self.policy_network.selected_actions
        if normalize:
            target_update_norm = self.target_network.copy_normalization(
                self.policy_network.running_mean,
                self.policy_network.running_variance,
                self.policy_network.normalization_steps,
            )
            self.update_normalization = tf.group(
                [self.policy_network.update_normalization, target_update_norm])
            self.running_mean = self.policy_network.running_mean
            self.running_variance = self.policy_network.running_variance
            self.normalization_steps = self.policy_network.normalization_steps
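
A small standalone sketch of the Polyak (soft) target update that the tau argument above controls; the helper below is illustrative and not the update ops built elsewhere in this class.

import numpy as np

def soft_update(target_params, source_params, tau=0.005):
    # target <- (1 - tau) * target + tau * source, applied element-wise
    # to each pair of corresponding parameter arrays.
    return [(1.0 - tau) * t + tau * s
            for t, s in zip(target_params, source_params)]

target = [np.zeros(3)]
source = [np.ones(3)]
print(soft_update(target, source, tau=0.005))  # -> [array([0.005, 0.005, 0.005])]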
Example #9
    def create_cc_actor(self, hidden_policy, scope):
        """
        Creates Continuous control actor for SAC.
        :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs).
        :param scope: TF scope to assign whatever is created in this block.
        """
        # Create action input (continuous)
        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")
        self.external_action_in = self.action_holder

        scope = self.join_scopes(scope, "policy")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            # Policy-dependent log_sigma_sq
            log_sigma_sq = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="log_std",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN,
                                                 LOG_STD_MAX)

            sigma_sq = tf.exp(self.log_sigma_sq)

            # Do the reparameterization trick
            policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq

            _gauss_pre = -0.5 * (((policy_ - mu) /
                                  (tf.exp(self.log_sigma_sq) + EPSILON))**2 +
                                 2 * self.log_sigma_sq + np.log(2 * np.pi))

            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

            self.entropy = tf.reduce_sum(self.log_sigma_sq +
                                         0.5 * np.log(2.0 * np.pi * np.e),
                                         axis=-1)

            # Squash probabilities
            # Keep deterministic around in case we want to use it.
            self.deterministic_output = tf.tanh(mu)

            # Note that this is just for symmetry with PPO.
            self.output_pre = tf.tanh(policy_)

            # Squash correction
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)

            self.all_log_probs = all_probs
            self.selected_actions = tf.stop_gradient(self.output_pre)

            self.action_probs = all_probs

        # Extract output for Barracuda
        self.output = tf.identity(self.output_pre, name="action")

        # Get all policy vars
        self.policy_vars = self.get_vars(scope)
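
A NumPy sketch of the squashed-Gaussian log-probability computed above: the Gaussian log-density of the pre-squash sample u, minus the tanh change-of-variables term sum(log(1 - tanh(u)^2 + eps)). It mirrors the TF expressions in create_cc_actor; the function and inputs are illustrative.

import numpy as np

EPSILON = 1e-7

def squashed_gaussian_log_prob(mu, log_sigma_sq, u):
    # Log-density of the pre-squash sample u, written the same way as _gauss_pre above.
    gauss = -0.5 * (((u - mu) / (np.exp(log_sigma_sq) + EPSILON)) ** 2
                    + 2 * log_sigma_sq + np.log(2 * np.pi))
    log_prob = np.sum(gauss, axis=1, keepdims=True)
    # Squash correction for the change of variables a = tanh(u).
    a = np.tanh(u)
    log_prob -= np.sum(np.log(1 - a ** 2 + EPSILON), axis=1, keepdims=True)
    return log_prob

print(squashed_gaussian_log_prob(mu=np.zeros((1, 2)),
                                 log_sigma_sq=np.zeros((1, 2)),
                                 u=np.array([[0.3, -0.8]])))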
Example #10
    def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Creates state encoders for current and future observations.
        Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction
        See https://arxiv.org/abs/1705.05363 for more details.
        :return: current and future state encoder tensors.
        """
        encoded_state_list = []
        encoded_next_state_list = []

        if self.policy_model.vis_obs_size > 0:
            self.next_visual_in = []
            visual_encoders = []
            next_visual_encoders = []
            for i in range(self.policy_model.vis_obs_size):
                # Create input ops for next (t+1) visual observations.
                next_visual_input = LearningModel.create_visual_input(
                    self.policy_model.brain.camera_resolutions[i],
                    name="curiosity_next_visual_observation_" + str(i),
                )
                self.next_visual_in.append(next_visual_input)

                # Create the encoder ops for current and next visual input.
                # Note that these encoders are siamese.
                encoded_visual = self.policy_model.create_visual_observation_encoder(
                    self.policy_model.visual_in[i],
                    self.encoding_size,
                    LearningModel.swish,
                    1,
                    "curiosity_stream_{}_visual_obs_encoder".format(i),
                    False,
                )

                encoded_next_visual = self.policy_model.create_visual_observation_encoder(
                    self.next_visual_in[i],
                    self.encoding_size,
                    LearningModel.swish,
                    1,
                    "curiosity_stream_{}_visual_obs_encoder".format(i),
                    True,
                )
                visual_encoders.append(encoded_visual)
                next_visual_encoders.append(encoded_next_visual)

            hidden_visual = tf.concat(visual_encoders, axis=1)
            hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
            encoded_state_list.append(hidden_visual)
            encoded_next_state_list.append(hidden_next_visual)

        if self.policy_model.vec_obs_size > 0:
            # Create the encoder ops for current and next vector input.
            # Note that these encoders are siamese.
            # Create input op for next (t+1) vector observation.
            self.next_vector_in = tf.placeholder(
                shape=[None, self.policy_model.vec_obs_size],
                dtype=tf.float32,
                name="curiosity_next_vector_observation",
            )

            encoded_vector_obs = self.policy_model.create_vector_observation_encoder(
                self.policy_model.vector_in,
                self.encoding_size,
                LearningModel.swish,
                2,
                "curiosity_vector_obs_encoder",
                False,
            )
            encoded_next_vector_obs = self.policy_model.create_vector_observation_encoder(
                self.next_vector_in,
                self.encoding_size,
                LearningModel.swish,
                2,
                "curiosity_vector_obs_encoder",
                True,
            )
            encoded_state_list.append(encoded_vector_obs)
            encoded_next_state_list.append(encoded_next_vector_obs)

        encoded_state = tf.concat(encoded_state_list, axis=1)
        encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
        return encoded_state, encoded_next_state
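
A hedged NumPy sketch of the forward-model loss that create_forward_model typically builds from the encoded_next_state returned above; the per-sample squared error (scaled by curiosity_strength) is what becomes the intrinsic reward. Names below are illustrative, not the library's implementation.

import numpy as np

def curiosity_forward_loss(encoded_next_state, predicted_next_state):
    # Prediction error in the encoded feature space, per sample and averaged.
    per_sample = 0.5 * np.sum(
        (predicted_next_state - encoded_next_state) ** 2, axis=1)
    return np.mean(per_sample), per_sample

# Toy usage: batch of 2 encoded next-states of size 3.
encoded_next = np.array([[0.1, 0.2, 0.3], [0.0, 0.0, 1.0]])
predicted_next = np.array([[0.1, 0.1, 0.4], [0.2, 0.0, 0.8]])
mean_loss, per_sample_error = curiosity_forward_loss(encoded_next, predicted_next)
print(mean_loss, per_sample_error)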
Example #11
    def create_cc_actor_critic(self, h_size: int, num_layers: int,
                               vis_encode_type: EncoderType) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        """
        hidden_streams = self.create_observation_streams(
            2, h_size, num_layers, vis_encode_type)

        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            _half_point = int(self.m_size / 2)
            hidden_policy, memory_policy_out = self.create_recurrent_encoder(
                hidden_streams[0],
                self.memory_in[:, :_half_point],
                self.sequence_length,
                name="lstm_policy",
            )

            hidden_value, memory_value_out = self.create_recurrent_encoder(
                hidden_streams[1],
                self.memory_in[:, _half_point:],
                self.sequence_length,
                name="lstm_value",
            )
            self.memory_out = tf.concat([memory_policy_out, memory_value_out],
                                        axis=1,
                                        name="recurrent_out")
        else:
            hidden_policy = hidden_streams[0]
            hidden_value = hidden_streams[1]

        mu = tf.layers.dense(
            hidden_policy,
            self.act_size[0],
            activation=None,
            kernel_initializer=LearningModel.scaled_init(0.01),
            reuse=tf.AUTO_REUSE,
        )

        self.log_sigma_sq = tf.get_variable(
            "log_sigma_squared",
            [self.act_size[0]],
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
        )

        sigma_sq = tf.exp(self.log_sigma_sq)

        self.epsilon = tf.placeholder(shape=[None, self.act_size[0]],
                                      dtype=tf.float32,
                                      name="epsilon")
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")
        self.selected_actions = tf.stop_gradient(output_post)

        # Compute probability of model output.
        all_probs = (-0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) /
                     sigma_sq - 0.5 * tf.log(2.0 * np.pi) -
                     0.5 * self.log_sigma_sq)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        self.entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + self.log_sigma_sq)

        self.create_value_heads(self.stream_names, hidden_value)

        self.all_old_log_probs = tf.placeholder(shape=[None, self.act_size[0]],
                                                dtype=tf.float32,
                                                name="old_probabilities")

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)
        self.old_log_probs = tf.reduce_sum(
            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True)
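
A short NumPy check of the diagonal-Gaussian entropy expression used above, 0.5 * mean(log(2*pi*e) + log_sigma_sq); illustrative only.

import numpy as np

def gaussian_entropy(log_sigma_sq):
    # Mirrors the TF expression: 0.5 * reduce_mean(log(2*pi*e) + log_sigma_sq).
    return 0.5 * np.mean(np.log(2 * np.pi * np.e) + log_sigma_sq)

# With log_sigma_squared initialized to zeros, as in the snippet,
# this evaluates to 0.5 * log(2*pi*e) ~= 1.4189.
print(gaussian_entropy(np.zeros(2)))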
Example #12
    def create_dc_actor_critic(self, h_size: int, num_layers: int,
                               vis_encode_type: EncoderType) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        """
        hidden_streams = self.create_observation_streams(
            1, h_size, num_layers, vis_encode_type)
        hidden = hidden_streams[0]

        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden = tf.concat([hidden, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden, memory_out = self.create_recurrent_encoder(
                hidden, self.memory_in, self.sequence_length)
            self.memory_out = tf.identity(memory_out, name="recurrent_out")

        policy_branches = []
        for size in self.act_size:
            policy_branches.append(
                tf.layers.dense(
                    hidden,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=LearningModel.scaled_init(0.01),
                ))

        self.all_log_probs = tf.concat(policy_branches,
                                       axis=1,
                                       name="action_probs")

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")
        output, _, normalized_logits = self.create_discrete_action_masking_layer(
            self.all_log_probs, self.action_masks, self.act_size)

        self.output = tf.identity(output)
        self.normalized_logits = tf.identity(normalized_logits, name="action")

        self.create_value_heads(self.stream_names, hidden)

        self.action_holder = tf.placeholder(shape=[None,
                                                   len(policy_branches)],
                                            dtype=tf.int32,
                                            name="action_holder")
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.act_size)],
            dtype=tf.float32,
            name="old_probabilities")
        _, _, old_normalized_logits = self.create_discrete_action_masking_layer(
            self.all_old_log_probs, self.action_masks, self.act_size)

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            (tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:,
                                               action_idx[i]:action_idx[i +
                                                                        1]]),
                        logits=self.all_log_probs[:,
                                                  action_idx[i]:action_idx[i +
                                                                           1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
        )

        self.log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:,
                                              action_idx[i]:action_idx[i + 1]],
                        logits=normalized_logits[:,
                                                 action_idx[i]:action_idx[i +
                                                                          1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
        self.old_log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:,
                                              action_idx[i]:action_idx[i + 1]],
                        logits=old_normalized_logits[:, action_idx[i]:
                                                     action_idx[i + 1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
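
A NumPy sketch of the identity relied on above for log_probs and old_log_probs: with one-hot labels, the negative of softmax_cross_entropy_with_logits equals the log-probability of the selected action under softmax(logits). Names are illustrative.

import numpy as np

def neg_softmax_cross_entropy(one_hot_labels, logits):
    # Stable log-softmax.
    log_probs = logits - np.max(logits, axis=1, keepdims=True)
    log_probs -= np.log(np.sum(np.exp(log_probs), axis=1, keepdims=True))
    # With one-hot labels this picks out log pi(selected action) per sample.
    return np.sum(one_hot_labels * log_probs, axis=1)

logits = np.array([[2.0, 0.5, -1.0]])
one_hot = np.array([[1.0, 0.0, 0.0]])
print(neg_softmax_cross_entropy(one_hot, logits))
print(np.log(np.exp(logits[0, 0]) / np.sum(np.exp(logits[0]))))  # same value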