예제 #1
0
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy.
        """
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy")

            self.memory_out = tf.identity(memory_policy_out,
                                          name="recurrent_out")
        else:
            hidden_policy = encoded

        with tf.variable_scope("policy"):
            distribution = GaussianDistribution(
                hidden_policy,
                self.act_size,
                reparameterize=reparameterize,
                tanh_squash=tanh_squash,
                condition_sigma=condition_sigma_on_obs,
            )

        if tanh_squash:
            self.output_pre = distribution.sample
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = distribution.sample
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(distribution.log_probs,
                                         name="action_probs")
        self.entropy = distribution.entropy

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.total_log_probs = distribution.total_log_probs
예제 #2
0
    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        """
        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                hidden_policy,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy",
            )

            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
        else:
            hidden_policy = encoded

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")

        with tf.variable_scope("policy"):
            distribution = MultiCategoricalDistribution(
                hidden_policy, self.act_size, self.action_masks)
        # It's important that we are able to feed_dict a value into this tensor to get the
        # right one-hot encoding, so we can't do identity on it.
        self.output = distribution.sample
        self.all_log_probs = tf.identity(distribution.log_probs, name="action")
        self.selected_actions = tf.stop_gradient(
            distribution.sample_onehot)  # In discrete, these are onehot
        self.entropy = distribution.entropy
        self.total_log_probs = distribution.total_log_probs
예제 #3
0
    def _create_inputs_and_outputs(self) -> None:
        """
        Assign the higher-level SACModel's inputs and outputs to those of its policy or
        target network.
        """
        self.vector_in = self.policy.vector_in
        self.visual_in = self.policy.visual_in
        self.next_vector_in = self.target_network.vector_in
        self.next_visual_in = self.target_network.visual_in
        self.sequence_length_ph = self.policy.sequence_length_ph
        self.next_sequence_length_ph = self.target_network.sequence_length_ph
        if not self.policy.use_continuous_act:
            self.action_masks = self.policy_network.action_masks
        else:
            self.output_pre = self.policy_network.output_pre

        # Don't use value estimate during inference.
        self.value = tf.identity(self.policy_network.value,
                                 name="value_estimate_unused")
        self.value_heads = self.policy_network.value_heads
        self.dones_holder = tf.placeholder(shape=[None],
                                           dtype=tf.float32,
                                           name="dones_holder")

        if self.policy.use_recurrent:
            self.memory_in = self.policy_network.memory_in
            self.memory_out = self.policy_network.memory_out
            if not self.policy.use_continuous_act:
                self.prev_action = self.policy_network.prev_action
            self.next_memory_in = self.target_network.memory_in
예제 #4
0
    def create_inputs_and_outputs(self):
        """
        Assign the higher-level SACModel's inputs and outputs to those of its policy or
        target network.
        """
        self.vector_in = self.policy_network.vector_in
        self.visual_in = self.policy_network.visual_in
        self.next_vector_in = self.target_network.vector_in
        self.next_visual_in = self.target_network.visual_in
        self.action_holder = self.policy_network.action_holder
        self.sequence_length = self.policy_network.sequence_length
        self.next_sequence_length = self.target_network.sequence_length
        if self.brain.vector_action_space_type == "discrete":
            self.action_masks = self.policy_network.action_masks
        else:
            self.output_pre = self.policy_network.output_pre

        self.output = self.policy_network.output
        # Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference.
        self.value = tf.identity(
            self.policy_network.value, name="value_estimate_unused"
        )
        self.value_heads = self.policy_network.value_heads
        self.all_log_probs = self.policy_network.all_log_probs
        self.dones_holder = tf.placeholder(
            shape=[None], dtype=tf.float32, name="dones_holder"
        )
        # This is just a dummy to get pretraining to work. PPO has this but SAC doesn't.
        # TODO: Proper input and output specs for models
        self.epsilon = tf.placeholder(
            shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
        )
        if self.use_recurrent:
            self.memory_in = self.policy_network.memory_in
            self.memory_out = self.policy_network.memory_out

            # For Barracuda
            self.inference_memory_out = tf.identity(
                self.policy_network.policy_memory_out, name="recurrent_out"
            )

            if self.brain.vector_action_space_type == "discrete":
                self.prev_action = self.policy_network.prev_action
            self.next_memory_in = self.target_network.memory_in
예제 #5
0
    def _create_cc_critic(
        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
    ) -> None:
        """
        Creates Continuous control critic (value) network.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: The type of visual encoder to use.
        """
        hidden_stream = ModelUtils.create_observation_streams(
            self.policy.visual_in,
            self.policy.processed_vector_in,
            1,
            h_size,
            num_layers,
            vis_encode_type,
        )[0]

        if self.policy.use_recurrent:
            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
                hidden_stream,
                self.memory_in,
                self.policy.sequence_length_ph,
                name="lstm_value",
            )
            self.memory_out = memory_value_out
        else:
            hidden_value = hidden_stream

        self.value_heads, self.value = ModelUtils.create_value_heads(
            self.stream_names, hidden_value
        )
        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.policy.act_size)],
            dtype=tf.float32,
            name="old_probabilities",
        )

        self.old_log_probs = tf.reduce_sum(
            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
        )
예제 #6
0
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)

        if self.policy.behavior_spec.action_spec.is_continuous():
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(shape=[None, action_length],
                                                   dtype=tf.float32)
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(shape=[None, action_length],
                                                   dtype=tf.int32)
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )
예제 #7
0
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

        if self.policy.brain.vector_action_space_type == "continuous":
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

        encoded_policy_list = []
        encoded_expert_list = []

        if self.policy.vec_obs_size > 0:
            self.obs_in_expert = tf.placeholder(
                shape=[None, self.policy.vec_obs_size], dtype=tf.float32
            )
            if self.policy.normalize:
                encoded_expert_list.append(
                    ModelUtils.normalize_vector_obs(
                        self.obs_in_expert,
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                )
                encoded_policy_list.append(self.policy.processed_vector_in)
            else:
                encoded_expert_list.append(self.obs_in_expert)
                encoded_policy_list.append(self.policy.vector_in)

        if self.policy.vis_obs_size > 0:
            self.expert_visual_in: List[tf.Tensor] = []
            visual_policy_encoders = []
            visual_expert_encoders = []
            for i in range(self.policy.vis_obs_size):
                # Create input ops for next (t+1) visual observations.
                visual_input = ModelUtils.create_visual_input(
                    self.policy.brain.camera_resolutions[i],
                    name="gail_visual_observation_" + str(i),
                )
                self.expert_visual_in.append(visual_input)

                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                    self.policy.visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "gail_stream_{}_visual_obs_encoder".format(i),
                    False,
                )

                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                    self.expert_visual_in[i],
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    "gail_stream_{}_visual_obs_encoder".format(i),
                    True,
                )
                visual_policy_encoders.append(encoded_policy_visual)
                visual_expert_encoders.append(encoded_expert_visual)
            hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
            hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
            encoded_policy_list.append(hidden_policy_visual)
            encoded_expert_list.append(hidden_expert_visual)

        self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
        self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
예제 #8
0
    def create_dc_actor(self, hidden_policy, scope):
        """
        Creates Discrete control actor for SAC.
        :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs).
        :param num_layers: TF scope to assign whatever is created in this block.
        """
        scope = self.join_scopes(scope, "policy")

        # Create inputs outside of the scope
        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")

        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([hidden_policy, prev_action_oh], axis=1)

            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_policy,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(
                            0.01),
                    ))
            all_logits = tf.concat(policy_branches,
                                   axis=1,
                                   name="action_probs")

            output, normalized_probs, normalized_logprobs = self.create_discrete_action_masking_layer(
                all_logits, self.action_masks, self.act_size)

            self.action_probs = normalized_probs

            # Really, this is entropy, but it has an analogous purpose to the log probs in the
            # continuous case.
            self.all_log_probs = self.action_probs * normalized_logprobs
            self.output = output

            # Create action input (discrete)
            self.action_holder = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="action_holder")

            self.output_oh = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # For Curiosity and GAIL to retrieve selected actions. We don't
            # need the mask at this point because it's already stored in the buffer.
            self.selected_actions = tf.stop_gradient(self.output_oh)

            self.external_action_in = tf.concat(
                [
                    tf.one_hot(self.action_holder[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )

            # This is total entropy over all branches
            self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1)

        # Extract the normalized logprobs for Barracuda
        self.normalized_logprobs = tf.identity(normalized_logprobs,
                                               name="action")

        # We kept the LSTMs at a different scope than the rest, so add them if they exist.
        self.policy_vars = self.get_vars(scope)
        if self.use_recurrent:
            self.policy_vars += self.get_vars("lstm")
예제 #9
0
    def create_cc_actor(self, hidden_policy, scope):
        """
        Creates Continuous control actor for SAC.
        :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs).
        :param num_layers: TF scope to assign whatever is created in this block.
        """
        # Create action input (continuous)
        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")
        self.external_action_in = self.action_holder

        scope = self.join_scopes(scope, "policy")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )
        if self.use_recurrent:
            hidden_policy, memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )
            self.policy_memory_out = memory_out
        with tf.variable_scope(scope):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            # Policy-dependent log_sigma_sq
            log_sigma_sq = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="log_std",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )

            self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN,
                                                 LOG_STD_MAX)

            sigma_sq = tf.exp(self.log_sigma_sq)

            # Do the reparameterization trick
            policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq

            _gauss_pre = -0.5 * (((policy_ - mu) /
                                  (tf.exp(self.log_sigma_sq) + EPSILON))**2 +
                                 2 * self.log_sigma_sq + np.log(2 * np.pi))

            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

            self.entropy = tf.reduce_sum(self.log_sigma_sq +
                                         0.5 * np.log(2.0 * np.pi * np.e),
                                         axis=-1)

            # Squash probabilities
            # Keep deterministic around in case we want to use it.
            self.deterministic_output = tf.tanh(mu)

            # Note that this is just for symmetry with PPO.
            self.output_pre = tf.tanh(policy_)

            # Squash correction
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)

            self.all_log_probs = all_probs
            self.selected_actions = tf.stop_gradient(self.output_pre)

            self.action_probs = all_probs

        # Extract output for Barracuda
        self.output = tf.identity(self.output_pre, name="action")

        # Get all policy vars
        self.policy_vars = self.get_vars(scope)
예제 #10
0
    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

        if self.policy.behavior_spec.is_action_continuous():
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

        encoded_policy_list = []
        encoded_expert_list = []

        (
            self.obs_in_expert,
            self.expert_visual_in,
        ) = ModelUtils.create_input_placeholders(
            self.policy.behavior_spec.observation_shapes, "gail_"
        )

        if self.policy.vec_obs_size > 0:
            if self.policy.normalize:
                encoded_expert_list.append(
                    ModelUtils.normalize_vector_obs(
                        self.obs_in_expert,
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                )
                encoded_policy_list.append(self.policy.processed_vector_in)
            else:
                encoded_expert_list.append(self.obs_in_expert)
                encoded_policy_list.append(self.policy.vector_in)

        if self.expert_visual_in:
            visual_policy_encoders = []
            visual_expert_encoders = []
            for i, (vis_in, exp_vis_in) in enumerate(
                zip(self.policy.visual_in, self.expert_visual_in)
            ):
                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                    vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    False,
                )

                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                    exp_vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    True,
                )
                visual_policy_encoders.append(encoded_policy_visual)
                visual_expert_encoders.append(encoded_expert_visual)
            hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
            hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
            encoded_policy_list.append(hidden_policy_visual)
            encoded_expert_list.append(hidden_expert_visual)

        self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
        self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
예제 #11
0
    def __init__(
        self,
        brain,
        h_size=128,
        lr=1e-4,
        n_layers=2,
        m_size=128,
        normalize=False,
        use_recurrent=False,
        seed=0,
    ):
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
        num_streams = 1
        hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(
            dtype=tf.float32, shape=[], name="dropout_rate"
        )
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            tf.Variable(
                self.m_size, name="memory_size", trainable=False, dtype=tf.int32
            )
            self.memory_in = tf.placeholder(
                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
            )
            hidden_reg, self.memory_out = self.create_recurrent_encoder(
                hidden_reg, self.memory_in, self.sequence_length
            )
            self.memory_out = tf.identity(self.memory_out, name="recurrent_out")

        if brain.vector_action_space_type == "discrete":
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_reg,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(0.01),
                    )
                )
            self.action_probs = tf.concat(
                [tf.nn.softmax(branch) for branch in policy_branches],
                axis=1,
                name="action_probs",
            )
            self.action_masks = tf.placeholder(
                shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
            )
            self.sample_action_float, _, normalized_logits = self.create_discrete_action_masking_layer(
                tf.concat(policy_branches, axis=1), self.action_masks, self.act_size
            )
            tf.identity(normalized_logits, name="action")
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            self.true_action = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="teacher_action",
            )
            self.action_oh = tf.concat(
                [
                    tf.one_hot(self.true_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            self.loss = tf.reduce_sum(
                -tf.log(self.action_probs + 1e-10) * self.action_oh
            )
            self.action_percent = tf.reduce_mean(
                tf.cast(
                    tf.equal(
                        tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                        self.sample_action,
                    ),
                    tf.float32,
                )
            )
        else:
            self.policy = tf.layers.dense(
                hidden_reg,
                self.act_size[0],
                activation=None,
                use_bias=False,
                name="pre_action",
                kernel_initializer=tf.initializers.variance_scaling(0.01),
            )
            self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
            self.sample_action = tf.identity(self.clipped_sample_action, name="action")
            self.true_action = tf.placeholder(
                shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action"
            )
            self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
            self.loss = tf.reduce_sum(
                tf.squared_difference(self.clipped_true_action, self.sample_action)
            )

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
예제 #12
0
    def create_cc_actor_critic(self, h_size: int, num_layers: int,
                               vis_encode_type: EncoderType) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        """
        hidden_streams = self.create_observation_streams(
            2, h_size, num_layers, vis_encode_type)

        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            _half_point = int(self.m_size / 2)
            hidden_policy, memory_policy_out = self.create_recurrent_encoder(
                hidden_streams[0],
                self.memory_in[:, :_half_point],
                self.sequence_length,
                name="lstm_policy",
            )

            hidden_value, memory_value_out = self.create_recurrent_encoder(
                hidden_streams[1],
                self.memory_in[:, _half_point:],
                self.sequence_length,
                name="lstm_value",
            )
            self.memory_out = tf.concat([memory_policy_out, memory_value_out],
                                        axis=1,
                                        name="recurrent_out")
        else:
            hidden_policy = hidden_streams[0]
            hidden_value = hidden_streams[1]

        mu = tf.layers.dense(
            hidden_policy,
            self.act_size[0],
            activation=None,
            kernel_initializer=LearningModel.scaled_init(0.01),
            reuse=tf.AUTO_REUSE,
        )

        self.log_sigma_sq = tf.get_variable(
            "log_sigma_squared",
            [self.act_size[0]],
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
        )

        sigma_sq = tf.exp(self.log_sigma_sq)

        self.epsilon = tf.placeholder(shape=[None, self.act_size[0]],
                                      dtype=tf.float32,
                                      name="epsilon")
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")
        self.selected_actions = tf.stop_gradient(output_post)

        # Compute probability of model output.
        all_probs = (-0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) /
                     sigma_sq - 0.5 * tf.log(2.0 * np.pi) -
                     0.5 * self.log_sigma_sq)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        self.entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + self.log_sigma_sq)

        self.create_value_heads(self.stream_names, hidden_value)

        self.all_old_log_probs = tf.placeholder(shape=[None, self.act_size[0]],
                                                dtype=tf.float32,
                                                name="old_probabilities")

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)
        self.old_log_probs = tf.reduce_sum(
            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True)
예제 #13
0
    def create_dc_actor_critic(self, h_size: int, num_layers: int,
                               vis_encode_type: EncoderType) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        """
        hidden_streams = self.create_observation_streams(
            1, h_size, num_layers, vis_encode_type)
        hidden = hidden_streams[0]

        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden = tf.concat([hidden, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden, memory_out = self.create_recurrent_encoder(
                hidden, self.memory_in, self.sequence_length)
            self.memory_out = tf.identity(memory_out, name="recurrent_out")

        policy_branches = []
        for size in self.act_size:
            policy_branches.append(
                tf.layers.dense(
                    hidden,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=LearningModel.scaled_init(0.01),
                ))

        self.all_log_probs = tf.concat(policy_branches,
                                       axis=1,
                                       name="action_probs")

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")
        output, _, normalized_logits = self.create_discrete_action_masking_layer(
            self.all_log_probs, self.action_masks, self.act_size)

        self.output = tf.identity(output)
        self.normalized_logits = tf.identity(normalized_logits, name="action")

        self.create_value_heads(self.stream_names, hidden)

        self.action_holder = tf.placeholder(shape=[None,
                                                   len(policy_branches)],
                                            dtype=tf.int32,
                                            name="action_holder")
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

        self.all_old_log_probs = tf.placeholder(
            shape=[None, sum(self.act_size)],
            dtype=tf.float32,
            name="old_probabilities")
        _, _, old_normalized_logits = self.create_discrete_action_masking_layer(
            self.all_old_log_probs, self.action_masks, self.act_size)

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            (tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:,
                                               action_idx[i]:action_idx[i +
                                                                        1]]),
                        logits=self.all_log_probs[:,
                                                  action_idx[i]:action_idx[i +
                                                                           1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
        )

        self.log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:,
                                              action_idx[i]:action_idx[i + 1]],
                        logits=normalized_logits[:,
                                                 action_idx[i]:action_idx[i +
                                                                          1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
        self.old_log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:,
                                              action_idx[i]:action_idx[i + 1]],
                        logits=old_normalized_logits[:, action_idx[i]:
                                                     action_idx[i + 1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
예제 #14
0
    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
        """
        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        """
        if self.use_recurrent:
            self.prev_action = tf.placeholder(shape=[None,
                                                     len(self.act_size)],
                                              dtype=tf.int32,
                                              name="prev_action")
            prev_action_oh = tf.concat(
                [
                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                hidden_policy,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy",
            )

            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
        else:
            hidden_policy = encoded

        policy_branches = []
        with tf.variable_scope("policy"):
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_policy,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=ModelUtils.scaled_init(0.01),
                    ))

        raw_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")

        self.action_masks = tf.placeholder(shape=[None,
                                                  sum(self.act_size)],
                                           dtype=tf.float32,
                                           name="action_masks")
        output, self.action_probs, normalized_logits = ModelUtils.create_discrete_action_masking_layer(
            raw_log_probs, self.action_masks, self.act_size)

        self.output = tf.identity(output)
        self.all_log_probs = tf.identity(normalized_logits, name="action")

        self.action_holder = tf.placeholder(shape=[None,
                                                   len(policy_branches)],
                                            dtype=tf.int32,
                                            name="action_holder")
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.selected_actions = tf.stop_gradient(self.action_oh)

        action_idx = [0] + list(np.cumsum(self.act_size))

        self.entropy = tf.reduce_sum(
            (tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:,
                                               action_idx[i]:action_idx[i +
                                                                        1]]),
                        logits=self.all_log_probs[:,
                                                  action_idx[i]:action_idx[i +
                                                                           1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
        )

        self.log_probs = tf.reduce_sum(
            (tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:,
                                              action_idx[i]:action_idx[i + 1]],
                        logits=normalized_logits[:,
                                                 action_idx[i]:action_idx[i +
                                                                          1]],
                    ) for i in range(len(self.act_size))
                ],
                axis=1,
            )),
            axis=1,
            keepdims=True,
        )
예제 #15
0
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy.
        """
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy")

            self.memory_out = tf.identity(memory_policy_out,
                                          name="recurrent_out")
        else:
            hidden_policy = encoded

        with tf.variable_scope("policy"):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=ModelUtils.scaled_init(0.01),
                reuse=tf.AUTO_REUSE,
            )

            # Policy-dependent log_sigma
            if condition_sigma_on_obs:
                log_sigma = tf.layers.dense(
                    hidden_policy,
                    self.act_size[0],
                    activation=None,
                    name="log_sigma",
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
            else:
                log_sigma = tf.get_variable(
                    "log_sigma",
                    [self.act_size[0]],
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer(),
                )
            log_sigma = tf.clip_by_value(log_sigma, self.log_std_min,
                                         self.log_std_max)

            sigma = tf.exp(log_sigma)

            epsilon = tf.random_normal(tf.shape(mu))

            sampled_policy = mu + sigma * epsilon

            # Stop gradient if we're not doing the resampling trick
            if not reparameterize:
                sampled_policy_probs = tf.stop_gradient(sampled_policy)
            else:
                sampled_policy_probs = sampled_policy

            # Compute probability of model output.
            _gauss_pre = -0.5 * (
                ((sampled_policy_probs - mu) /
                 (sigma + EPSILON))**2 + 2 * log_sigma + np.log(2 * np.pi))
            all_probs = _gauss_pre
            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

        if tanh_squash:
            self.output_pre = tf.tanh(sampled_policy)

            # Squash correction
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = sampled_policy
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        single_dim_entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + 2 * log_sigma)
        # Make entropy the right shape
        self.entropy = tf.ones_like(tf.reshape(mu[:, 0],
                                               [-1])) * single_dim_entropy

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)

        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")