예제 #1
0
    def create_normalizer_update(
        vector_input: tf.Tensor,
        steps: tf.Tensor,
        running_mean: tf.Tensor,
        running_variance: tf.Tensor,
    ) -> tf.Operation:
        """
        Build the op that folds a batch of observations into the running statistics.

        The update is the batched form of Welford's algorithm, see
        https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
        Note that running_variance is only ever incremented by a sum of deviation
        products here; it is never divided by the step count inside this function.

        :param vector_input: Batch of vector observations; the size of its first
            dimension is the number of steps added.
        :param steps: Tensor holding the number of steps folded in so far.
        :param running_mean: Tensor holding the current running mean.
        :param running_variance: Tensor holding the current running variance accumulator.
        :return: A grouped TF operation assigning the new mean, variance and step count.
        """
        batch_count = tf.shape(vector_input)[0]
        updated_steps = tf.add(steps, batch_count)

        # Shift the mean by the summed per-sample deviations, each scaled by the new total.
        delta_from_old_mean = tf.subtract(vector_input, running_mean)
        updated_steps_float = tf.cast(updated_steps, dtype=tf.float32)
        mean_shift = tf.reduce_sum(delta_from_old_mean / updated_steps_float, axis=0)
        updated_mean = running_mean + mean_shift

        # Welford step: pair the deviation from the new mean with the one from the old mean.
        delta_from_new_mean = tf.subtract(vector_input, updated_mean)
        variance_increment = tf.reduce_sum(
            delta_from_new_mean * delta_from_old_mean, axis=0
        )
        updated_variance = running_variance + variance_increment

        assign_ops = [
            tf.assign(running_mean, updated_mean),
            tf.assign(running_variance, updated_variance),
            tf.assign(steps, updated_steps),
        ]
        return tf.group(assign_ops)
예제 #2
0
 def __init__(
     self, m_size, normalize, use_recurrent, brain, seed, stream_names=None
 ):
     """
     Create the shared placeholders, bookkeeping attributes and integer
     metadata variables of the model graph.

     :param m_size: Memory size; kept only when use_recurrent is truthy, otherwise 0 is stored.
     :param normalize: Stored unchanged as self.normalize.
     :param use_recurrent: Stored as self.use_recurrent; decides whether m_size is kept.
     :param brain: Object supplying the action/observation space attributes read below.
     :param seed: Value handed to tf.set_random_seed.
     :param stream_names: Optional names; a falsy value is replaced by an empty list.
     """
     tf.set_random_seed(seed)
     self.brain = brain
     self.vector_in = None
     step_tensors = self.create_global_steps()
     self.global_step, self.increment_step, self.steps_to_increment = step_tensors
     self.visual_in = []

     # Scalar / per-sample placeholders fed at run time.
     self.batch_size = tf.placeholder(dtype=tf.int32, shape=None, name="batch_size")
     self.sequence_length = tf.placeholder(
         dtype=tf.int32, shape=None, name="sequence_length"
     )
     self.mask_input = tf.placeholder(dtype=tf.float32, shape=[None], name="masks")
     self.mask = tf.cast(self.mask_input, tf.int32)

     self.stream_names = stream_names if stream_names else []
     self.use_recurrent = use_recurrent
     self.m_size = m_size if self.use_recurrent else 0
     self.normalize = normalize
     self.act_size = brain.vector_action_space_size
     self.vec_obs_size = brain.vector_observation_space_size
     self.vis_obs_size = brain.number_visual_observations

     # Non-trainable int32 variables describing the model; they are created
     # only for their presence in the graph and are not kept as attributes.
     is_continuous = brain.vector_action_space_type == "continuous"
     metadata_kwargs = {"trainable": False, "dtype": tf.int32}
     tf.Variable(int(is_continuous), name="is_continuous_control", **metadata_kwargs)
     tf.Variable(self._version_number_, name="version_number", **metadata_kwargs)
     tf.Variable(self.m_size, name="memory_size", **metadata_kwargs)
     # Continuous control uses the first action size, otherwise the sum of all sizes.
     action_output_size = self.act_size[0] if is_continuous else sum(self.act_size)
     tf.Variable(action_output_size, name="action_output_shape", **metadata_kwargs)

     # Attributes initialised empty here and assigned elsewhere.
     self.value_heads: Dict[str, tf.Tensor] = {}
     self.normalization_steps: Optional[tf.Variable] = None
     self.running_mean: Optional[tf.Variable] = None
     self.running_variance: Optional[tf.Variable] = None
     self.update_normalization: Optional[tf.Operation] = None
     self.value: Optional[tf.Tensor] = None
예제 #3
0
 def normalize_vector_obs(self, vector_obs):
     """
     Return vector_obs standardised with the running statistics and clipped to [-5, 5].

     The observation is centred on self.running_mean and divided by the square
     root of self.running_variance / (self.normalization_steps + 1).

     :param vector_obs: Vector observation tensor to normalise.
     :return: The clipped tensor, named "normalized_state".
     """
     centered = vector_obs - self.running_mean
     sample_count = tf.cast(self.normalization_steps, tf.float32) + 1
     std_estimate = tf.sqrt(self.running_variance / sample_count)
     return tf.clip_by_value(
         centered / std_estimate, -5, 5, name="normalized_state"
     )
예제 #4
0
 def create_normalizer_update(self, vector_input):
     """
     Build the op that folds a batch of observations into the running
     mean / variance used for normalisation.

     The update is the batched form of Welford's algorithm: every row of
     vector_input counts as one step. Collapsing the batch to its mean and
     counting it as a single step (the previous behaviour) makes
     running_variance track the spread of batch means, which understates the
     spread of individual observations whenever a batch holds more than one row.

     :param vector_input: Batch of vector observations; the size of its first
         dimension is the number of steps added.
     :return: A grouped TF operation assigning self.running_mean,
         self.running_variance and self.normalization_steps.
     """
     # NOTE(review): assumes self.normalization_steps is int32, matching the
     # dtype tf.shape returns by default -- confirm against its definition.
     steps_increment = tf.shape(vector_input)[0]
     total_new_steps = tf.add(self.normalization_steps, steps_increment)

     # Shift the mean by the summed per-sample deviations, each scaled by the new total.
     input_to_old_mean = tf.subtract(vector_input, self.running_mean)
     new_mean = self.running_mean + tf.reduce_sum(
         input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32), axis=0
     )
     # Welford step: pair the deviation from the new mean with the one from the old mean.
     input_to_new_mean = tf.subtract(vector_input, new_mean)
     new_variance = self.running_variance + tf.reduce_sum(
         input_to_new_mean * input_to_old_mean, axis=0
     )
     update_mean = tf.assign(self.running_mean, new_mean)
     update_variance = tf.assign(self.running_variance, new_variance)
     update_norm_step = tf.assign(self.normalization_steps, total_new_steps)
     return tf.group([update_mean, update_variance, update_norm_step])
예제 #5
0
    def create_normalizer_update(self, vector_input):
        """
        Build the op that folds a batch of observations into the running statistics.

        The update is the batched form of Welford's algorithm for running mean
        and standard deviation, see
        https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates

        :param vector_input: Batch of vector observations; the size of its first
            dimension is the number of steps added.
        :return: A grouped TF operation assigning self.running_mean,
            self.running_variance and self.normalization_steps.
        """
        batch_count = tf.shape(vector_input)[0]
        updated_steps = tf.add(self.normalization_steps, batch_count)

        # Shift the mean by the summed per-sample deviations, each scaled by the new total.
        delta_from_old_mean = tf.subtract(vector_input, self.running_mean)
        updated_steps_float = tf.cast(updated_steps, dtype=tf.float32)
        mean_shift = tf.reduce_sum(delta_from_old_mean / updated_steps_float, axis=0)
        updated_mean = self.running_mean + mean_shift

        # Welford step: pair the deviation from the new mean with the one from the old mean.
        delta_from_new_mean = tf.subtract(vector_input, updated_mean)
        variance_increment = tf.reduce_sum(
            delta_from_new_mean * delta_from_old_mean, axis=0
        )
        updated_variance = self.running_variance + variance_increment

        assign_ops = [
            tf.assign(self.running_mean, updated_mean),
            tf.assign(self.running_variance, updated_variance),
            tf.assign(self.normalization_steps, updated_steps),
        ]
        return tf.group(assign_ops)
예제 #6
0
 def normalize_vector_obs(
     vector_obs: tf.Tensor,
     running_mean: tf.Tensor,
     running_variance: tf.Tensor,
     normalization_steps: tf.Tensor,
 ) -> tf.Tensor:
     """
     Standardise an observation tensor with running statistics and clip it to [-5, 5].

     The input is centred on running_mean and divided by the square root of
     running_variance / (normalization_steps + 1).

     :param vector_obs: Input vector observation tensor.
     :param running_mean: Tensor holding the current running mean.
     :param running_variance: Tensor holding the current running variance.
     :param normalization_steps: Tensor holding the current number of normalization steps.
     :return: The clipped tensor, named "normalized_state".
     """
     centered = vector_obs - running_mean
     sample_count = tf.cast(normalization_steps, tf.float32) + 1
     std_estimate = tf.sqrt(running_variance / sample_count)
     return tf.clip_by_value(
         centered / std_estimate, -5, 5, name="normalized_state"
     )
예제 #7
0
    def create_input_placeholders(self):
        """
        Populate self.graph with the global-step tensors, observation inputs,
        optional observation normalisation, run-time placeholders and the
        non-trainable int32 metadata variables describing the model.
        """
        with self.graph.as_default():
            step_tensors = ModelUtils.create_global_steps()
            (
                self.global_step,
                self.increment_step_op,
                self.steps_to_increment,
            ) = step_tensors
            observation_inputs = ModelUtils.create_input_placeholders(
                self.behavior_spec.observation_shapes
            )
            self.vector_in, self.visual_in = observation_inputs

            if not self.normalize:
                # No normalisation: the raw vector input is used directly.
                self.processed_vector_in = self.vector_in
                self.update_normalization_op = None
            else:
                self.first_normalization_update = True
                normalizer = ModelUtils.create_normalizer(self.vector_in)
                self.update_normalization_op = normalizer.update_op
                self.init_normalization_op = normalizer.init_op
                self.normalization_steps = normalizer.steps
                self.running_mean = normalizer.running_mean
                self.running_variance = normalizer.running_variance
                self.processed_vector_in = ModelUtils.normalize_vector_obs(
                    self.vector_in,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )

            self.batch_size_ph = tf.placeholder(
                dtype=tf.int32, shape=None, name="batch_size"
            )
            self.sequence_length_ph = tf.placeholder(
                dtype=tf.int32, shape=None, name="sequence_length"
            )
            self.mask_input = tf.placeholder(
                dtype=tf.float32, shape=[None], name="masks"
            )
            # Only needed for PPO, but needed for BC module
            self.epsilon = tf.placeholder(
                dtype=tf.float32, shape=[None, self.act_size[0]], name="epsilon"
            )
            self.mask = tf.cast(self.mask_input, tf.int32)

            # Every metadata variable below is a non-trainable int32.
            metadata_kwargs = {"trainable": False, "dtype": tf.int32}
            tf.Variable(
                int(self.behavior_spec.is_action_continuous()),
                name="is_continuous_control",
                **metadata_kwargs,
            )

            # One variable per component of the trainer version, kept as a tuple.
            int_version = TFPolicy._convert_version_string(__version__)
            version_names = (
                "trainer_major_version",
                "trainer_minor_version",
                "trainer_patch_version",
            )
            self.version_tensors = tuple(
                tf.Variable(int_version[idx], name=var_name, **metadata_kwargs)
                for idx, var_name in enumerate(version_names)
            )

            tf.Variable(MODEL_FORMAT_VERSION, name="version_number", **metadata_kwargs)
            tf.Variable(self.m_size, name="memory_size", **metadata_kwargs)
            # Continuous control uses the first action size, otherwise the sum of all sizes.
            action_output_size = (
                self.act_size[0]
                if self.behavior_spec.is_action_continuous()
                else sum(self.act_size)
            )
            tf.Variable(
                action_output_size, name="action_output_shape", **metadata_kwargs
            )
예제 #8
0
    def create_input_placeholders(self):
        """
        Populate self.graph with the global-step tensors, visual and vector
        inputs, optional observation normalisation, run-time placeholders and
        the non-trainable int32 metadata variables describing the model.
        """
        with self.graph.as_default():
            step_tensors = ModelUtils.create_global_steps()
            (
                self.global_step,
                self.increment_step_op,
                self.steps_to_increment,
            ) = step_tensors
            camera_resolutions = self.brain.camera_resolutions
            self.visual_in = ModelUtils.create_visual_input_placeholders(
                camera_resolutions
            )
            self.vector_in = ModelUtils.create_vector_input(self.vec_obs_size)

            if not self.normalize:
                # No normalisation: the raw vector input is used directly.
                self.processed_vector_in = self.vector_in
                self.update_normalization_op = None
            else:
                normalizer = ModelUtils.create_normalizer(self.vector_in)
                self.update_normalization_op = normalizer.update_op
                self.normalization_steps = normalizer.steps
                self.running_mean = normalizer.running_mean
                self.running_variance = normalizer.running_variance
                self.processed_vector_in = ModelUtils.normalize_vector_obs(
                    self.vector_in,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )

            self.batch_size_ph = tf.placeholder(
                dtype=tf.int32, shape=None, name="batch_size"
            )
            self.sequence_length_ph = tf.placeholder(
                dtype=tf.int32, shape=None, name="sequence_length"
            )
            self.mask_input = tf.placeholder(
                dtype=tf.float32, shape=[None], name="masks"
            )
            # Only needed for PPO, but needed for BC module
            self.epsilon = tf.placeholder(
                dtype=tf.float32, shape=[None, self.act_size[0]], name="epsilon"
            )
            self.mask = tf.cast(self.mask_input, tf.int32)

            # Every metadata variable below is a non-trainable int32.
            is_continuous = self.brain.vector_action_space_type == "continuous"
            metadata_kwargs = {"trainable": False, "dtype": tf.int32}
            tf.Variable(
                int(is_continuous), name="is_continuous_control", **metadata_kwargs
            )
            tf.Variable(
                self._version_number_, name="version_number", **metadata_kwargs
            )
            tf.Variable(self.m_size, name="memory_size", **metadata_kwargs)
            # Continuous control uses the first action size, otherwise the sum of all sizes.
            action_output_size = (
                self.act_size[0] if is_continuous else sum(self.act_size)
            )
            tf.Variable(
                action_output_size, name="action_output_shape", **metadata_kwargs
            )
예제 #9
0
    def __init__(
        self,
        brain,
        h_size=128,
        lr=1e-4,
        n_layers=2,
        m_size=128,
        normalize=False,
        use_recurrent=False,
        seed=0,
    ):
        """
        Build a supervised policy graph that is trained to reproduce teacher actions.

        A single observation stream feeds a dropout layer, an optional recurrent
        encoder and then either per-branch discrete heads or one continuous head.
        The loss compares the policy output with the "teacher_action" placeholder
        and is minimised with Adam.

        :param brain: Forwarded to LearningModel.__init__; its
            vector_action_space_type selects the discrete or continuous head.
        :param h_size: Forwarded to create_observation_streams (presumably the
            hidden layer width -- confirm against that helper).
        :param lr: Learning rate of the Adam optimizer.
        :param n_layers: Forwarded to create_observation_streams (presumably the
            number of hidden layers -- confirm against that helper).
        :param m_size: Forwarded to LearningModel.__init__.
        :param normalize: Forwarded to LearningModel.__init__.
        :param use_recurrent: Forwarded to LearningModel.__init__.
        :param seed: Forwarded to LearningModel.__init__.
        """
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
        # Only one observation stream is requested, so only the first entry is used.
        num_streams = 1
        hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
        hidden = hidden_streams[0]
        self.dropout_rate = tf.placeholder(
            dtype=tf.float32, shape=[], name="dropout_rate"
        )
        # NOTE(review): tf.layers.dropout is called without `training`; per the TF
        # docs that argument defaults to False, in which case the input is returned
        # unchanged -- confirm whether dropout is meant to be active here.
        hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
        if self.use_recurrent:
            tf.Variable(
                self.m_size, name="memory_size", trainable=False, dtype=tf.int32
            )
            self.memory_in = tf.placeholder(
                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
            )
            hidden_reg, self.memory_out = self.create_recurrent_encoder(
                hidden_reg, self.memory_in, self.sequence_length
            )
            # Re-export the memory output under a fixed node name.
            self.memory_out = tf.identity(self.memory_out, name="recurrent_out")

        if brain.vector_action_space_type == "discrete":
            # One bias-free linear head per action branch.
            policy_branches = []
            for size in self.act_size:
                policy_branches.append(
                    tf.layers.dense(
                        hidden_reg,
                        size,
                        activation=None,
                        use_bias=False,
                        kernel_initializer=tf.initializers.variance_scaling(0.01),
                    )
                )
            # Softmax is applied per branch; the results are concatenated along axis 1.
            self.action_probs = tf.concat(
                [tf.nn.softmax(branch) for branch in policy_branches],
                axis=1,
                name="action_probs",
            )
            self.action_masks = tf.placeholder(
                shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
            )
            self.sample_action_float, _, normalized_logits = self.create_discrete_action_masking_layer(
                tf.concat(policy_branches, axis=1), self.action_masks, self.act_size
            )
            # The third value returned by the masking layer is the node named "action".
            tf.identity(normalized_logits, name="action")
            self.sample_action = tf.cast(self.sample_action_float, tf.int32)
            # Teacher actions: one int32 column per branch.
            self.true_action = tf.placeholder(
                shape=[None, len(policy_branches)],
                dtype=tf.int32,
                name="teacher_action",
            )
            # One-hot encode each branch's teacher action and concatenate along axis 1.
            self.action_oh = tf.concat(
                [
                    tf.one_hot(self.true_action[:, i], self.act_size[i])
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            # Summed negative log-probability of the teacher actions; 1e-10 guards log(0).
            self.loss = tf.reduce_sum(
                -tf.log(self.action_probs + 1e-10) * self.action_oh
            )
            # NOTE(review): argmax runs over the concatenated probabilities of all
            # branches before being compared with sample_action -- confirm this is
            # the intended statistic when there is more than one branch.
            self.action_percent = tf.reduce_mean(
                tf.cast(
                    tf.equal(
                        tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                        self.sample_action,
                    ),
                    tf.float32,
                )
            )
        else:
            # Single bias-free linear head sized by the first action size.
            self.policy = tf.layers.dense(
                hidden_reg,
                self.act_size[0],
                activation=None,
                use_bias=False,
                name="pre_action",
                kernel_initializer=tf.initializers.variance_scaling(0.01),
            )
            # Both the policy output and the teacher action are clipped to [-1, 1].
            self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
            self.sample_action = tf.identity(self.clipped_sample_action, name="action")
            self.true_action = tf.placeholder(
                shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action"
            )
            self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
            # Summed squared error between clipped teacher and clipped policy actions.
            self.loss = tf.reduce_sum(
                tf.squared_difference(self.clipped_true_action, self.sample_action)
            )

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)