Example #1
File: in_graph_env.py  Project: Gs-001/quad
    def simulate(self, action):
        """Step the environment.

        The result of the step can be accessed from the variables defined below.

        Args:
          action: Tensor holding the action to apply.

        Returns:
          Operation.
        """
        with tf.name_scope('environment/simulate'):
            if action.dtype in (tf.float16, tf.float32, tf.float64):
                action = tf.check_numerics(action, 'action')
            observ_dtype = self._parse_dtype(self._env.observation_space)
            observ, reward, done = tf.py_func(
                lambda a: self._env.step(a)[:3], [action],
                [observ_dtype, tf.float32, tf.bool],
                name='step')
            observ = tf.check_numerics(observ, 'observ')
            reward = tf.check_numerics(reward, 'reward')
            return tf.group(self._observ.assign(observ),
                            self._action.assign(action),
                            self._reward.assign(reward),
                            self._done.assign(done), self._step.assign_add(1))
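The snippet drives a Python-side gym environment from inside the TensorFlow graph by routing env.step through tf.py_func and validating the float tensors with tf.check_numerics. A minimal, self-contained sketch of the same pattern (the environment name, dtypes, and explicit casts below are illustrative assumptions, not taken from the project):

import gym
import numpy as np
import tensorflow as tf

env = gym.make('CartPole-v0')
env.reset()

def _step(action):
    # Run one Python-side step and cast the results so they match the
    # dtypes declared in the tf.py_func call below.
    observ, reward, done = env.step(int(action))[:3]
    return (np.asarray(observ, np.float32),
            np.float32(reward),
            np.bool_(done))

action = tf.placeholder(tf.int64, shape=[])
observ, reward, done = tf.py_func(
    _step, [action], [tf.float32, tf.float32, tf.bool], name='step')
observ = tf.check_numerics(observ, 'observ')

with tf.Session() as sess:
    print(sess.run([reward, done], {action: 0}))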
Example #2
    def _build_resource_tf(self):
        self.count_tf = tf.get_variable(dtype=tf.int32,
                                        name='count',
                                        initializer=np.array([self.count],
                                                             dtype=np.int32),
                                        trainable=False)
        self.mean_tf = tf.get_variable(dtype=tf.float32,
                                       name='mean',
                                       initializer=self.mean.astype(
                                           np.float32),
                                       trainable=False)
        self.std_tf = tf.get_variable(dtype=tf.float32,
                                      name='std',
                                      initializer=self.std.astype(np.float32),
                                      trainable=False)

        self.count_ph = tf.get_variable(dtype=tf.int32,
                                        name='count_ph',
                                        shape=[1])
        self.mean_ph = tf.get_variable(dtype=tf.float32,
                                       name='mean_ph',
                                       shape=self.mean.shape)
        self.std_ph = tf.get_variable(dtype=tf.float32,
                                      name='std_ph',
                                      shape=self.std.shape)

        self._update_op = tf.group(self.count_tf.assign(self.count_ph),
                                   self.mean_tf.assign(self.mean_ph),
                                   self.std_tf.assign(self.std_ph))
        return
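_build_resource_tf mirrors the numpy-side count, mean, and std statistics as non-trainable graph variables and builds _update_op to copy values from the *_ph variables into them. A plausible driver for that op (a sketch only; sess and the numpy attributes are assumed to exist on the class, and it relies on TF1 graph mode allowing variables to be fed like placeholders, which is presumably why the names carry the _ph suffix):

import numpy as np

def _update_resource_tf(self):
    # Hypothetical companion method: push the current numpy statistics
    # into the graph by feeding the *_ph variables and running _update_op.
    feed = {
        self.count_ph: np.array([self.count], dtype=np.int32),
        self.mean_ph: self.mean.astype(np.float32),
        self.std_ph: self.std.astype(np.float32),
    }
    self.sess.run(self._update_op, feed_dict=feed)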
Example #3
    def submit(self, value):
        """Submit a single or batch tensor to refine the streaming mean."""
        # Add a batch dimension if necessary.
        if value.shape.ndims == self._sum.shape.ndims:
            value = value[None, ...]
        return tf.group(self._sum.assign_add(tf.reduce_sum(value, 0)),
                        self._count.assign_add(tf.shape(value)[0]))
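A self-contained sketch of the streaming-mean pattern built around the same submit logic; the class below and its value property are illustrative, not the project's own class.

import tensorflow as tf

class StreamingMean(object):
    """Accumulate a sum and a count so the mean can be read at any time."""

    def __init__(self, shape, dtype=tf.float32):
        self._dtype = dtype
        self._sum = tf.Variable(tf.zeros(shape, dtype), trainable=False)
        self._count = tf.Variable(0, dtype=tf.int32, trainable=False)

    def submit(self, value):
        value = tf.convert_to_tensor(value, self._dtype)
        # Add a batch dimension if a single example was submitted.
        if value.shape.ndims == self._sum.shape.ndims:
            value = value[None, ...]
        return tf.group(self._sum.assign_add(tf.reduce_sum(value, 0)),
                        self._count.assign_add(tf.shape(value)[0]))

    @property
    def value(self):
        return self._sum / tf.cast(self._count, self._dtype)

mean = StreamingMean([])
update = mean.submit([1.0, 2.0, 3.0])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update)
    print(sess.run(mean.value))  # 2.0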
Example #4
    def _training(self):
        """Perform multiple training iterations of both policy and value baseline.

        Trains on the episodes collected in memory, then resets the memory.
        Always returns a summary tensor.

        Returns:
          Summary tensor.
        """
        with tf.name_scope('training'):
            assert_full = tf.assert_equal(self._memory_index,
                                          self._config.update_every)
            with tf.control_dependencies([assert_full]):
                data = self._memory.data()
            (observ, action, old_mean, old_logstd, reward), length = data
            with tf.control_dependencies([tf.assert_greater(length, 0)]):
                length = tf.identity(length)
            observ = self._observ_filter.transform(observ)
            reward = self._reward_filter.transform(reward)
            update_summary = self._perform_update_steps(
                observ, action, old_mean, old_logstd, reward, length)
            with tf.control_dependencies([update_summary]):
                penalty_summary = self._adjust_penalty(observ, old_mean,
                                                       old_logstd, length)
            with tf.control_dependencies([penalty_summary]):
                clear_memory = tf.group(self._memory.clear(),
                                        self._memory_index.assign(0))
            with tf.control_dependencies([clear_memory]):
                weight_summary = utility.variable_summaries(
                    tf.trainable_variables(), self._config.weight_summaries)
                return tf.summary.merge(
                    [update_summary, penalty_summary, weight_summary])
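The method orders its phases (update steps, penalty adjustment, memory clearing, weight summaries) purely through tf.control_dependencies. A tiny standalone sketch of that chaining idiom, unrelated to the PPO code itself:

import tensorflow as tf

counter = tf.Variable(0, trainable=False)
first = counter.assign_add(1)
with tf.control_dependencies([first]):
    # This assignment runs only after `first` has run.
    second = counter.assign_add(10)
with tf.control_dependencies([second]):
    done = tf.no_op(name='training_done')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(done)            # forces first, then second, to run
    print(sess.run(counter))  # 11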
Example #5
def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
    """Initialize or restore variables from a checkpoint if available.

    Args:
      sess: Session to initialize variables in.
      saver: Saver to restore variables.
      logdir: Directory to search for checkpoints.
      checkpoint: Specify what checkpoint name to use; defaults to most recent.
      resume: Whether to expect recovering a checkpoint or starting a new run.

    Raises:
      ValueError: If resume expected but no log directory specified.
      RuntimeError: If no resume expected but a checkpoint was found.
    """
    sess.run(
        tf.group(tf.local_variables_initializer(),
                 tf.global_variables_initializer()))
    if resume and not (logdir or checkpoint):
        raise ValueError('Need to specify logdir to resume a checkpoint.')
    if logdir:
        state = tf.train.get_checkpoint_state(logdir)
        if checkpoint:
            checkpoint = os.path.join(logdir, checkpoint)
        if not checkpoint and state and state.model_checkpoint_path:
            checkpoint = state.model_checkpoint_path
        if checkpoint and resume is False:
            message = 'Found unexpected checkpoint when starting a new run.'
            raise RuntimeError(message)
        if checkpoint:
            saver.restore(sess, checkpoint)
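A hypothetical call site for initialize_variables; the variable, log directory, and save call below are placeholders, not the project's actual training loop.

import tensorflow as tf

global_step = tf.Variable(0, name='global_step', trainable=False)
saver = tf.train.Saver()

with tf.Session() as sess:
    # Restores the latest checkpoint from logdir if one exists,
    # otherwise keeps the freshly initialized variables.
    initialize_variables(sess, saver, logdir='/tmp/example_run', resume=None)
    saver.save(sess, '/tmp/example_run/model.ckpt', global_step=global_step)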
Example #6
File: normalize.py  Project: Gs-001/quad
    def reset(self):
        """Reset the estimates of mean and variance.

        Resets the full state of this class.

        Returns:
          Operation.
        """
        with tf.name_scope(self._name + '/reset'):
            return tf.group(self._count.assign(0),
                            self._mean.assign(tf.zeros_like(self._mean)),
                            self._var_sum.assign(tf.zeros_like(self._var_sum)))
Example #7
def assign_nested_vars(variables, tensors):
  """Assign tensors to matching nested tuple of variables.

  Args:
    variables: Nested tuple or list of variables to update.
    tensors: Nested tuple or list of tensors to assign.

  Returns:
    Operation.
  """
  if isinstance(variables, (tuple, list)):
    return tf.group(
        *[assign_nested_vars(variable, tensor) for variable, tensor in zip(variables, tensors)])
  return variables.assign(tensors)
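A short usage sketch for assign_nested_vars, assigning a nested tuple of state variables in a single grouped op; the shapes and nesting here are made up for illustration.

import tensorflow as tf

state = (tf.Variable(tf.zeros([2, 3])), (tf.Variable(tf.zeros([2])),))
new_state = (tf.ones([2, 3]), (tf.ones([2]),))
update = assign_nested_vars(state, new_state)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update)
    print(sess.run(state[1][0]))  # [1. 1.]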
Example #8
    def _perform_update_steps(self, observ, action, old_mean, old_logstd,
                              reward, length):
        """Perform multiple update steps of value function and policy.

        The advantage is computed once at the beginning and shared across
        iterations. Only one iteration can contribute the returned summary,
        so we choose the one after half of the iterations.

        Args:
          observ: Sequences of observations.
          action: Sequences of actions.
          old_mean: Sequences of action means of the behavioral policy.
          old_logstd: Sequences of action log stddevs of the behavioral policy.
          reward: Sequences of rewards.
          length: Batch of sequence lengths.

        Returns:
          Summary tensor.
        """
        return_ = utility.discounted_return(reward, length,
                                            self._config.discount)
        value = self._network(observ, length).value
        if self._config.gae_lambda:
            advantage = utility.lambda_return(reward, value, length,
                                              self._config.discount,
                                              self._config.gae_lambda)
        else:
            advantage = return_ - value
        mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
        advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
        advantage = tf.Print(advantage,
                             [tf.reduce_mean(return_),
                              tf.reduce_mean(value)], 'return and value: ')
        advantage = tf.Print(advantage, [tf.reduce_mean(advantage)],
                             'normalized advantage: ')
        # pylint: disable=g-long-lambda
        value_loss, policy_loss, summary = tf.scan(
            lambda _1, _2: self._update_step(
                observ, action, old_mean, old_logstd, reward, advantage, length
            ),
            tf.range(self._config.update_epochs), [0., 0., ''],
            parallel_iterations=1)
        print_losses = tf.group(
            tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
            tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
        with tf.control_dependencies([value_loss, policy_loss, print_losses]):
            return summary[self._config.update_epochs // 2]
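The update loop repeats _update_step a fixed number of times inside the graph by scanning over tf.range(self._config.update_epochs) with parallel_iterations=1. A minimal standalone sketch of that idiom (the variable and step function below are illustrative only):

import tensorflow as tf

total = tf.Variable(0.0, trainable=False)

def _step(_, __):
    # Each scan iteration performs one in-graph update and reports the
    # running value; the loop index itself is ignored.
    return total.assign_add(1.0)

values = tf.scan(_step, tf.range(5), initializer=0.0, parallel_iterations=1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(values))  # [1. 2. 3. 4. 5.]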
Example #9
def reinit_nested_vars(variables, indices=None):
  """Reset all variables in a nested tuple to zeros.

  Args:
    variables: Nested tuple or list of variables.
    indices: Indices along the first dimension to reset, defaults to all.

  Returns:
    Operation.
  """
  if isinstance(variables, (tuple, list)):
    return tf.group(*[reinit_nested_vars(variable, indices) for variable in variables])
  if indices is None:
    return variables.assign(tf.zeros_like(variables))
  else:
    zeros = tf.zeros([tf.shape(indices)[0]] + variables.shape[1:].as_list())
    return tf.scatter_update(variables, indices, zeros)
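A usage sketch for reinit_nested_vars, resetting only selected batch rows (for example, the environments whose episode just ended); the shapes and indices are illustrative.

import tensorflow as tf

state = (tf.Variable(tf.ones([4, 3])), tf.Variable(tf.ones([4])))
reset_done = reinit_nested_vars(state, indices=tf.constant([0, 2]))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(reset_done)
    print(sess.run(state[0]))  # rows 0 and 2 zeroed, rows 1 and 3 unchanged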
Example #10
def assign_nested_vars(variables, tensors, indices=None):
    """Assign tensors to matching nested tuple of variables.

    Args:
      variables: Nested tuple or list of variables to update.
      tensors: Nested tuple or list of tensors to assign.
      indices: Batch indices to assign to; defaults to all.

    Returns:
      Operation.
    """
    if isinstance(variables, (tuple, list)):
        return tf.group(*[
            assign_nested_vars(variable, tensor, indices)
            for variable, tensor in zip(variables, tensors)
        ])
    if indices is None:
        return variables.assign(tensors)
    else:
        return tf.scatter_update(variables, indices, tensors)
Example #11
    def __init__(self, sess, var_list, dtype=tf.float32):
        shapes = list(map(var_shape, var_list))
        total_size = np.sum([intprod(shape) for shape in shapes])

        self.sess = sess
        self.theta = tf.placeholder(dtype, [total_size])
        start = 0
        assigns = []

        for (shape, v) in zip(shapes, var_list):
            size = intprod(shape)
            assigns.append(
                tf.assign(v, tf.reshape(self.theta[start:start + size],
                                        shape)))
            start += size

        self.op = tf.group(*assigns)

        return
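This constructor depends on two small helpers from its project, var_shape and intprod, which are not shown here. Plausible definitions (matching the common names, but an assumption rather than the project's code) and a hypothetical usage follow.

import numpy as np
import tensorflow as tf

def var_shape(v):
    # Static shape of a variable as a plain Python list of ints.
    return [int(d) for d in v.get_shape().as_list()]

def intprod(shape):
    # Total number of elements for a given static shape.
    return int(np.prod(shape))

# Hypothetical usage, where `setter` is an instance of the class whose
# __init__ is shown above: write a flat numpy parameter vector back into
# the graph's variables with a single run of the grouped assign op.
#   setter = SomeClass(sess, tf.trainable_variables())
#   sess.run(setter.op, feed_dict={setter.theta: flat_params})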