def add_phase(
    self, name, done, score, summary, steps,
    report_every=None, log_every=None, checkpoint_every=None, feed=None):
  """Add a phase to the loop protocol.

  If the model breaks long computation into multiple steps, the done tensor
  indicates whether the current score should be added to the mean counter.
  For example, in reinforcement learning we only have a valid score at the
  end of the episode.

  Score and done tensors can either be scalars or vectors, to support single
  and batched computations.

  Args:
    name: Name for the phase, used for the summary writer.
    done: Tensor indicating whether current score can be used.
    score: Tensor holding the current, possibly intermediate, score.
    summary: Tensor holding summary string to write if not an empty string.
    steps: Duration of the phase in steps.
    report_every: Yield mean score every this number of steps.
    log_every: Request summaries via `log` tensor every this number of steps.
    checkpoint_every: Write checkpoint every this number of steps.
    feed: Additional feed dictionary for the session run call.

  Raises:
    ValueError: Unknown rank for done or score tensors.
  """
  done = tf.convert_to_tensor(done, tf.bool)
  score = tf.convert_to_tensor(score, tf.float32)
  summary = tf.convert_to_tensor(summary, tf.string)
  feed = feed or {}
  if done.shape.ndims is None or score.shape.ndims is None:
    raise ValueError("Rank of 'done' and 'score' tensors must be known.")
  writer = self._logdir and tf.summary.FileWriter(
      os.path.join(self._logdir, name), tf.get_default_graph(),
      flush_secs=60)
  op = self._define_step(done, score, summary)
  batch = 1 if score.shape.ndims == 0 else score.shape[0].value
  self._phases.append(_Phase(
      name, writer, op, batch, int(steps), feed, report_every,
      log_every, checkpoint_every))
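# Usage sketch (illustrative, not part of the original module): assuming this
# method belongs to a `Loop`-style class and that the surrounding training
# script exposes a `graph` object with `done`, `score`, `summary`, and an
# `is_training` placeholder, plus a `config` with step counts. All of those
# names are assumptions about the caller, not definitions from this file.
def _example_add_phases(loop, graph, config):
  # Register a training phase that logs summaries and checkpoints regularly.
  loop.add_phase(
      'train', graph.done, graph.score, graph.summary, config.train_steps,
      report_every=None, log_every=config.log_every,
      checkpoint_every=config.checkpoint_every,
      feed={graph.is_training: True})
  # Register an evaluation phase that only reports the mean score.
  loop.add_phase(
      'eval', graph.done, graph.score, graph.summary, config.eval_steps,
      report_every=config.eval_steps, log_every=None, checkpoint_every=None,
      feed={graph.is_training: False})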
def _network(self, observ, length=None, state=None, reuse=True):
  """Compute the network output for a batched sequence of observations.

  Optionally, the initial state can be specified. The weights should be
  reused for all calls, except for the first one. Output is a named tuple
  containing the policy as a TensorFlow distribution, the policy mean and log
  standard deviation, the approximated state value, and the new recurrent
  state.

  Args:
    observ: Sequences of observations.
    length: Batch of sequence lengths.
    state: Batch of initial recurrent states.
    reuse: Python boolean whether to reuse previous variables.

  Returns:
    NetworkOutput tuple.
  """
  with tf.variable_scope('network', reuse=reuse):
    observ = tf.convert_to_tensor(observ)
    use_gpu = self._config.use_gpu and utility.available_gpus()
    with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
      observ = tf.check_numerics(observ, 'observ')
      cell = self._config.network(self._batch_env.action.shape[1].value)
      (mean, logstd, value), state = tf.nn.dynamic_rnn(
          cell, observ, length, state, tf.float32, swap_memory=True)
    mean = tf.check_numerics(mean, 'mean')
    logstd = tf.check_numerics(logstd, 'logstd')
    value = tf.check_numerics(value, 'value')
    policy = tf.contrib.distributions.MultivariateNormalDiag(
        mean, tf.exp(logstd))
    return _NetworkOutput(policy, mean, logstd, value, state)
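# Sketch of the return container this method assumes; the exact field order
# is inferred from the return statement above, not verified against the rest
# of the module. The commented call site shows the expected tensor shapes.
import collections

_NetworkOutput = collections.namedtuple(
    'NetworkOutput', 'policy, mean, logstd, value, state')

# Example call (inside the algorithm's graph):
#   observ has shape (batch, max_length, observation_size) and length holds
#   the true length of every sequence in the batch.
# output = self._network(observ, length, state=None, reuse=True)
# sampled_actions = output.policy.sample()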
def simulate(batch_env, algo, log=True, reset=False):
  """Simulation step of a vectorized algorithm with in-graph environments.

  Integrates the operations implemented by the algorithm and the environments
  into a combined operation.

  Args:
    batch_env: In-graph batch environment.
    algo: Algorithm instance implementing required operations.
    log: Tensor indicating whether to compute and return summaries.
    reset: Tensor causing all environments to reset.

  Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """

  def _define_begin_episode(agent_indices):
    """Reset environments, intermediate scores and durations for new episodes.

    Args:
      agent_indices: Tensor containing batch indices starting an episode.

    Returns:
      Summary tensor.
    """
    assert agent_indices.shape.ndims == 1
    zero_scores = tf.zeros_like(agent_indices, tf.float32)
    zero_durations = tf.zeros_like(agent_indices)
    reset_ops = [
        batch_env.reset(agent_indices),
        tf.scatter_update(score, agent_indices, zero_scores),
        tf.scatter_update(length, agent_indices, zero_durations)]
    with tf.control_dependencies(reset_ops):
      return algo.begin_episode(agent_indices)

  def _define_step():
    """Request actions from the algorithm and apply them to the environments.

    Increments the lengths of all episodes and increases their scores by the
    current reward. After stepping the environments, provides the full
    transition tuple to the algorithm.

    Returns:
      Summary tensor.
    """
    prevob = batch_env.observ + 0  # Ensure a copy of the variable value.
    action, step_summary = algo.perform(prevob)
    action.set_shape(batch_env.action.shape)
    with tf.control_dependencies([batch_env.simulate(action)]):
      add_score = score.assign_add(batch_env.reward)
      inc_length = length.assign_add(tf.ones(len(batch_env), tf.int32))
    with tf.control_dependencies([add_score, inc_length]):
      experience_summary = algo.experience(
          prevob, batch_env.action, batch_env.reward, batch_env.done,
          batch_env.observ)
    return tf.summary.merge([step_summary, experience_summary])

  def _define_end_episode(agent_indices):
    """Notify the algorithm of ending episodes.

    Also updates the mean score and length counters used for summaries.

    Args:
      agent_indices: Tensor holding batch indices that end their episodes.

    Returns:
      Summary tensor.
    """
    assert agent_indices.shape.ndims == 1
    submit_score = mean_score.submit(tf.gather(score, agent_indices))
    submit_length = mean_length.submit(
        tf.cast(tf.gather(length, agent_indices), tf.float32))
    with tf.control_dependencies([submit_score, submit_length]):
      return algo.end_episode(agent_indices)

  def _define_summaries():
    """Reset the average score and duration, and return them as summary.

    Returns:
      Summary string.
    """
    score_summary = tf.cond(
        tf.logical_and(log, tf.cast(mean_score.count, tf.bool)),
        lambda: tf.summary.scalar('mean_score', mean_score.clear()), str)
    length_summary = tf.cond(
        tf.logical_and(log, tf.cast(mean_length.count, tf.bool)),
        lambda: tf.summary.scalar('mean_length', mean_length.clear()), str)
    return tf.summary.merge([score_summary, length_summary])

  with tf.name_scope('simulate'):
    log = tf.convert_to_tensor(log)
    reset = tf.convert_to_tensor(reset)
    with tf.variable_scope('simulate_temporary'):
      score = tf.Variable(
          tf.zeros(len(batch_env), dtype=tf.float32), False, name='score')
      length = tf.Variable(
          tf.zeros(len(batch_env), dtype=tf.int32), False, name='length')
    mean_score = streaming_mean.StreamingMean((), tf.float32)
    mean_length = streaming_mean.StreamingMean((), tf.float32)
    agent_indices = tf.cond(
        reset,
        lambda: tf.range(len(batch_env)),
        lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32))
    begin_episode = tf.cond(
        tf.cast(tf.shape(agent_indices)[0], tf.bool),
        lambda: _define_begin_episode(agent_indices), str)
    with tf.control_dependencies([begin_episode]):
      step = _define_step()
    with tf.control_dependencies([step]):
      agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)
      end_episode = tf.cond(
          tf.cast(tf.shape(agent_indices)[0], tf.bool),
          lambda: _define_end_episode(agent_indices), str)
    with tf.control_dependencies([end_episode]):
      summary = tf.summary.merge(
          [_define_summaries(), begin_episode, step, end_episode])
    with tf.control_dependencies([summary]):
      done, score = tf.identity(batch_env.done), tf.identity(score)
    return done, score, summary