def _define_experience(self, agent_indices, observ, action, reward):
  """Implement the branch of experience() entered during training."""
  update_filters = tf.summary.merge([
      self._observ_filter.update(observ),
      self._reward_filter.update(reward)])
  with tf.control_dependencies([update_filters]):
    if self._config.train_on_agent_action:
      # NOTE: Doesn't seem to change much.
      action = self._last_action
    batch = (
        observ, action,
        tf.gather(self._last_mean, agent_indices),
        tf.gather(self._last_logstd, agent_indices),
        reward)
    append = self._episodes.append(batch, agent_indices)
  with tf.control_dependencies([append]):
    norm_observ = self._observ_filter.transform(observ)
    norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        update_filters,
        self._observ_filter.summary(),
        self._reward_filter.summary(),
        tf.summary.scalar('memory_size', self._memory_index),
        tf.summary.histogram('normalized_observ', norm_observ),
        tf.summary.histogram('action', self._last_action),
        tf.summary.scalar('normalized_reward', norm_reward)]), str)
    return summary
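# Aside (not part of the original code): a minimal standalone sketch of the
# tf.cond(..., str) idiom used above. Passing Python's built-in str (or float)
# as a branch callable makes TensorFlow convert its return value ('' or 0.0)
# into a constant tensor, which serves as a cheap "empty summary" default.
# The placeholder below is made up for illustration.
import tensorflow as tf

should_log = tf.placeholder(tf.bool, [])
summary = tf.cond(
    should_log,
    lambda: tf.constant('serialized summary protobuf'),
    str)  # str() returns '', converted to an empty string tensor.

with tf.Session() as sess:
  print(sess.run(summary, {should_log: False}))  # b''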
def append(self, transitions, rows=None):
  """Append a batch of transitions to rows of the memory.

  Args:
    transitions: Tuple of transition quantities with batch dimension.
    rows: Episodes to append to, defaults to all.

  Returns:
    Operation.
  """
  rows = tf.range(self._capacity) if rows is None else rows
  assert rows.shape.ndims == 1
  assert_capacity = tf.assert_less(
      rows, self._capacity, message='capacity exceeded')
  with tf.control_dependencies([assert_capacity]):
    assert_max_length = tf.assert_less(
        tf.gather(self._length, rows), self._max_length,
        message='max length exceeded')
  append_ops = []
  with tf.control_dependencies([assert_max_length]):
    for buffer_, elements in zip(self._buffers, transitions):
      # Write each new element at its episode's current length.
      timestep = tf.gather(self._length, rows)
      indices = tf.stack([rows, timestep], 1)
      append_ops.append(tf.scatter_nd_update(buffer_, indices, elements))
  with tf.control_dependencies(append_ops):
    episode_mask = tf.reduce_sum(tf.one_hot(
        rows, self._capacity, dtype=tf.int32), 0)
    return self._length.assign_add(episode_mask)
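# Aside (hedged sketch with assumed shapes, not from the original code):
# append() writes one new element per selected episode by stacking
# (row, timestep) pairs and handing them to tf.scatter_nd_update. The toy
# variables below exist only to make the indexing pattern concrete.
import tensorflow as tf

buffer_ = tf.Variable(tf.zeros([4, 5]))        # [capacity, max_length]
length = tf.Variable(tf.zeros([4], tf.int32))  # current length per episode
rows = tf.constant([0, 2])                     # episodes receiving new data
elements = tf.constant([1.0, 2.0])             # one value per selected row

timestep = tf.gather(length, rows)             # write position for each row
indices = tf.stack([rows, timestep], 1)        # shape [num_rows, 2]
write = tf.scatter_nd_update(buffer_, indices, elements)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(write))  # rows 0 and 2 now hold a value at timestep 0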
def _define_end_episode(self, agent_indices):
  """Implement the branch of end_episode() entered during training."""
  episodes, length = self._episodes.data(agent_indices)
  space_left = self._config.update_every - self._memory_index
  use_episodes = tf.range(tf.minimum(
      tf.shape(agent_indices)[0], space_left))
  episodes = [tf.gather(elem, use_episodes) for elem in episodes]
  append = self._memory.replace(
      episodes, tf.gather(length, use_episodes),
      use_episodes + self._memory_index)
  with tf.control_dependencies([append]):
    inc_index = self._memory_index.assign_add(tf.shape(use_episodes)[0])
  with tf.control_dependencies([inc_index]):
    memory_full = self._memory_index >= self._config.update_every
    return tf.cond(memory_full, self._training, str)
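# Aside (made-up numbers, for illustration only): the truncation in
# _define_end_episode() copies at most as many finished episodes as still fit
# into the current update batch, so the memory index never exceeds
# config.update_every.
import tensorflow as tf

agent_indices = tf.constant([0, 3, 5, 7])  # four episodes just ended
memory_index = tf.constant(28)
update_every = 30
space_left = update_every - memory_index   # two slots remain
use_episodes = tf.range(tf.minimum(tf.shape(agent_indices)[0], space_left))

with tf.Session() as sess:
  print(sess.run(use_episodes))  # [0 1] -> only the first two episodes fit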
def _define_step(self, done, score, summary):
  """Combine operations of a phase.

  Keeps track of the mean score and when to report it.

  Args:
    done: Tensor indicating whether current score can be used.
    score: Tensor holding the current, possibly intermediate, score.
    summary: Tensor holding summary string to write if not an empty string.

  Returns:
    Tuple of summary tensor, mean score, new global step, and the number of
    steps made. The mean score is zero for non reporting steps.
  """
  if done.shape.ndims == 0:
    done = done[None]
  if score.shape.ndims == 0:
    score = score[None]
  score_mean = streaming_mean.StreamingMean((), tf.float32)
  with tf.control_dependencies([done, score, summary]):
    done_score = tf.gather(score, tf.where(done)[:, 0])
    submit_score = tf.cond(
        tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
  with tf.control_dependencies([submit_score]):
    mean_score = tf.cond(self._report, score_mean.clear, float)
    steps_made = tf.shape(score)[0]
    next_step = self._step.assign_add(steps_made)
  with tf.control_dependencies([mean_score, next_step]):
    return tf.identity(summary), mean_score, next_step, steps_made
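# Aside (toy values, not from the original code): _define_step() keeps only
# the scores of environments whose episode just finished by gathering with
# the indices of True entries in `done`.
import tensorflow as tf

done = tf.constant([True, False, True])
score = tf.constant([10.0, 3.0, 7.0])
done_score = tf.gather(score, tf.where(done)[:, 0])

with tf.Session() as sess:
  print(sess.run(done_score))  # [10.  7.]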
def _define_end_episode(agent_indices):
  """Notify the algorithm of ending episodes.

  Also updates the mean score and length counters used for summaries.

  Args:
    agent_indices: Tensor holding batch indices that end their episodes.

  Returns:
    Summary tensor.
  """
  assert agent_indices.shape.ndims == 1
  submit_score = mean_score.submit(tf.gather(score, agent_indices))
  submit_length = mean_length.submit(
      tf.cast(tf.gather(length, agent_indices), tf.float32))
  with tf.control_dependencies([submit_score, submit_length]):
    return algo.end_episode(agent_indices)
def data(self, rows=None):
  """Access a batch of episodes from the memory.

  Padding elements after the length of each episode are unspecified and
  might contain old data.

  Args:
    rows: Episodes to select, defaults to all.

  Returns:
    Tuple containing a tuple of transition quantities with batch and time
    dimensions, and a batch of sequence lengths.
  """
  rows = tf.range(self._capacity) if rows is None else rows
  assert rows.shape.ndims == 1
  episode = [tf.gather(buffer_, rows) for buffer_ in self._buffers]
  length = tf.gather(self._length, rows)
  return episode, length
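# Aside (assumed shapes, illustrative only): because padding beyond each
# episode's length is unspecified, code consuming data() typically builds a
# mask from the returned lengths before reducing over the time dimension.
import tensorflow as tf

observ = tf.random_normal([3, 5, 2])  # [episodes, max_length, observ_dim]
length = tf.constant([5, 2, 4])       # valid timesteps per episode
mask = tf.sequence_mask(length, maxlen=5, dtype=tf.float32)  # [3, 5]
masked_sum = tf.reduce_sum(observ * mask[..., None], axis=1)

with tf.Session() as sess:
  print(sess.run(masked_sum).shape)  # (3, 2)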
def length(self, rows=None):
  """Tensor holding the current length of episodes.

  Args:
    rows: Episodes to select length from, defaults to all.

  Returns:
    Batch tensor of sequence lengths.
  """
  rows = tf.range(self._capacity) if rows is None else rows
  return tf.gather(self._length, rows)
def perform(self, agent_indices, observ):
  """Compute batch of actions and a summary for a batch of observations.

  Args:
    agent_indices: Tensor containing current batch indices.
    observ: Tensor of a batch of observations for all agents.

  Returns:
    Tuple of action batch tensor and summary tensor.
  """
  with tf.name_scope('perform/'):
    observ = self._observ_filter.transform(observ)
    if self._last_state is None:
      state = None
    else:
      state = tf.contrib.framework.nest.map_structure(
          lambda x: tf.gather(x, agent_indices), self._last_state)
    output = self._network(
        observ[:, None], tf.ones(observ.shape[0]), state)
    action = tf.cond(
        self._is_training, output.policy.sample, lambda: output.mean)
    logprob = output.policy.log_prob(action)[:, 0]
    # pylint: disable=g-long-lambda
    summary = tf.cond(self._should_log, lambda: tf.summary.merge([
        tf.summary.histogram('mean', output.mean[:, 0]),
        tf.summary.histogram('std', tf.exp(output.logstd[:, 0])),
        tf.summary.histogram('action', action[:, 0]),
        tf.summary.histogram('logprob', logprob)]), str)
    # Remember current policy to append to memory in the experience callback.
    if self._last_state is None:
      assign_state = tf.no_op()
    else:
      assign_state = utility.assign_nested_vars(
          self._last_state, output.state, agent_indices)
    with tf.control_dependencies([
        assign_state,
        tf.scatter_update(
            self._last_action, agent_indices, action[:, 0]),
        tf.scatter_update(
            self._last_mean, agent_indices, output.mean[:, 0]),
        tf.scatter_update(
            self._last_logstd, agent_indices, output.logstd[:, 0])]):
      return tf.check_numerics(action[:, 0], 'action'), tf.identity(summary)
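# Aside (hypothetical shapes, not part of perform()): two patterns the method
# relies on, shown in isolation. observ[:, None] inserts a length-one time
# dimension so the network receives [batch, time, ...] input, and
# tf.scatter_update overwrites only the rows of the agents that acted.
import tensorflow as tf

observ = tf.zeros([6, 3])       # [agents, observ_dim]
sequence = observ[:, None]      # [agents, 1, observ_dim]

last_action = tf.Variable(tf.zeros([6, 2]))
agent_indices = tf.constant([1, 4])
new_actions = tf.ones([2, 2])
update = tf.scatter_update(last_action, agent_indices, new_actions)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(sequence).shape)  # (6, 1, 3)
  print(sess.run(update)[1])       # [1. 1.]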