Example 1
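These snippets omit their imports. A plausible common set is sketched below, assuming the standard TF-Agents package layout (module paths may differ between versions); the `zero_out_new_episodes` helper used in Example 4 comes from the surrounding multi-agent module, not from TF-Agents itself.

# Assumed imports shared by the examples below (sketch only; paths follow
# the TF-Agents layout and may differ between versions).
from typing import Callable, Tuple, Type

import tensorflow as tf

from tf_agents.metrics import tf_metric
from tf_agents.metrics.tf_metric import TFStepMetric
from tf_agents.metrics.tf_metrics import TFDeque
from tf_agents.policies import tf_policy
from tf_agents.trajectories.time_step import TimeStep
from tf_agents.trajectories.trajectory import Trajectory
from tf_agents.utils import common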
class AdversarialEnvironmentScalar(tf_metric.TFStepMetric):
    """Metric to compute average of simple scalars like number of obstacles."""
    def __init__(self,
                 name,
                 prefix='Metrics',
                 dtype=tf.float32,
                 batch_size=1,
                 buffer_size=10):
        super(AdversarialEnvironmentScalar, self).__init__(name=name,
                                                           prefix=prefix)
        self._buffer = TFDeque(buffer_size, dtype)
        self._dtype = dtype

    @common.function(autograph=True)
    def call(self, new_scalar_vals):
        for v in new_scalar_vals:
            self._buffer.add(v)
        return new_scalar_vals

    def result(self):
        return self._buffer.mean()

    @common.function
    def reset(self):
        self._buffer.clear()
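A minimal usage sketch (the values below are illustrative): the metric instance is called with a batch of scalar values, as TF-Agents metrics are callable observers, and result() returns the mean over the buffer.

# Hypothetical usage: feed in per-episode obstacle counts and read the mean.
num_obstacles_metric = AdversarialEnvironmentScalar(name='num_obstacles')
num_obstacles_metric(tf.constant([3.0, 5.0, 4.0]))  # adds each value to the deque
mean_obstacles = num_obstacles_metric.result()      # -> 4.0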
Example 2
class StateVisitationHistogram(tf_metric.TFHistogramStepMetric):
    """Metric to compute the frequency of states visited."""
    def __init__(self,
                 state_selection_function,
                 state_shape=(),
                 name='StateVisitationHistogram',
                 dtype=tf.float64,
                 buffer_size=100):
        super(StateVisitationHistogram, self).__init__(name=name)
        self._buffer = TFDeque(buffer_size, dtype, shape=state_shape)
        self._dtype = dtype
        self._state_selection_function = state_selection_function

    @common.function
    def call(self, trajectory):
        self._buffer.extend(
            self._state_selection_function(trajectory.observation))
        return trajectory

    @common.function
    def result(self):
        return self._buffer.data

    @common.function
    def reset(self):
        self._buffer.clear()
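A usage sketch under the assumption that observations are dicts with a 'position' entry of shape (batch, 2); the selection function and the driver wiring in the comment are illustrative only.

# Hypothetical usage: histogram the visited (x, y) positions.
state_histogram = StateVisitationHistogram(
    state_selection_function=lambda obs: obs['position'],  # assumed observation layout
    state_shape=(2,),
    dtype=tf.float32)  # match the observation dtype
# Attached as a driver observer, it receives each collected trajectory:
#   driver = dynamic_step_driver.DynamicStepDriver(
#       tf_env, policy, observers=[state_histogram], num_steps=1000)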
Example 3
class RewardHistogram(tf_metric.TFHistogramStepMetric):
    """Metric to compute the frequency of rewards."""
    def __init__(self,
                 name='RewardHistogram',
                 dtype=tf.int32,
                 buffer_size=100):
        super(RewardHistogram, self).__init__(name=name)
        self._buffer = TFDeque(buffer_size, dtype)
        self._dtype = dtype

    @common.function
    def call(self, trajectory):
        self._buffer.extend(trajectory.reward)
        return trajectory

    @common.function
    def result(self):
        return self._buffer.data

    @common.function
    def reset(self):
        self._buffer.clear()
Example 4
class AverageReturnMetric(tf_metric.TFStepMetric):
    """Metric for the average collective return and individual agent returns."""
    def __init__(self,
                 n_agents,
                 name='MultiagentAverageReturn',
                 prefix='Metrics',
                 dtype=tf.float32,
                 batch_size=1,
                 buffer_size=10):
        super(AverageReturnMetric, self).__init__(name=name, prefix=prefix)
        self.n_agents = n_agents
        self._dtype = dtype

        # Accumulator and buffer for the average return of all agents
        self._collective_return_accumulator = common.create_variable(
            initial_value=0,
            dtype=dtype,
            shape=(batch_size, ),
            name='Accumulator')
        self._collective_buffer = TFDeque(buffer_size, dtype)

        # Accumulators for each agent's independent reward
        self._agent_return_accumulators = []
        for a in range(n_agents):
            self._agent_return_accumulators.append(
                common.create_variable(initial_value=0,
                                       dtype=dtype,
                                       shape=(batch_size, ),
                                       name='Accumulator' + str(a)))

        # Buffers for each agent's independent reward
        self._agent_buffers = []
        for a in range(n_agents):
            self._agent_buffers.append(TFDeque(buffer_size, dtype))

    @common.function(autograph=True)
    def call(self, trajectory):
        # Zero out batch indices where a new episode is starting.
        self._collective_return_accumulator.assign(
            zero_out_new_episodes(trajectory,
                                  self._collective_return_accumulator))
        for a in range(self.n_agents):
            self._agent_return_accumulators[a].assign(
                zero_out_new_episodes(trajectory,
                                      self._agent_return_accumulators[a]))

        # Note that trajectory.reward has shape (batch, n_agents)

        # Update the collective accumulator with the mean reward across agents.
        self._collective_return_accumulator.assign_add(
            tf.reduce_mean(trajectory.reward, axis=1))

        # Update each agent's accumulator with that agent's own reward.
        for a in range(self.n_agents):
            self._agent_return_accumulators[a].assign_add(
                trajectory.reward[:, a])

        # Add final returns to buffer.
        last_episode_indices = tf.squeeze(tf.where(trajectory.is_last()),
                                          axis=-1)
        for indx in last_episode_indices:
            self._collective_buffer.add(
                self._collective_return_accumulator[indx])

            # Agent buffers that use the global done
            for a in range(self.n_agents):
                self._agent_buffers[a].add(
                    self._agent_return_accumulators[a][indx])

        return trajectory

    def result(self):
        return self._collective_buffer.mean()

    def result_for_agent(self, agent_id):
        return self._agent_buffers[agent_id].mean()

    @common.function
    def reset(self):
        self._collective_buffer.clear()
        self._collective_return_accumulator.assign(
            tf.zeros_like(self._collective_return_accumulator))

        for a in range(self.n_agents):
            self._agent_buffers[a].clear()
            self._agent_return_accumulators[a].assign(
                tf.zeros_like(self._agent_return_accumulators[a]))

    def tf_summaries(self, train_step=None, step_metrics=()):
        """Generates summaries for all agents & collective summary against steps.

    Args:
      train_step: (Optional) Step counter for training iterations. If None, no
        metric is generated against the global step.
      step_metrics: (Optional) Iterable of step metrics to generate summaries
        against.

    Returns:
      A list of summaries.
    """
        summaries = super(AverageReturnMetric,
                          self).tf_summaries(train_step=train_step,
                                             step_metrics=step_metrics)

        for a in range(self.n_agents):
            summaries.extend(
                self.single_agent_summary(a, train_step, step_metrics))

        return summaries

    def single_agent_summary(self, agent_id, train_step=None, step_metrics=()):
        summaries = []
        prefix = self._prefix
        name = self.name + '_agent' + str(agent_id)
        tag = common.join_scope(prefix, name)

        result = self.result_for_agent(agent_id)

        if train_step is not None:
            summaries.append(
                tf.compat.v2.summary.scalar(name=tag,
                                            data=result,
                                            step=train_step))
        if prefix:
            prefix += '_'
        for step_metric in step_metrics:
            # Skip plotting the metric against itself.
            if self.name == step_metric.name:
                continue
            step_tag = '{}vs_{}/{}'.format(prefix, step_metric.name, name)
            # Summaries expect the step value to be an int64.
            step = tf.cast(step_metric.result(), tf.int64)
            summaries.append(
                tf.compat.v2.summary.scalar(name=step_tag,
                                            data=result,
                                            step=step))
        return summaries
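A usage sketch for the multi-agent return metric, assuming a batched environment whose reward tensor has shape (batch, n_agents); the step counter is illustrative.

# Hypothetical usage with three agents.
avg_return = AverageReturnMetric(n_agents=3, batch_size=1, buffer_size=10)
# ... attach as a driver observer and collect experience, then:
collective_return = avg_return.result()         # mean of per-agent-averaged returns
agent0_return = avg_return.result_for_agent(0)  # mean return of agent 0
train_step = tf.Variable(0, dtype=tf.int64)     # illustrative step counter
summaries = avg_return.tf_summaries(train_step=train_step)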
Example 5
class ActionProbabilityMetric(TFStepMetric):
    """
    A metric that records the average action probabilities over a given period.
    Implementation similar to tf_agent.metrics.tf_metrics.AverageReturnMetric
    """

    def __init__(self,
                 policy: tf_policy.TFPolicy,
                 action_indices: Tuple[int, ...],
                 name: str = 'ActionProbability',
                 prefix: str = 'Metrics',
                 dtype: Type = tf.float32,
                 batch_size: int = 1,
                 buffer_size: int = 10):
        """
        :param policy: Policy of the agent, re-evaluated at each time step to obtain the action
            probabilities.
        :param action_indices: A tuple of indices of the action probability vector to track. This is
            a tuple to allow for the case where the action is a tuple of tensors.
        :param name: Name of the metric (as it will appear in tensorboard).
        :param prefix: Prefix to apply as part of the naming convention.
        :param dtype: Data type of the metric.
        :param batch_size: Batch size of the RL environment.
        :param buffer_size: The capacity of the buffer, which overwrites its oldest entries when
            full and is emptied at every logging point.
        """
        super().__init__(name=name, prefix=prefix)
        self._action_indices = action_indices
        self._dtype = dtype
        self._probability_accumulator = common.create_variable(
            initial_value=0, dtype=dtype, shape=(batch_size,), name='Accumulator'
        )
        self._policy = policy
        self._buffer = TFDeque(buffer_size, dtype)
        self._count_accumulator = common.create_variable(
            initial_value=0, dtype=dtype, shape=(batch_size,), name='CountAccumulator'
        )

    @common.function(autograph=True)
    def call(self, trajectory: Trajectory) -> Trajectory:
        time_step = TimeStep(trajectory.step_type, trajectory.reward, trajectory.discount,
                             trajectory.observation)
        action_dist = self._policy.distribution(time_step).action

        # If the action distribution is in fact a tuple of distributions (one for each resource
        # set), index into it to obtain the underlying distribution from which the probabilities
        # can be read. This is only the case when there are multiple resource sets.
        for i in self._action_indices[:-1]:
            action_dist = action_dist[i]

        action_probs = action_dist.probs_parameter()
        # Zero out batch indices where a new episode is starting.
        self._probability_accumulator.assign(
            tf.where(trajectory.is_first(), tf.zeros_like(self._probability_accumulator),
                     self._probability_accumulator))
        self._count_accumulator.assign(
            tf.where(trajectory.is_first(), tf.zeros_like(self._count_accumulator),
                     self._count_accumulator))
        # Update accumulators with probability and count increments.
        self._probability_accumulator.assign_add(action_probs[..., 0, self._action_indices[-1]])
        self._count_accumulator.assign_add(tf.ones_like(self._count_accumulator))

        # Add final cumulants to buffer at the end of episodes.
        last_episode_indices = tf.squeeze(tf.where(trajectory.is_last()), axis=-1)
        for idx in last_episode_indices:
            self._buffer.add(self._probability_accumulator[idx] / self._count_accumulator[idx])

        return trajectory

    def result(self) -> tf.Tensor:
        """Return the metric value."""
        return self._buffer.mean()

    @common.function
    def reset(self) -> None:
        """Clear the buffer and reset the accumulators."""
        self._buffer.clear()
        self._probability_accumulator.assign(tf.zeros_like(self._probability_accumulator))
        self._count_accumulator.assign(tf.zeros_like(self._count_accumulator))

    @property
    def action_indices(self) -> Tuple[int, ...]:
        return self._action_indices
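A usage sketch: action_indices selects which probability to track, where any leading indices step into a tuple of per-resource-set distributions and the final index picks the action whose probability is recorded. The policy below is a placeholder.

# Hypothetical usage: track the probability of action 1 under a single
# categorical action distribution.
action_prob = ActionProbabilityMetric(
    policy=collect_policy,   # placeholder: any tf_policy.TFPolicy instance
    action_indices=(1,))
# With a tuple of distributions, e.g. action_indices=(0, 2) first selects
# distribution 0 and then tracks the probability of its action 2.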
Example 6
class EpisodicConditionalActivityTracker1D(TFStepMetric):
    """Metric tracking how often an activity occurs within time steps that satisfy a filter condition."""
    def __init__(self,
                 filter_condition: Callable,
                 activity_condition: Callable,
                 name: str,
                 prefix: str = "Metrics",
                 dtype: type = tf.int32,
                 batch_size: int = 1,
                 buffer_size: int = 10):
        """
        Custom TensorFlow metric which tracks activity in states that satisfy a certain condition,
        used for example to observe the behaviour of an agent when the buffers reach a certain level.

        :param filter_condition: The condition which determines which time steps to use for
            calculation. This defines the denominator.
        :param activity_condition: The condition which determines when the tracked activity occurs.
            This defines the numerator.
        :param name: Name of the metric as will be shown in TensorBoard.
        :param prefix: The name of the logging group to which this metric belongs.
        :param dtype: The data type of the metric and the components used for its calculation.
            Data type must be compatible with tf.reduce_sum (i.e. float or int).
        :param batch_size: The batch size of the environment.
        :param buffer_size: The length of the buffer used to store historical values of the metric.
        """
        # Initialise according to the parent class then add the inputs as attributes.
        super(EpisodicConditionalActivityTracker1D, self).__init__(name=name, prefix=prefix)
        self.filter_condition = filter_condition
        self.activity_condition = activity_condition
        self._dtype = dtype
        self._batch_size = batch_size
        # Build variables and storage to be used for storing and calculating the metrics.
        self._activity_buffer = TFDeque(buffer_size, dtype)
        self._qualifying_timesteps_buffer = TFDeque(buffer_size, dtype)
        # The activity accumulator becomes the numerator of the calculated rate/proportion.
        self._activity_accumulator = common.create_variable(
            initial_value=0, dtype=dtype, shape=(batch_size,), name='ActivityAccumulator')
        # The number of valid time steps becomes the denominator of the calculated rate/proportion.
        self._num_valid_timesteps = common.create_variable(
            initial_value=0, dtype=dtype, shape=(batch_size,), name='EpLenAccumulator')

    @common.function(autograph=True)
    def call(self, trajectory: Trajectory) -> Trajectory:
        """
        Process the experience passed in to update the metric value (or the components required to
        calculate the final value).

        :param trajectory: Experience from the agent rolling out in the environment.
        :return: The unchanged input trajectory (as per the standard use of TensorFlow Metrics).
        """
        # Build a mask which zeroes out the batch entries where a new episode is starting.
        start_of_episode_indices = tf.squeeze(tf.where(trajectory.is_first()), axis=-1)
        mask = tf.ones(shape=(self._batch_size,), dtype=self._dtype)

        for idx in start_of_episode_indices:
            mask -= tf.eye(self._batch_size, dtype=self._dtype)[idx]

        # Reset the accumulators wherever a new episode is starting.
        self._num_valid_timesteps.assign(self._num_valid_timesteps * mask)
        self._activity_accumulator.assign(self._activity_accumulator * mask)

        # Find the number of time steps satisfying the filter condition.
        # The reshape is to ensure compatibility with the variable below in the case of no batch
        # dimension.
        valid_timesteps = tf.reshape(
            tf.reduce_sum(
                tf.cast(self.filter_condition(trajectory), self._dtype),
                axis=-1),
            self._num_valid_timesteps.shape)

        # Track the number of time steps which meet the qualifying condition.
        self._num_valid_timesteps.assign_add(valid_timesteps, name="increment_valid_timesteps")

        # Update accumulator with activity counts where both the filtering and activity condition
        # are satisfied. Again the reshape is to ensure compatibility with the accumulator
        # variable in the case where there is no batch dimension.
        bool_values = tf.logical_and(self.filter_condition(trajectory),
                                     self.activity_condition(trajectory))
        to_add = tf.reshape(
            tf.reduce_sum(tf.cast(bool_values, self._dtype), axis=-1),
            self._activity_accumulator.shape)

        self._activity_accumulator.assign_add(to_add)

        # Add values to the buffers at the end of each episode by first finding where the
        # trajectories end and then using the resulting indices to read the correct accumulator
        # entries (step_type == 2 corresponds to StepType.LAST, i.e. trajectory.is_last()).
        end_of_episode_indices = tf.squeeze(tf.where(trajectory.step_type == 2), axis=-1)

        for idx in end_of_episode_indices:
            self._activity_buffer.add(self._activity_accumulator[idx])
            self._qualifying_timesteps_buffer.add(self._num_valid_timesteps[idx])

        # Return the original trajectory data as is standard for TFStepMetrics.
        return trajectory

    def result(self) -> tf.Tensor:
        """
        Calculate the value of the metric from stored components.
        :return: The calculated metric value.
        """
        return tf.reduce_mean(self._activity_buffer.data / self._qualifying_timesteps_buffer.data)

    @common.function
    def reset(self) -> None:
        """Reset the metric calculation components."""
        self._activity_buffer.clear()
        self._qualifying_timesteps_buffer.clear()
        self._activity_accumulator.assign(tf.zeros_like(self._activity_accumulator))
        self._num_valid_timesteps.assign(tf.zeros_like(self._num_valid_timesteps))
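A usage sketch with illustrative conditions: both callables receive the full trajectory and must return boolean tensors of matching shape, and a float dtype keeps the final division well-behaved.

# Hypothetical usage: among time steps where the first observation feature
# exceeds 0.8 (e.g. a nearly full buffer), track how often action 0 is taken.
idle_when_full = EpisodicConditionalActivityTracker1D(
    filter_condition=lambda traj: traj.observation[..., 0] > 0.8,
    activity_condition=lambda traj: tf.equal(traj.action, 0),
    name='idle_when_nearly_full',
    dtype=tf.float32)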