Example #1
import numpy as np

# `policy_lib` (which provides `get_policy_ratio`) is imported from the
# project's policy utilities; the exact import path depends on the repository
# layout.


def estimate_value_from_state_ratios(
    data,
    target_policy,
    gamma,
    state_ratio_fn):
  """Estimates value of policy given data and state density ratios.

  Args:
    data: The experience data to base the estimate on.
    target_policy: The policy whose value to estimate.
    gamma: Discount to use in the value calculation.
    state_ratio_fn: A function taking in batches of states and returning
      estimates of the ratio d^pi(s) / d^D(s), where d^pi(s) is the discounted
      occupancy of the target policy at state s and d^D(s) is the probability
      with which state s appears in the experience data.

  Returns:
    Estimated average per-step reward of the target policy.
  """
  all_data = data.get_all()
  state_density_ratio = state_ratio_fn(all_data.state)
  policy_ratio = policy_lib.get_policy_ratio(
      data.policy, target_policy,
      all_data.state, all_data.action)
  state_action_density_ratio = state_density_ratio * policy_ratio
  # Multiply by discount to account for discounted behavior policy.
  weights = state_action_density_ratio * gamma ** all_data.time_step
  return np.sum(all_data.reward * weights) / np.sum(weights)
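
For context, the estimator above reduces to a self-normalized importance-weighted
average of rewards. The minimal sketch below reproduces just that arithmetic on
synthetic numpy arrays; the rewards, time steps, and ratios are random
placeholders, not outputs of a real state_ratio_fn or policy_lib.get_policy_ratio.

import numpy as np

rng = np.random.default_rng(0)
num_samples = 1000
gamma = 0.99
rewards = rng.uniform(0.0, 1.0, size=num_samples)
time_steps = rng.integers(0, 100, size=num_samples)
state_density_ratio = rng.uniform(0.5, 2.0, size=num_samples)  # stand-in for state_ratio_fn output
policy_ratio = rng.uniform(0.5, 2.0, size=num_samples)  # stand-in for get_policy_ratio output

# Self-normalized importance-weighted average, as in the function above.
weights = state_density_ratio * policy_ratio * gamma ** time_steps
print(np.sum(rewards * weights) / np.sum(weights))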
Example #2
    def solve(self,
              data,
              target_policy,
              regularizer=1e-8):
        """Solves for density ratios and then approximates target policy value.

        Args:
          data: The transition data store to use.
          target_policy: The policy whose value we want to estimate.
          regularizer: A small constant to add to matrices before inverting
            them, or to floats before taking the square root.

        Returns:
          Estimated average per-step reward of the target policy.
        """
        td_residuals = np.zeros([self._dimension, self._dimension])
        total_weights = np.zeros([self._dimension])
        initial_weights = np.zeros([self._dimension])
        for transition in data.iterate_once():
            nu_index = self._get_index(transition.state, transition.action)
            weight = self._gamma ** transition.time_step

            td_residuals[nu_index, nu_index] += weight
            total_weights[nu_index] += weight

            next_probs = target_policy.get_probabilities(transition.next_state)
            if not self._solve_for_state_action_ratio:
                policy_ratio = policy_lib.get_policy_ratio(data.policy, target_policy,
                                                           transition.state,
                                                           transition.action)

            # When solving for state (rather than state-action) ratios, the
            # next-state nu terms must additionally be weighted by the policy
            # importance ratio.
            next_weight = (weight if self._solve_for_state_action_ratio else
                           policy_ratio * weight)
            for next_action, next_prob in enumerate(next_probs):
                next_nu_index = self._get_index(
                    transition.next_state, next_action)
                td_residuals[next_nu_index, nu_index] += (
                    -next_prob * self._gamma * next_weight)

            initial_probs = target_policy.get_probabilities(
                transition.initial_state)
            for initial_action, initial_prob in enumerate(initial_probs):
                initial_nu_index = self._get_index(transition.initial_state,
                                                   initial_action)
                initial_weights[initial_nu_index] += weight * initial_prob

        td_residuals /= np.sqrt(regularizer + total_weights)[None, :]
        td_errors = np.dot(td_residuals, td_residuals.T)
        self._nu = np.linalg.solve(
            td_errors + regularizer * np.eye(self._dimension),
            (1 - self._gamma) * initial_weights)
        self._zeta = np.dot(self._nu,
                            td_residuals) / np.sqrt(regularizer + total_weights)
        return self.estimate_average_reward(data, target_policy)
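
The closing lines of this method are a small regularized least-squares solve in
closed form. The shape-level sketch below isolates that linear algebra with
made-up accumulators; the dimension, residuals, and weights are random
placeholders rather than quantities accumulated from real transition data.

import numpy as np

dimension = 4
gamma = 0.95
regularizer = 1e-8
rng = np.random.default_rng(0)

# Placeholder accumulators; in the method above these are filled from data.
td_residuals = rng.normal(size=(dimension, dimension))
total_weights = rng.uniform(1.0, 10.0, size=dimension)
initial_weights = rng.uniform(0.0, 1.0, size=dimension)

# Mirror of the solve: normalize, form the Gram matrix, solve for nu,
# then recover zeta.
td_residuals = td_residuals / np.sqrt(regularizer + total_weights)[None, :]
td_errors = np.dot(td_residuals, td_residuals.T)
nu = np.linalg.solve(td_errors + regularizer * np.eye(dimension),
                     (1 - gamma) * initial_weights)
zeta = np.dot(nu, td_residuals) / np.sqrt(regularizer + total_weights)
print(nu, zeta)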
Example #3
    def solve(self, data, target_policy, baseline_policy=None):
        """Solves for density ratios and then approximates target policy value.

        Args:
          data: The transition data store to use.
          target_policy: The policy whose value we want to estimate.
          baseline_policy: The policy used to collect the data. If None,
            we default to data.policy.

        Returns:
          Estimated average per-step reward of the target policy.

        Raises:
          ValueError: If NaNs are encountered in the policy ratio computation.
        """
        if baseline_policy is None:
            baseline_policy = data.policy

        value_estimates = []
        for step in range(self._parameters.num_steps):
            batch = data.sample_batch(self._parameters.batch_size)
            feed_dict = {
                self._state: batch.state,
                self._action: batch.action,
                self._next_state: batch.next_state,
                self._initial_state: batch.initial_state,
                self._weights: self._parameters.gamma**batch.time_step,
            }

            # On-policy next action and initial action.
            feed_dict[self._next_action] = target_policy.sample_action(
                batch.next_state)
            feed_dict[self._initial_action] = target_policy.sample_action(
                batch.initial_state)

            if self._average_next_nu:
                next_probabilities = target_policy.get_probabilities(
                    batch.next_state)
                feed_dict[self._target_policy_next_probs] = next_probabilities

            policy_ratio = policy_lib.get_policy_ratio(baseline_policy,
                                                       target_policy,
                                                       batch.state,
                                                       batch.action)

            if np.any(np.isnan(policy_ratio)):
                raise ValueError('NaNs encountered in policy ratio: %s.' %
                                 policy_ratio)
            feed_dict[self._policy_ratio] = policy_ratio

            self._session.run(self._train_op, feed_dict=feed_dict)

            if step % self._parameters.log_every == 0:
                debug = self._session.run(self._debug, feed_dict=feed_dict)
                tf.logging.info('At step %d' % step)
                tf.logging.info('Debug: %s' % debug)
                value_estimate = self.estimate_average_reward(
                    data, target_policy)
                tf.logging.info('Estimated value: %s' % value_estimate)
                value_estimates.append(value_estimate)
                tf.logging.info(
                    'Estimated smoothed value: %s' %
                    np.mean(value_estimates[-self._parameters.smooth_over:]))

                if self._parameters.summary_writer:
                    summary = tf.Summary(value=[
                        tf.Summary.Value(tag='%sdebug' %
                                         self._parameters.summary_prefix,
                                         simple_value=debug),
                        tf.Summary.Value(tag='%svalue_estimate' %
                                         self._parameters.summary_prefix,
                                         simple_value=value_estimate)
                    ])
                    self._parameters.summary_writer.add_summary(summary, step)

        value_estimate = self.estimate_average_reward(data, target_policy)
        tf.logging.info('Estimated value: %s' % value_estimate)
        value_estimates.append(value_estimate)
        tf.logging.info(
            'Estimated smoothed value: %s' %
            np.mean(value_estimates[-self._parameters.smooth_over:]))

        # Return estimate that is smoothed over last few iterates.
        return np.mean(value_estimates[-self._parameters.smooth_over:])
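
A note on the smoothing used both for logging and for the returned value: the
slice value_estimates[-smooth_over:] is a trailing window, and Python's negative
slicing keeps it well defined even before smooth_over estimates have been
logged. A tiny illustration with made-up estimates:

import numpy as np

smooth_over = 4
value_estimates = [1.10, 0.92]  # fewer than smooth_over entries so far
print(np.mean(value_estimates[-smooth_over:]))  # mean of all available estimates

value_estimates += [1.21, 1.04, 0.97]  # more logged estimates arrive
print(np.mean(value_estimates[-smooth_over:]))  # mean of only the last 4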