def estimate_value_from_state_ratios(
    data, target_policy, gamma, state_ratio_fn):
  """Estimates value of policy given data and state density ratios.

  Args:
    data: The experience data to base the estimate on.
    target_policy: The policy whose value to estimate.
    gamma: Discount to use in the value calculation.
    state_ratio_fn: A function taking in batches of states and returning
      estimates of the ratio d^pi(s) / d^D(s), where d^pi(s) is the discounted
      occupancy of the target policy at state s and d^D(s) is the probability
      with which state s appears in the experience data.

  Returns:
    Estimated average per-step reward of the target policy.
  """
  all_data = data.get_all()
  state_density_ratio = state_ratio_fn(all_data.state)
  policy_ratio = policy_lib.get_policy_ratio(
      data.policy, target_policy, all_data.state, all_data.action)
  state_action_density_ratio = state_density_ratio * policy_ratio
  # Multiply by discount to account for discounted behavior policy.
  weights = state_action_density_ratio * gamma ** all_data.time_step
  return np.sum(all_data.reward * weights) / np.sum(weights)
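# A minimal, self-contained sketch of the weighted estimate computed above,
# using plain numpy arrays in place of the `data`, `policy_lib`, and policy
# objects. The helper name and arrays below are illustrative only and not
# part of the library; the computation mirrors the self-normalized
# importance-weighted average returned by estimate_value_from_state_ratios.
import numpy as np


def _sketch_weighted_estimate(rewards, state_ratios, policy_ratios,
                              time_steps, gamma):
  # State density ratio times policy ratio gives the state-action density
  # ratio; the discount accounts for the time step of each sample.
  weights = state_ratios * policy_ratios * gamma ** time_steps
  # Self-normalized importance-weighted average of the rewards.
  return np.sum(rewards * weights) / np.sum(weights)


# Example on synthetic data.
_rng = np.random.RandomState(0)
print(_sketch_weighted_estimate(
    rewards=_rng.rand(100),
    state_ratios=0.5 + _rng.rand(100),
    policy_ratios=0.5 + _rng.rand(100),
    time_steps=_rng.randint(0, 50, size=100),
    gamma=0.99))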
def solve(self, data, target_policy, regularizer=1e-8):
  """Solves for density ratios and then approximates target policy value.

  Args:
    data: The transition data store to use.
    target_policy: The policy whose value we want to estimate.
    regularizer: A small constant to add to matrices before inverting them or
      to floats before taking square root.

  Returns:
    Estimated average per-step reward of the target policy.
  """
  td_residuals = np.zeros([self._dimension, self._dimension])
  total_weights = np.zeros([self._dimension])
  initial_weights = np.zeros([self._dimension])
  for transition in data.iterate_once():
    nu_index = self._get_index(transition.state, transition.action)
    weight = self._gamma ** transition.time_step
    td_residuals[nu_index, nu_index] += weight
    total_weights[nu_index] += weight

    next_probs = target_policy.get_probabilities(transition.next_state)
    if not self._solve_for_state_action_ratio:
      policy_ratio = policy_lib.get_policy_ratio(
          data.policy, target_policy, transition.state, transition.action)

    # Need to weight next nu by importance weight.
    next_weight = (weight if self._solve_for_state_action_ratio else
                   policy_ratio * weight)
    for next_action, next_prob in enumerate(next_probs):
      next_nu_index = self._get_index(transition.next_state, next_action)
      td_residuals[next_nu_index, nu_index] += (
          -next_prob * self._gamma * next_weight)

    initial_probs = target_policy.get_probabilities(transition.initial_state)
    for initial_action, initial_prob in enumerate(initial_probs):
      initial_nu_index = self._get_index(transition.initial_state,
                                         initial_action)
      initial_weights[initial_nu_index] += weight * initial_prob

  td_residuals /= np.sqrt(regularizer + total_weights)[None, :]
  td_errors = np.dot(td_residuals, td_residuals.T)
  self._nu = np.linalg.solve(
      td_errors + regularizer * np.eye(self._dimension),
      (1 - self._gamma) * initial_weights)
  self._zeta = np.dot(
      self._nu, td_residuals) / np.sqrt(regularizer + total_weights)
  return self.estimate_average_reward(data, target_policy)
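# Toy sketch of the closed-form step at the end of solve() above: once the
# (column-normalized) TD-residual matrix A and the initial-state weights b
# have been accumulated, nu is obtained from the regularized linear system
# (A A^T + reg * I) nu = (1 - gamma) * b, and zeta is read back through A.
# All shapes and values here are made up purely for illustration.
import numpy as np

dimension, gamma, reg = 4, 0.95, 1e-8
_rng = np.random.RandomState(1)
td_residuals = _rng.randn(dimension, dimension)  # stands in for accumulated A
total_weights = _rng.rand(dimension)             # per-index visitation weights
initial_weights = _rng.rand(dimension)           # stands in for b

# Normalize columns by the (regularized) square root of the visit weights.
td_residuals = td_residuals / np.sqrt(reg + total_weights)[None, :]
td_errors = np.dot(td_residuals, td_residuals.T)
nu = np.linalg.solve(td_errors + reg * np.eye(dimension),
                     (1 - gamma) * initial_weights)
zeta = np.dot(nu, td_residuals) / np.sqrt(reg + total_weights)
print(nu, zeta)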
def solve(self, data, target_policy, baseline_policy=None):
  """Solves for density ratios and then approximates target policy value.

  Args:
    data: The transition data store to use.
    target_policy: The policy whose value we want to estimate.
    baseline_policy: The policy used to collect the data. If None,
      we default to data.policy.

  Returns:
    Estimated average per-step reward of the target policy.

  Raises:
    ValueError: If NaNs encountered in policy ratio computation.
  """
  if baseline_policy is None:
    baseline_policy = data.policy

  value_estimates = []
  for step in range(self._parameters.num_steps):
    batch = data.sample_batch(self._parameters.batch_size)
    feed_dict = {
        self._state: batch.state,
        self._action: batch.action,
        self._next_state: batch.next_state,
        self._initial_state: batch.initial_state,
        self._weights: self._parameters.gamma ** batch.time_step,
    }
    # On-policy next action and initial action.
    feed_dict[self._next_action] = target_policy.sample_action(
        batch.next_state)
    feed_dict[self._initial_action] = target_policy.sample_action(
        batch.initial_state)
    if self._average_next_nu:
      next_probabilities = target_policy.get_probabilities(batch.next_state)
      feed_dict[self._target_policy_next_probs] = next_probabilities

    policy_ratio = policy_lib.get_policy_ratio(
        baseline_policy, target_policy, batch.state, batch.action)
    if np.any(np.isnan(policy_ratio)):
      raise ValueError('NaNs encountered in policy ratio: %s.' % policy_ratio)
    feed_dict[self._policy_ratio] = policy_ratio

    self._session.run(self._train_op, feed_dict=feed_dict)

    if step % self._parameters.log_every == 0:
      debug = self._session.run(self._debug, feed_dict=feed_dict)
      tf.logging.info('At step %d' % step)
      tf.logging.info('Debug: %s' % debug)
      value_estimate = self.estimate_average_reward(data, target_policy)
      tf.logging.info('Estimated value: %s' % value_estimate)
      value_estimates.append(value_estimate)
      tf.logging.info(
          'Estimated smoothed value: %s' %
          np.mean(value_estimates[-self._parameters.smooth_over:]))

      if self._parameters.summary_writer:
        summary = tf.Summary(value=[
            tf.Summary.Value(
                tag='%sdebug' % self._parameters.summary_prefix,
                simple_value=debug),
            tf.Summary.Value(
                tag='%svalue_estimate' % self._parameters.summary_prefix,
                simple_value=value_estimate)
        ])
        self._parameters.summary_writer.add_summary(summary, step)

  value_estimate = self.estimate_average_reward(data, target_policy)
  tf.logging.info('Estimated value: %s' % value_estimate)
  value_estimates.append(value_estimate)
  tf.logging.info(
      'Estimated smoothed value: %s' %
      np.mean(value_estimates[-self._parameters.smooth_over:]))

  # Return estimate that is smoothed over last few iterates.
  return np.mean(value_estimates[-self._parameters.smooth_over:])
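# Minimal illustration of the smoothing applied to the returned estimate
# above: the per-log-step value estimates from the neural solver are noisy,
# so the final answer averages the last `smooth_over` entries. The numbers
# below are made up; only the averaging pattern matches the code above.
import numpy as np


def _smoothed_estimate(value_estimates, smooth_over):
  return np.mean(value_estimates[-smooth_over:])


noisy_estimates = [1.31, 0.92, 1.18, 1.05, 1.11]
print(_smoothed_estimate(noisy_estimates, smooth_over=3))  # mean of last 3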