def get_fullbatch_average(dataset: OffpolicyDataset,
                          limit: Optional[int] = None,
                          by_steps: bool = True,
                          truncate_episode_at: Optional[int] = None,
                          reward_fn: Callable = None,
                          weight_fn: Callable = None,
                          gamma: Union[float, tf.Tensor] = 1.0
                         ) -> Union[float, tf.Tensor]:
  """Computes average reward over full dataset.

  Args:
    dataset: The dataset to sample experience from.
    limit: If specified, the maximum number of steps/episodes to take from the
      dataset.
    by_steps: Whether to sample batches of steps (default) or episodes.
    truncate_episode_at: If sampling by episodes, where to truncate episodes
      from the environment, if at all.
    reward_fn: A function that takes in an EnvStep and returns the reward for
      that step. If not specified, defaults to just EnvStep.reward. When
      sampling by episode, valid_steps is also passed into reward_fn.
    weight_fn: A function that takes in an EnvStep and returns a weight for
      that step. If not specified, defaults to gamma ** step_num. When sampling
      by episode, valid_steps is also passed into weight_fn.
    gamma: The discount factor to use for the default reward/weight functions.

  Returns:
    An estimate of the average reward.
  """
  if reward_fn is None:
    if by_steps:
      reward_fn = _default_by_steps_reward_fn
    else:
      reward_fn = lambda *args: _default_by_episodes_reward_fn(
          *args, gamma=gamma)

  if weight_fn is None:
    if by_steps:
      weight_fn = lambda *args: _default_by_steps_weight_fn(*args, gamma=gamma)
    else:
      weight_fn = _default_by_episodes_weight_fn

  if by_steps:
    steps = dataset.get_all_steps(limit=limit)
    rewards = reward_fn(steps)
    weights = weight_fn(steps)
  else:
    episodes, valid_steps = dataset.get_all_episodes(
        truncate_episode_at=truncate_episode_at, limit=limit)
    rewards = reward_fn(episodes, valid_steps)
    weights = weight_fn(episodes, valid_steps)

  rewards = common_lib.reverse_broadcast(rewards, weights)
  weights = common_lib.reverse_broadcast(weights, rewards)
  if tf.rank(weights) < 2:
    return (tf.reduce_sum(rewards * weights, axis=0) /
            tf.reduce_sum(weights, axis=0))
  return (tf.linalg.matmul(weights, rewards) /
          tf.reduce_sum(tf.math.reduce_mean(weights, axis=0)))
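# The `_default_by_steps_*` helpers referenced above are not shown in this
# excerpt. A minimal sketch consistent with the docstring ("defaults to just
# EnvStep.reward" and "defaults to gamma ** step_num") would be:
#
#   def _default_by_steps_reward_fn(env_step):
#     return env_step.reward
#
#   def _default_by_steps_weight_fn(env_step, gamma):
#     return tf.pow(gamma, tf.cast(env_step.step_num, tf.float32))
#
# These bodies are an assumption based on the docstring, not necessarily the
# library's actual implementation. A typical call then looks roughly like
#
#   avg_reward = get_fullbatch_average(dataset, by_steps=True, gamma=0.99)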
def _eval_constraint_and_regs(self, dataset: dataset_lib.OffpolicyDataset,
                              target_policy: tf_policy.TFPolicy):
  """Gets the residual term and the primal and dual regularizers during eval.

  Args:
    dataset: The dataset to sample experience from.
    target_policy: The policy whose value we want to estimate.

  Returns:
    The residual term (weighted by zeta), the primal and dual reg values, and
    the mean negative KL.
  """
  experience = dataset.get_all_steps(num_steps=2)
  env_step = tf.nest.map_structure(lambda t: t[:, 0, ...], experience)
  next_env_step = tf.nest.map_structure(lambda t: t[:, 1, ...], experience)

  nu_values, _, _ = self._sample_value(self._nu_network, env_step)
  next_nu_values, _, _ = self._sample_average_value(
      self._nu_network, next_env_step, target_policy)
  zeta_values, neg_kl, _ = self._sample_value(self._zeta_network, env_step)
  discounts = self._gamma * env_step.discount
  bellman_residuals = (
      common_lib.reverse_broadcast(discounts, nu_values) * next_nu_values -
      nu_values - self._norm_regularizer * self._lam)

  # Always include the reward during eval.
  bellman_residuals += self._reward_fn(env_step)

  constraint = tf.reduce_mean(zeta_values * bellman_residuals)
  f_nu = tf.reduce_mean(self._f_fn(nu_values))
  f_zeta = tf.reduce_mean(self._f_fn(zeta_values))

  return constraint, f_nu, f_zeta, tf.reduce_mean(neg_kl)
def train_loss(self, initial_env_step, env_step, next_env_step, policy):
  nu_values = self._get_value(self._nu_network, env_step)
  initial_nu_values = self._get_average_value(self._nu_network,
                                              initial_env_step, policy)
  next_nu_values = self._get_average_value(self._nu_network, next_env_step,
                                           policy)
  zeta_values = self._get_value(self._zeta_network, env_step)

  discounts = self._gamma * next_env_step.discount
  policy_ratio = 1.0
  if not self._solve_for_state_action_ratio:
    tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
    policy_log_probabilities = policy.distribution(
        tfagents_step).action.log_prob(env_step.action)
    policy_ratio = tf.exp(policy_log_probabilities -
                          env_step.get_log_probability())

  bellman_residuals = (
      nu_values -
      common_lib.reverse_broadcast(discounts * policy_ratio, nu_values) *
      next_nu_values)

  zeta_loss = self._fstar_fn(zeta_values) - bellman_residuals * zeta_values
  if self._primal_form:
    nu_loss = (self._f_fn(bellman_residuals) -
               (1 - self._gamma) * initial_nu_values)
  else:
    nu_loss = -zeta_loss - (1 - self._gamma) * initial_nu_values

  return nu_loss, zeta_loss
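# Reading of the loss above: with delta = nu(s, a) - gamma * (pi/mu ratio) *
# E_{a' ~ pi}[nu(s', a')], the dual variable zeta is trained to maximize
# delta * zeta - f*(zeta), while nu minimizes either f(delta) (primal form) or
# the negated zeta objective, in both cases minus (1 - gamma) * E[nu(s0, a0)].
# Note that this residual contains no reward term.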
def _get_average_value(self, network, env_step, policy):
  if self._solve_for_state_action_ratio:
    tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
    if self._categorical_action and self._num_samples is None:
      action_weights = policy.distribution(
          tfagents_step).action.probs_parameter()
      action_dtype = self._dataset_spec.action.dtype
      batch_size = tf.shape(action_weights)[0]
      num_actions = tf.shape(action_weights)[-1]
      actions = (  # Broadcast actions.
          tf.ones([batch_size, 1], dtype=action_dtype) *
          tf.range(num_actions, dtype=action_dtype)[None, :])
    else:
      batch_size = tf.shape(env_step.observation)[0]
      num_actions = self._num_samples
      action_weights = tf.ones([batch_size, num_actions]) / num_actions
      actions = tf.stack(
          [policy.action(tfagents_step).action for _ in range(num_actions)],
          axis=1)

    flat_actions = tf.reshape(actions, [batch_size * num_actions] +
                              actions.shape[2:].as_list())
    flat_observations = tf.reshape(
        tf.tile(env_step.observation[:, None, ...],
                [1, num_actions] + [1] * len(env_step.observation.shape[1:])),
        [batch_size * num_actions] + env_step.observation.shape[1:].as_list())

    flat_values, _ = network((flat_observations, flat_actions))
    values = tf.reshape(flat_values, [batch_size, num_actions] +
                        flat_values.shape[1:].as_list())
    return tf.reduce_sum(
        values * common_lib.reverse_broadcast(action_weights, values), axis=1)
  else:
    return network((env_step.observation,))[0]
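# Shape sketch (for illustration only): with a categorical policy over
# num_actions actions and observations of shape [batch_size, obs_dim], the
# branch above evaluates the network on all batch_size * num_actions
# (observation, action) pairs, reshapes back to [batch_size, num_actions, ...],
# and returns the probability-weighted sum over the action axis, i.e. an
# estimate of E_{a ~ pi(.|s)}[nu(s, a)] for each state in the batch.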
def _get_nu_loss(self, initial_env_step, env_step, next_env_step, policy):
  """Gets nu_loss for both upper and lower confidence intervals."""
  nu_index = self._get_index(env_step.observation, env_step.action)
  nu_values = tf.gather(self._nu, nu_index)

  initial_nu_values = self._get_average_value(self._nu, initial_env_step,
                                              policy)
  next_nu_values = self._get_average_value(self._nu, next_env_step, policy)

  rewards = self._reward_fn(env_step)
  discounts = self._gamma * env_step.discount
  policy_ratio = 1.0
  if not self._solve_for_state_action_ratio:
    tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
    policy_log_probabilities = policy.distribution(
        tfagents_step).action.log_prob(env_step.action)
    policy_ratio = tf.exp(policy_log_probabilities -
                          env_step.get_log_probability())

  bellman_residuals = (
      -nu_values +
      common_lib.reverse_broadcast(rewards, tf.convert_to_tensor(nu_values)) +
      common_lib.reverse_broadcast(discounts * policy_ratio,
                                   tf.convert_to_tensor(nu_values)) *
      next_nu_values)
  bellman_residuals *= self._algae_alpha_sign

  init_nu_loss = ((1 - self._gamma) * initial_nu_values *
                  self._algae_alpha_sign)
  nu_loss = (
      tf.math.abs(self._algae_alpha) *
      tf.math.square(bellman_residuals / tf.math.abs(self._algae_alpha)) / 2.0
      + init_nu_loss)

  if self._weight_by_gamma:
    weights = tf.expand_dims(
        self._gamma**tf.cast(env_step.step_num, tf.float32), axis=1)
    weights /= 1e-6 + tf.reduce_mean(weights)
    nu_loss *= weights

  return nu_loss
def train_loss(self, initial_env_step, env_step, next_env_step, policy):
  nu_values = self._get_value(self._nu_network, env_step)
  initial_nu_values = self._get_average_value(self._nu_network,
                                              initial_env_step, policy)
  next_nu_values = self._get_average_value(self._nu_network, next_env_step,
                                           policy)
  zeta_values = self._get_value(self._zeta_network, env_step)

  rewards = self._reward_fn(env_step)
  discounts = self._gamma * env_step.discount
  policy_ratio = 1.0
  if not self._solve_for_state_action_ratio:
    tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
    policy_log_probabilities = policy.distribution(
        tfagents_step).action.log_prob(env_step.action)
    policy_ratio = tf.exp(policy_log_probabilities -
                          env_step.get_log_probability())

  bellman_residuals = (
      -nu_values + common_lib.reverse_broadcast(rewards, nu_values) +
      common_lib.reverse_broadcast(discounts * policy_ratio, nu_values) *
      next_nu_values)
  bellman_residuals *= self._algae_alpha_sign

  zeta_loss = (
      self._algae_alpha_abs * self._fstar_fn(zeta_values) -
      bellman_residuals * zeta_values)
  init_nu_loss = ((1 - self._gamma) * initial_nu_values *
                  self._algae_alpha_sign)
  if self._primal_form:
    nu_loss = (
        self._algae_alpha_abs *
        self._f_fn(bellman_residuals / self._algae_alpha_abs) + init_nu_loss)
  else:
    nu_loss = -zeta_loss + init_nu_loss

  if self._weight_by_gamma:
    weights = self._gamma**tf.cast(env_step.step_num, tf.float32)[:, None]
    weights /= 1e-6 + tf.reduce_mean(weights)
    nu_loss *= weights
    zeta_loss *= weights

  return nu_loss, zeta_loss
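# Unlike the residual in the earlier train_loss, this one includes the reward:
# delta = -nu(s, a) + r(s, a) + gamma * (pi/mu ratio) * E_{a' ~ pi}[nu(s', a')],
# scaled by sign(alpha). zeta is trained to maximize delta * zeta -
# |alpha| * f*(zeta); nu minimizes either |alpha| * f(delta / |alpha|) (primal
# form) or the negated zeta objective, plus (1 - gamma) * E[nu(s0, a0)] *
# sign(alpha). The optional gamma weighting reweights samples by a normalized
# gamma ** step_num.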
def weight_fn(env_step):
  zeta = self._get_value(self._zeta_network, env_step)
  policy_ratio = 1.0
  if not self._solve_for_state_action_ratio:
    tfagents_timestep = dataset_lib.convert_to_tfagents_timestep(env_step)
    target_log_probabilities = target_policy.distribution(
        tfagents_timestep).action.log_prob(env_step.action)
    policy_ratio = tf.exp(target_log_probabilities -
                          env_step.get_log_probability())
  return zeta * common_lib.reverse_broadcast(policy_ratio, zeta)
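# This closure produces per-step importance weights zeta(s, a) * (pi/mu ratio).
# Presumably it is passed as the weight_fn argument to an averaging routine
# such as get_fullbatch_average or get_minibatch_average, so the resulting
# estimate is a zeta-weighted average of rewards.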
def train_loss(self, initial_env_step, env_step, next_env_step, policy):
  nu_values, _, eps = self._sample_value(self._nu_network, env_step)
  initial_nu_values, _, _ = self._sample_average_value(
      self._nu_network, initial_env_step, policy)
  next_nu_values, _, _ = self._sample_average_value(self._nu_network,
                                                    next_env_step, policy)

  zeta_values, zeta_neg_kl, _ = self._sample_value(self._zeta_network,
                                                   env_step, eps)

  discounts = self._gamma * env_step.discount
  policy_ratio = 1.0
  if not self._solve_for_state_action_ratio:
    tfagents_step = dataset_lib.convert_to_tfagents_timestep(env_step)
    policy_log_probabilities = policy.distribution(
        tfagents_step).action.log_prob(env_step.action)
    policy_ratio = tf.exp(policy_log_probabilities -
                          env_step.get_log_probability())

  bellman_residuals = (
      common_lib.reverse_broadcast(discounts * policy_ratio, nu_values) *
      next_nu_values - nu_values - self._norm_regularizer * self._lam)
  if not self._zero_reward:
    bellman_residuals += policy_ratio * self._reward_fn(env_step)

  zeta_loss = -zeta_values * bellman_residuals
  nu_loss = (1 - self._gamma) * initial_nu_values
  lam_loss = self._norm_regularizer * self._lam
  if self._primal_form:
    nu_loss += self._fstar_fn(bellman_residuals)
    lam_loss = lam_loss + self._fstar_fn(bellman_residuals)
  else:
    nu_loss += zeta_values * bellman_residuals
    lam_loss = lam_loss - self._norm_regularizer * zeta_values * self._lam

  nu_loss += self._primal_regularizer * self._f_fn(nu_values)
  zeta_loss += self._dual_regularizer * self._f_fn(zeta_values)
  zeta_loss -= self._kl_regularizer * tf.reduce_mean(zeta_neg_kl)

  if self._weight_by_gamma:
    weights = self._gamma**tf.cast(env_step.step_num, tf.float32)[:, None]
    weights /= 1e-6 + tf.reduce_mean(weights)
    nu_loss *= weights
    zeta_loss *= weights

  return nu_loss, zeta_loss, lam_loss
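# This sampled-value variant returns three losses: nu_loss, zeta_loss, and a
# lam_loss for the norm-regularization multiplier lambda. The nu sample yields
# an extra output `eps` that is reused when sampling zeta (presumably shared
# sampling noise), and the negative-KL term from the zeta sample is folded
# into zeta_loss through the KL regularizer; the primal and dual regularizers
# add f(nu) and f(zeta) penalties respectively.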
def get_minibatch_average(dataset: Dataset,
                          batch_size: int,
                          num_batches: int = 1,
                          by_steps: bool = True,
                          truncate_episode_at: Optional[int] = None,
                          reward_fn: Callable = None,
                          weight_fn: Callable = None,
                          gamma: Union[float, tf.Tensor] = 1.0
                         ) -> Union[float, tf.Tensor]:
  """Computes average reward via randomly sampled mini-batches.

  Samples steps or episodes from the dataset and computes average reward.

  Args:
    dataset: The dataset to sample experience from.
    batch_size: The number of steps or episodes to sample per batch.
    num_batches: The number of batches to use for estimation.
    by_steps: Whether to sample batches of steps (default) or episodes.
    truncate_episode_at: If sampling by episodes, where to truncate episodes
      from the environment, if at all.
    reward_fn: A function that takes in an EnvStep and returns the reward for
      that step. If not specified, defaults to just EnvStep.reward. When
      sampling by episode, valid_steps is also passed into reward_fn.
    weight_fn: A function that takes in an EnvStep and returns a weight for
      that step. If not specified, defaults to gamma ** step_num. When sampling
      by episode, valid_steps is also passed into weight_fn.
    gamma: The discount factor to use for the default reward/weight functions.

  Returns:
    An estimate of the average reward.
  """
  if reward_fn is None:
    if by_steps:
      reward_fn = _default_by_steps_reward_fn
    else:
      reward_fn = lambda *args: _default_by_episodes_reward_fn(
          *args, gamma=gamma)

  if weight_fn is None:
    if by_steps:
      weight_fn = lambda *args: _default_by_steps_weight_fn(*args, gamma=gamma)
    else:
      weight_fn = _default_by_episodes_weight_fn

  total_reward = 0.
  total_weight = 0.
  for _ in range(num_batches):
    if by_steps:
      if isinstance(dataset, OnpolicyDataset):
        steps = dataset.get_step(num_steps=batch_size)
      else:
        steps = dataset.get_step(batch_size)
      rewards = reward_fn(steps)
      weights = weight_fn(steps)
    else:
      episodes, valid_steps = dataset.get_episode(
          batch_size, truncate_episode_at=truncate_episode_at)
      rewards = reward_fn(episodes, valid_steps)
      weights = weight_fn(episodes, valid_steps)

    rewards = common_lib.reverse_broadcast(rewards, weights)
    weights = common_lib.reverse_broadcast(weights, rewards)
    total_reward += tf.reduce_sum(rewards * weights, axis=0)
    total_weight += tf.reduce_sum(weights, axis=0)

  return total_reward / total_weight
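# Example (sketch): the dataset construction below is hypothetical; only the
# call signature of get_minibatch_average is taken from this file.
#
#   dataset = ...  # an OffpolicyDataset or OnpolicyDataset instance
#   estimate = get_minibatch_average(
#       dataset,
#       batch_size=256,
#       num_batches=10,
#       by_steps=True,
#       gamma=0.99)
#
# With the default reward/weight functions this is a discount-weighted average
# of per-step rewards over the sampled steps, i.e.
# sum_t gamma**step_num_t * r_t / sum_t gamma**step_num_t.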