def _create_variables(self): """Creates the variables needed for EMATensorNormalizer.""" self._mean_moving_avg = nest.map_structure( lambda spec: create_counter('mean', 0, spec.shape, tf.float32), self._tensor_spec) self._var_moving_avg = nest.map_structure( lambda spec: create_counter('var', 1, spec.shape, tf.float32), self._tensor_spec)
def _create_variables(self): """Uses self._scope and creates all variables needed for the normalizer.""" self._count = nest.map_structure( lambda spec: create_counter('count', 1e-8, spec.shape, tf.float32), self._tensor_spec) self._mean_sum = nest.map_structure( lambda spec: create_counter('mean_sum', 0, spec.shape, tf.float32), self._tensor_spec) self._var_sum = nest.map_structure( lambda spec: create_counter('var_sum', 0, spec.shape, tf.float32), self._tensor_spec)
def __init__(self, name='NumberOfEpisodes', dtype=tf.int64):
  super(NumberOfEpisodes, self).__init__(name=name)
  self.dtype = dtype
  self.number_episodes = common.create_counter(
      initial_value=0, dtype=self.dtype, shape=(), name='number_episodes')
def __init__(self, name='EnvironmentSteps', dtype=tf.int64):
  super(EnvironmentSteps, self).__init__(name=name)
  self.dtype = dtype
  self.environment_steps = common.create_counter(
      initial_value=0, dtype=self.dtype, shape=(), name='environment_steps')
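# A hedged sketch (not the metrics' actual call() bodies) of how the two
# counters above are typically driven from a batched Trajectory.
# `trajectory.is_boundary()` and `trajectory.is_last()` are real
# tf_agents Trajectory methods, but this wiring is illustrative.
import tensorflow as tf


def count_transitions(environment_steps, number_episodes, trajectory):
  # Boundary transitions are environment resets, not real steps.
  steps = tf.reduce_sum(
      tf.cast(tf.logical_not(trajectory.is_boundary()), tf.int64))
  environment_steps.assign_add(steps)
  # Every last step closes out one episode.
  episodes = tf.reduce_sum(tf.cast(trajectory.is_last(), tf.int64))
  number_episodes.assign_add(episodes)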
def __init__(self,
             time_step_spec,
             action_spec,
             critic_network,
             actor_network,
             actor_optimizer,
             critic_optimizer,
             alpha_optimizer,
             actor_policy_ctor=actor_policy.ActorPolicy,
             squash_actions=True,
             target_update_tau=1.0,
             target_update_period=1,
             td_errors_loss_fn=tf.math.squared_difference,
             gamma=1.0,
             reward_scale_factor=1.0,
             initial_log_alpha=0.0,
             target_entropy=None,
             gradient_clipping=None,
             debug_summaries=False,
             summarize_grads_and_vars=False,
             name=None):
  """Creates a SAC Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    critic_network: A function critic_network((observations, actions)) that
      returns the q_values for each observation and action.
    actor_network: A function actor_network(observation, action_spec) that
      returns an action distribution.
    actor_optimizer: The optimizer to use for the actor network.
    critic_optimizer: The default optimizer to use for the critic network.
    alpha_optimizer: The default optimizer to use for the alpha variable.
    actor_policy_ctor: The policy class to use.
    squash_actions: Whether or not to use tanh to squash actions between
      -1 and 1.
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    td_errors_loss_fn: A function for computing the elementwise TD errors
      loss.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    initial_log_alpha: Initial value for log_alpha.
    target_entropy: The target average policy entropy, for updating alpha.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable
      summaries will be written during training.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.
  """
  tf.Module.__init__(self, name=name)

  self._critic_network1 = critic_network
  self._critic_network2 = critic_network.copy(name='CriticNetwork2')
  self._target_critic_network1 = critic_network.copy(
      name='TargetCriticNetwork1')
  self._target_critic_network2 = critic_network.copy(
      name='TargetCriticNetwork2')
  self._actor_network = actor_network

  policy = actor_policy_ctor(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=self._actor_network)

  self._log_alpha = common_utils.create_counter(
      'initial_log_alpha',
      initial_value=initial_log_alpha,
      dtype=tf.float32,
      trainable=True)

  # If target_entropy was not passed, set it to the negative of the total
  # number of action dimensions.
  if target_entropy is None:
    flat_action_spec = tf.nest.flatten(action_spec)
    target_entropy = -np.sum([
        np.product(single_spec.shape.as_list())
        for single_spec in flat_action_spec
    ])

  self._squash_actions = squash_actions
  self._target_update_tau = target_update_tau
  self._target_update_period = target_update_period
  self._actor_optimizer = actor_optimizer
  self._critic_optimizer = critic_optimizer
  self._alpha_optimizer = alpha_optimizer
  self._td_errors_loss_fn = td_errors_loss_fn
  self._gamma = gamma
  self._reward_scale_factor = reward_scale_factor
  self._target_entropy = target_entropy
  self._gradient_clipping = gradient_clipping
  self._debug_summaries = debug_summaries
  self._summarize_grads_and_vars = summarize_grads_and_vars

  super(SacAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy=policy,
      collect_policy=policy,
      train_sequence_length=2,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars)
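# An illustrative computation (not code from the agent) of the default
# target_entropy above: for a 6-dimensional continuous action spec, the
# heuristic resolves to -6, i.e. minus the total number of action dims.
import numpy as np
import tensorflow as tf
from tf_agents.specs import tensor_spec

action_spec = tensor_spec.BoundedTensorSpec(
    shape=(6,), dtype=tf.float32, minimum=-1.0, maximum=1.0)
flat_action_spec = tf.nest.flatten(action_spec)
target_entropy = -np.sum(
    [np.prod(spec.shape.as_list()) for spec in flat_action_spec])
assert target_entropy == -6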
def testInitialValueWithShape(self):
  counter = common.create_counter('counter', 1, shape=(2,))
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllEqual(self.evaluate(counter), [1, 1])

def testMultipleCounters(self):
  counter1 = common.create_counter('counter', 1)
  counter2 = common.create_counter('counter', 2)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual(self.evaluate(counter1), 1)
  self.assertEqual(self.evaluate(counter2), 2)

def testIncrement(self):
  counter = common.create_counter('counter', 0)
  inc_counter = counter.assign_add(1)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual(self.evaluate(inc_counter), 1)

def testInitialValue(self):
  counter = common.create_counter('counter', 1)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual(self.evaluate(counter), 1)

def testDefaults(self):
  counter = common.create_counter('counter')
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual(self.evaluate(counter), 0)
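# A minimal sketch of what `common.create_counter` would have to do to
# satisfy the tests above; the real helper lives in tf_agents.utils.common
# and may differ in detail. Note that tf.Variable uniquifies duplicate
# names, which is what testMultipleCounters relies on.
import tensorflow as tf


def create_counter(name, initial_value=0, shape=(), dtype=tf.int64,
                   trainable=False):
  """Returns a named variable filled with `initial_value`."""
  return tf.Variable(
      initial_value=tf.constant(initial_value, dtype=dtype, shape=shape),
      name=name,
      trainable=trainable)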
def __init__(self,
             time_step_spec,
             action_spec,
             optimizer=None,
             actor_net=None,
             value_net=None,
             importance_ratio_clipping=0.0,
             lambda_value=0.95,
             discount_factor=0.99,
             entropy_regularization=0.0,
             policy_l2_reg=0.0,
             value_function_l2_reg=0.0,
             value_pred_loss_coef=0.5,
             num_epochs=25,
             use_gae=False,
             use_td_lambda_return=False,
             normalize_rewards=True,
             reward_norm_clipping=10.0,
             normalize_observations=True,
             log_prob_clipping=0.0,
             kl_cutoff_factor=2.0,
             kl_cutoff_coef=1000.0,
             initial_adaptive_kl_beta=1.0,
             adaptive_kl_target=0.01,
             adaptive_kl_tolerance=0.3,
             gradient_clipping=None,
             check_numerics=False,
             debug_summaries=False,
             summarize_grads_and_vars=False):
  """Creates a PPO Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    optimizer: Optimizer to use for the agent.
    actor_net: A function actor_net(observations, action_spec) that returns
      a tensor of action distribution params for each observation. Takes a
      nested observation and returns a nested action.
    value_net: A function value_net(time_steps) that returns a value tensor
      from neural net predictions for each observation. Takes a nested
      observation and returns a batch of value_preds.
    importance_ratio_clipping: Epsilon in the clipped, surrogate PPO
      objective. For more detail, see the explanation at the top of the
      doc.
    lambda_value: Lambda parameter for TD-lambda computation.
    discount_factor: Discount factor for return computation.
    entropy_regularization: Coefficient for the entropy regularization loss
      term.
    policy_l2_reg: Coefficient for l2 regularization of policy weights.
    value_function_l2_reg: Coefficient for l2 regularization of value
      function weights.
    value_pred_loss_coef: Multiplier for the value prediction loss, to
      balance it against the policy gradient loss.
    num_epochs: Number of epochs for computing policy updates.
    use_gae: If True (default False), uses generalized advantage estimation
      for computing the per-timestep advantage. Else, just subtracts value
      predictions from the empirical return.
    use_td_lambda_return: If True (default False), uses td_lambda_return
      for training the value function. (td_lambda_return = gae_advantage +
      value_predictions)
    normalize_rewards: If true, keeps a moving variance of rewards and
      normalizes incoming rewards.
    reward_norm_clipping: Value above and below which to clip the
      normalized reward.
    normalize_observations: If true, keeps a moving mean and variance of
      observations and normalizes incoming observations.
    log_prob_clipping: +/- value for clipping log probs to prevent inf /
      NaN values. Default: no clipping.
    kl_cutoff_factor: If the policy KL changes more than this much for any
      single timestep, adds a squared KL penalty to the loss function.
    kl_cutoff_coef: Loss coefficient for the kl cutoff term.
    initial_adaptive_kl_beta: Initial value for the beta coefficient of the
      adaptive kl penalty.
    adaptive_kl_target: Desired kl target for policy updates. If the actual
      kl is far from this target, adaptive_kl_beta will be updated.
    adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above
      (1 + tol) * adaptive_kl_target, or below
      (1 - tol) * adaptive_kl_target, will cause adaptive_kl_beta to be
      updated.
    gradient_clipping: Norm length to clip gradients. Default: no clipping.
    check_numerics: If true, adds tf.check_numerics to help find NaN / Inf
      values. For debugging only.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If true, gradient summaries will be written.

  Raises:
    ValueError: If actor_net is not a DistributionNetwork.
  """
  if not isinstance(actor_net, network.DistributionNetwork):
    raise ValueError(
        'actor_net must be an instance of a DistributionNetwork.')

  self._optimizer = optimizer
  self._actor_net = actor_net
  self._value_net = value_net
  self._importance_ratio_clipping = importance_ratio_clipping
  self._lambda = lambda_value
  self._discount_factor = discount_factor
  self._entropy_regularization = entropy_regularization
  self._policy_l2_reg = policy_l2_reg
  self._value_function_l2_reg = value_function_l2_reg
  self._value_pred_loss_coef = value_pred_loss_coef
  self._num_epochs = num_epochs
  self._use_gae = use_gae
  self._use_td_lambda_return = use_td_lambda_return
  self._reward_norm_clipping = reward_norm_clipping
  self._log_prob_clipping = log_prob_clipping
  self._kl_cutoff_factor = kl_cutoff_factor
  self._kl_cutoff_coef = kl_cutoff_coef
  self._adaptive_kl_target = adaptive_kl_target
  self._adaptive_kl_tolerance = adaptive_kl_tolerance
  self._gradient_clipping = gradient_clipping or 0.0
  self._check_numerics = check_numerics

  if initial_adaptive_kl_beta > 0.0:
    # TODO(kbanoop): Rename create_variable.
    self._adaptive_kl_beta = common_utils.create_counter(
        'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32)
  else:
    self._adaptive_kl_beta = None

  self._reward_normalizer = None
  if normalize_rewards:
    self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer(
        tensor_spec.TensorSpec([], tf.float32), scope='normalize_reward')

  self._observation_normalizer = None
  if normalize_observations:
    self._observation_normalizer = (
        tensor_normalizer.StreamingTensorNormalizer(
            time_step_spec.observation, scope='normalize_observations'))

  policy = greedy_policy.GreedyPolicy(
      ppo_policy.PPOPolicy(
          time_step_spec=time_step_spec,
          action_spec=action_spec,
          actor_network=actor_net,
          value_network=value_net,
          observation_normalizer=self._observation_normalizer,
          clip=False,
          collect=False))

  collect_policy = ppo_policy.PPOPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=actor_net,
      value_network=value_net,
      observation_normalizer=self._observation_normalizer,
      clip=False,
      collect=True)

  self._action_distribution_spec = self._actor_net.output_spec

  super(PPOAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=None,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars)