def project_to_output_distribution(inputs,
                                   output_spec,
                                   project_to_discrete,
                                   project_to_continuous,
                                   outer_rank=1,
                                   scope='project_to_output'):
    """Turn a batch of inputs into a distribution matching `output_spec`.

    The spec is classified as discrete or continuous and the corresponding
    projection callable is invoked inside the variable scope `scope`.

    Args:
      inputs: An input Tensor of shape [batch_size, None].
      output_spec: A single output spec.
      project_to_discrete: Callable used when `output_spec` is discrete.
      project_to_continuous: Callable used when `output_spec` is continuous.
      outer_rank: The number of outer dimensions of inputs to consider batch
        dimensions and to treat as batch dimensions of output distribution.
      scope: The variable scope.

    Returns:
      Whatever the selected projection callable returns for
      `(inputs, output_spec, outer_rank=outer_rank)`.

    Raises:
      ValueError: If `output_spec` is neither discrete nor continuous.
    """
    with tf.variable_scope(scope):
        # Pick the projector first; the discrete check takes precedence.
        if tensor_spec.is_discrete(output_spec):
            projector = project_to_discrete
        elif tensor_spec.is_continuous(output_spec):
            projector = project_to_continuous
        else:
            raise ValueError('Output spec corresponds to unknown distribution.')
        return projector(inputs, output_spec, outer_rank=outer_rank)
def testExclusive(self, dtype):
    """Check that a spec is exactly one of discrete or continuous.

    String dtypes are skipped.
    """
    if dtype == tf.string:
        self.skipTest("Not compatible with string type.")

    spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
    discrete = tensor_spec.is_discrete(spec)
    continuous = tensor_spec.is_continuous(spec)
    # XOR is True only when exactly one of the two classifications holds.
    self.assertIs(discrete ^ continuous, True)
def calc_default_target_entropy(spec):
    """Calc default target entropy.

    Args:
        spec (TensorSpec): action spec
    Returns:
        The per-element default entropy multiplied by the number of elements
        in ``spec.shape``. The per-element value is ``-1`` for a continuous
        spec; otherwise it is the entropy of a two-outcome distribution with
        probabilities 0.01 and 0.99.
    """
    # `np.product` was removed in NumPy 2.0; `np.prod` is the supported
    # spelling and exists in every NumPy release.
    dims = np.prod(spec.shape.as_list())
    if tensor_spec.is_continuous(spec):
        e = -1
    else:
        min_prob = 0.01
        p = min_prob
        q = 1 - p
        e = -p * np.log(p) - q * np.log(q)
    return e * dims
def calc_default_target_entropy(spec):
    """Calc default target entropy.

    Args:
        spec (TensorSpec): action spec; its ``minimum`` and ``maximum`` are
            broadcast against ``spec.shape``.
    Returns:
        The sum over all entries of the per-entry default target entropy.
    """
    # Broadcasting against a zero array of the spec's shape yields one
    # (minimum, maximum, 0) triple per entry of the spec.
    bounds = np.broadcast(spec.minimum, spec.maximum, np.zeros(spec.shape))
    is_cont = tensor_spec.is_continuous(spec)
    min_prob = 0.01
    log_min_prob = np.log(min_prob)

    def _entry_entropy(low, high):
        log_range = np.log(high - low)
        if is_cont:
            # continuous: suppose the prob concentrates on a delta of
            # 0.01*(M-m)
            return log_range + log_min_prob
        # discrete: ignore the entry of 0.99 and uniformly distribute probs
        # on rest
        return min_prob * (log_range - log_min_prob)

    return np.sum([_entry_entropy(low, high) for low, high, _ in bounds])
def calc_default_max_entropy(spec, fraction=0.8):
    """Calc default max entropy.

    Args:
        spec (TensorSpec): action spec
        fraction (float): this fraction of the theoretical entropy upper bound
            will be used as the max entropy. Must be in (0, 1].
    Returns:
        A default max entropy for adjusting the entropy weight
    """
    assert 0 < fraction <= 1.0
    # One (minimum, maximum, 0) triple per entry of the spec.
    bounds = np.broadcast(spec.minimum, spec.maximum, np.zeros(spec.shape))
    is_cont = tensor_spec.is_continuous(spec)

    # use uniform distributions to compute upper bounds
    def _entry_bound(low, high):
        span = high - low
        if not is_cont:
            return np.log(span + 1) * fraction
        # When the range is at most 1 the log is not positive, so the
        # reciprocal of `fraction` is used as the multiplier instead.
        multiplier = fraction if span > 1 else 1.0 / fraction
        return np.log(span) * multiplier

    return np.sum([_entry_bound(low, high) for low, high, _ in bounds])
def __init__(
    self,
    time_step_spec: ts.TimeStep,
    action_spec: types.NestedTensorSpec,
    cloning_network: network.Network,
    optimizer: types.Optimizer,
    num_outer_dims: Literal[1, 2] = 1,  # pylint: disable=bad-whitespace
    epsilon_greedy: types.Float = 0.1,
    loss_fn: Optional[Callable[[types.NestedTensor, bool],
                               types.Tensor]] = None,
    gradient_clipping: Optional[types.Float] = None,
    # Params for debugging.
    debug_summaries: bool = False,
    summarize_grads_and_vars: bool = False,
    train_step_counter: Optional[tf.Variable] = None,
    name: Optional[Text] = None):
    """Creates an instance of a Behavioral Cloning agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      cloning_network: A `tf_agents.networks.Network` to be used by the agent.
        The network will be called as

          ```
          network(observation, step_type=step_type, network_state=initial_state)
          ```
        and must return a 2-tuple with elements `(output, next_network_state)`
      optimizer: The optimizer to use for training.
      num_outer_dims: The number of outer dimensions for the agent. Must be
        either 1 or 2. If 2, training will require both a batch_size and time
        dimension on every Tensor; if 1, training will require only a
        batch_size outer dimension.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if actions are discrete)
      loss_fn: A function for computing the error between the output of the
        cloning network and the action that was taken. If None, the loss
        depends on the action dtype. The `loss_fn` is called with parameters:
        `(experience, training)`, and must return a loss value for each
        element of the batch.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable
        summaries will be written during training.
      train_step_counter: An optional counter to increment every time the
        train op is run. Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `action_spec` contains no actions, or if `loss_fn` is not
        given and the action spec is neither a single scalar discrete action
        nor a single continuous action.
    """
    tf.Module.__init__(self, name=name)
    self._cloning_network = cloning_network
    self._optimizer = optimizer
    self._gradient_clipping = gradient_clipping

    action_spec = tensor_spec.from_spec(action_spec)
    flat_specs = tf.nest.flatten(action_spec)
    continuous_flags = [tensor_spec.is_continuous(spec) for spec in flat_specs]

    if not flat_specs:
        raise ValueError('The `action_spec` must contain at least one action.')

    # A default loss only exists for these two single-action layouts.
    has_single_action = len(flat_specs) == 1
    single_discrete_scalar = (
        has_single_action
        and flat_specs[0].shape.rank == 0
        and not tensor_spec.is_continuous(flat_specs[0]))
    single_continuous = (
        has_single_action and tensor_spec.is_continuous(flat_specs[0]))

    if not (loss_fn or single_discrete_scalar or single_continuous):
        raise ValueError(
            'A `loss_fn` must be provided unless there is a single, scalar '
            'discrete action or a single (scalar or non-scalar) continuous '
            'action.')

    self._network_output_spec = cloning_network.create_variables(
        time_step_spec.observation)

    # If there is a mix of continuous and discrete actions we want to use an
    # actor policy so we can use the `setup_as_continuous` method as long as
    # the user provided a custom loss_fn which we verified above.
    if any(continuous_flags):
        policy, collect_policy = self._setup_as_continuous(
            time_step_spec, action_spec, loss_fn)
    else:
        policy, collect_policy = self._setup_as_discrete(
            time_step_spec, action_spec, loss_fn, epsilon_greedy)

    super(BehavioralCloningAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=None,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)

    self._as_trajectory = data_converter.AsTrajectory(
        self.data_context,
        sequence_length=None,
        num_outer_dims=num_outer_dims)
def testExclusive(self, dtype):
    """Check that discrete and continuous classification are mutually exclusive."""
    spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
    is_discrete = tensor_spec.is_discrete(spec)
    is_continuous = tensor_spec.is_continuous(spec)
    # Exactly one of the two flags must be set.
    exactly_one = is_discrete ^ is_continuous
    self.assertIs(exactly_one, True)
def testIsContinuous(self, dtype):
    """Check that `is_continuous` agrees with `dtype.is_floating` for a TensorSpec."""
    spec = tensor_spec.TensorSpec((2, 3), dtype=dtype)
    actual = tensor_spec.is_continuous(spec)
    expected = dtype.is_floating
    self.assertIs(actual, expected)
def __init__(self,
             action_spec,
             actor_network: Network,
             critic_network: Network,
             critic_loss=None,
             target_entropy=None,
             initial_log_alpha=0.0,
             target_update_tau=0.05,
             target_update_period=1,
             dqda_clipping=None,
             actor_optimizer=None,
             critic_optimizer=None,
             alpha_optimizer=None,
             gradient_clipping=None,
             train_step_counter=None,
             debug_summaries=False,
             name="SacAlgorithm"):
    """Create a SacAlgorithm

    Two critics are used: `critic_network` itself and a copy of it. Each
    critic additionally gets a target copy.

    Args:
        action_spec (nested BoundedTensorSpec): representing the actions.
        actor_network (Network): The network will be called with
            call(observation, step_type).
        critic_network (Network): The network will be called with
            call(observation, action, step_type).
        critic_loss (None|OneStepTDLoss): an object for calculating critic
            loss. If None, a default OneStepTDLoss will be used.
        target_entropy (float|None): The target average policy entropy, for
            updating alpha. If None, it is set to the sum of
            `dist_utils.calc_default_target_entropy` over the flattened
            action spec.
        initial_log_alpha (float): initial value for variable log_alpha
        target_update_tau (float): Factor for soft update of the target
            networks.
        target_update_period (int): Period for soft update of the target
            networks.
        dqda_clipping (float): when computing the actor loss, clips the
            gradient dqda element-wise between
            [-dqda_clipping, dqda_clipping]. Does not perform clipping if
            dqda_clipping == 0.
        actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
        critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
        alpha_optimizer (tf.optimizers.Optimizer): The optimizer for alpha.
        gradient_clipping (float): Norm length to clip gradients.
        train_step_counter (tf.Variable): An optional counter to increment
            every time a new iteration is started. If None, it will use
            tf.summary.experimental.get_step(). If this is still None, a
            counter will be created.
        debug_summaries (bool): True if debug summaries should be created.
        name (str): The name of this algorithm.
    """
    # The first critic is the network passed in; the second is a copy of it.
    critic_network1 = critic_network
    critic_network2 = critic_network.copy(name='CriticNetwork2')
    # log_alpha is created before the base-class constructor runs because the
    # third trainable-variables lambda below closes over it.
    log_alpha = tfa_common.create_variable(name='log_alpha',
                                           initial_value=initial_log_alpha,
                                           dtype=tf.float32,
                                           trainable=True)
    # The three optimizers line up positionally with the three
    # trainable-variable getters: actor, both critics, and log_alpha.
    super().__init__(
        action_spec,
        train_state_spec=SacState(
            share=SacShareState(actor=actor_network.state_spec),
            actor=SacActorState(critic1=critic_network.state_spec,
                                critic2=critic_network.state_spec),
            critic=SacCriticState(
                critic1=critic_network.state_spec,
                critic2=critic_network.state_spec,
                target_critic1=critic_network.state_spec,
                target_critic2=critic_network.state_spec)),
        action_distribution_spec=actor_network.output_spec,
        predict_state_spec=actor_network.state_spec,
        optimizer=[actor_optimizer, critic_optimizer, alpha_optimizer],
        get_trainable_variables_func=[
            lambda: actor_network.trainable_variables,
            lambda: (critic_network1.trainable_variables +
                     critic_network2.trainable_variables),
            lambda: [log_alpha]
        ],
        gradient_clipping=gradient_clipping,
        train_step_counter=train_step_counter,
        debug_summaries=debug_summaries,
        name=name)
    self._log_alpha = log_alpha
    self._actor_network = actor_network
    self._critic_network1 = critic_network1
    self._critic_network2 = critic_network2
    # Each critic gets its own target network, created as a copy.
    self._target_critic_network1 = self._critic_network1.copy(
        name='TargetCriticNetwork1')
    self._target_critic_network2 = self._critic_network2.copy(
        name='TargetCriticNetwork2')
    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._alpha_optimizer = alpha_optimizer
    if critic_loss is None:
        critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
    self._critic_loss = critic_loss
    flat_action_spec = tf.nest.flatten(self._action_spec)
    # Only the first flattened action spec decides continuous vs. discrete.
    self._is_continuous = tensor_spec.is_continuous(flat_action_spec[0])
    if target_entropy is None:
        # Default: sum the per-spec default target entropies.
        target_entropy = np.sum(
            list(
                map(dist_utils.calc_default_target_entropy,
                    flat_action_spec)))
    self._target_entropy = target_entropy
    self._dqda_clipping = dqda_clipping
    self._update_target = common.get_target_updater(
        models=[self._critic_network1, self._critic_network2],
        target_models=[
            self._target_critic_network1, self._target_critic_network2
        ],
        tau=target_update_tau,
        period=target_update_period)
    # NOTE(review): tau=1.0 presumably makes these updates a full copy of the
    # critic variables into the target networks at construction time;
    # confirm against `soft_variables_update`.
    tfa_common.soft_variables_update(
        self._critic_network1.variables,
        self._target_critic_network1.variables,
        tau=1.0)
    tfa_common.soft_variables_update(
        self._critic_network2.variables,
        self._target_critic_network2.variables,
        tau=1.0)
def create_sac_algorithm(env,
                         actor_fc_layers=(100, 100),
                         critic_fc_layers=(100, 100),
                         use_rnns=False,
                         alpha_learning_rate=5e-3,
                         actor_learning_rate=5e-3,
                         critic_learning_rate=5e-3,
                         debug_summaries=False):
    """Create a simple SacAlgorithm.

    The critic type is chosen from the first flattened action spec: a critic
    network for continuous actions, a Q network otherwise.

    Args:
        env (TFEnvironment): A TFEnvironment
        actor_fc_layers (list[int]): list of fc layers parameters for actor
            network
        critic_fc_layers (list[int]): list of fc layers parameters for critic
            network
        use_rnns (bool): True if rnn should be used
        alpha_learning_rate (float): learning rate for alpha
        actor_learning_rate (float) : learning rate for actor network
        critic_learning_rate (float) : learning rate for critic network
        debug_summaries (bool): True if debug summaries should be created
    Returns:
        The constructed SacAlgorithm.
    """
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    first_action_spec = tf.nest.flatten(action_spec)[0]
    is_continuous = tensor_spec.is_continuous(first_action_spec)

    # Actor network (always built before the critic).
    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            observation_spec,
            action_spec,
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=())
    else:
        actor_net = ActorDistributionNetwork(
            observation_spec, action_spec, fc_layer_params=actor_fc_layers)

    # Critic network: one of four variants, by (use_rnns, is_continuous).
    if use_rnns and is_continuous:
        critic_net = CriticRnnNetwork(
            (observation_spec, action_spec),
            observation_fc_layer_params=(),
            action_fc_layer_params=(),
            output_fc_layer_params=(),
            joint_fc_layer_params=critic_fc_layers)
    elif use_rnns:
        critic_net = QRnnNetwork(
            observation_spec,
            action_spec,
            output_fc_layer_params=(),
            input_fc_layer_params=critic_fc_layers)
    elif is_continuous:
        critic_net = CriticNetwork(
            (observation_spec, action_spec),
            joint_fc_layer_params=critic_fc_layers)
    else:
        critic_net = QNetwork(
            observation_spec, action_spec, fc_layer_params=critic_fc_layers)

    # Optimizers are constructed in the order actor, critic, alpha.
    return SacAlgorithm(
        action_spec=action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.optimizers.Adam(learning_rate=actor_learning_rate),
        critic_optimizer=tf.optimizers.Adam(
            learning_rate=critic_learning_rate),
        alpha_optimizer=tf.optimizers.Adam(learning_rate=alpha_learning_rate),
        debug_summaries=debug_summaries)
def testIsContinuous(self, dtype):
    """Check that `is_continuous` is True for an ArraySpec exactly when its dtype is a NumPy floating type."""
    spec = array_spec.ArraySpec((2, 3), dtype=dtype)
    actual = tensor_spec.is_continuous(spec)
    scalar_type = np.dtype(dtype).type
    expected = issubclass(scalar_type, np.floating)
    self.assertIs(actual, expected)
def _validate_action_spec(action_spec):
    """Reject any action spec that is not continuous.

    Raises:
      ValueError: If `tensor_spec.is_continuous(action_spec)` is false.
    """
    if tensor_spec.is_continuous(action_spec):
        return
    raise ValueError('OU Noise is applicable only to continuous actions.')
def normal(inputs,
           output_spec,
           outer_rank=1,
           projection_layer=default_fully_connected,
           mean_transform=tanh_squash_to_spec,
           std_initializer=tf.zeros_initializer(),
           std_transform=tf.exp,
           distribution_cls=tfp.distributions.Normal):
    """Project a batch of inputs to a batch of means and standard deviations.

    Given an output spec for a single tensor continuous action, produces a
    neural net layer converting inputs to a normal distribution matching the
    spec. The mean is `mean_transform(layer_output, output_spec)` where
    `layer_output` comes from `projection_layer`. The std does not depend on
    the inputs: it is `std_transform(variable)` for a trainable bias added to
    zeros.

    Args:
      inputs: An input Tensor of shape [batch_size, ?].
      output_spec: An output spec (either BoundedArraySpec or
        BoundedTensorSpec).
      outer_rank: The number of outer dimensions of inputs to consider batch
        dimensions and to treat as batch dimensions of output distribution.
      projection_layer: Function taking in inputs, num_elements, scope and
        returning a projection of inputs to a Tensor of width num_elements.
      mean_transform: A function taking in layer output and the output_spec,
        returning the means. Defaults to tanh_squash_to_spec.
      std_initializer: Initializer for std_dev variables.
      std_transform: The function applied to the trainable std variable. For
        example, tf.exp (default), tf.nn.softplus.
      distribution_cls: The distribution class to use for output distribution.
        Default is tfp.distributions.Normal.

    Returns:
      A `distribution_cls` instance built from the means and stds, in which
      the standard deviation is not dependent on input.

    Raises:
      ValueError: If output_spec is not bounded or not continuous.
    """
    if not tensor_spec.is_bounded(output_spec):
        raise ValueError(
            'Input output_spec is of invalid type %s.' % type(output_spec))
    if not tensor_spec.is_continuous(output_spec):
        raise ValueError('Output is not continuous.')

    batch_squash = utils.BatchSquash(outer_rank)
    flat_inputs = batch_squash.flatten(inputs)

    raw_means = projection_layer(
        flat_inputs, output_spec.shape.num_elements(), scope='means')
    raw_stds = tf.contrib.layers.bias_add(
        tf.zeros_like(raw_means),  # Independent of inputs.
        initializer=std_initializer,
        scope='stds',
        activation_fn=None)

    # Both tensors are reshaped to [batch, *spec_shape] before transforming.
    target_shape = [-1] + output_spec.shape.as_list()

    means = tf.cast(
        mean_transform(tf.reshape(raw_means, target_shape), output_spec),
        output_spec.dtype)
    stds = tf.cast(
        std_transform(tf.reshape(raw_stds, target_shape)),
        output_spec.dtype)

    return distribution_cls(
        batch_squash.unflatten(means), batch_squash.unflatten(stds))
def __init__(self,
             x_spec,
             y_spec,
             model=None,
             fc_layers=(256, ),
             sampler='buffer',
             buffer_size=65536,
             optimizer: tf.optimizers.Optimizer = None,
             estimator_type='DV',
             averager=ScalarAdaptiveAverager(),
             name="MIEstimator"):
    """Create a MIEstimator.

    Args:
        x_spec (nested TensorSpec): spec of x
        y_spec (nested TensorSpec): spec of y
        model (Network): can be called as model([x, y]) and return a Tensor
            with shape=[batch_size, 1]. If None, a default MLP with
            fc_layers will be created.
        fc_layers (tuple[int]): size of hidden layers. Only used if model is
            None.
        sampler (str): type of sampler used to get samples from marginal
            distribution, should be one of ['buffer', 'double_buffer',
            'shuffle', 'shift']
        buffer_size (int): capacity of buffer for storing y for sampler
            'buffer' and 'double_buffer'
        optimizer (tf.optimizers.Optimizer): optimizer
        estimator_type (str): one of 'ML', 'DV', 'KLD' or 'JSD'
        averager (EMAverager): averager used to maintain a moving average of
            exp(T). Only used for 'DV' estimator
        name (str): name of this estimator

    Raises:
        AssertionError: if estimator_type is not supported, or if
            estimator_type is 'ML' and y_spec is nested or not continuous.
        TypeError: if sampler is not one of the supported names.
    """
    # NOTE(review): the default `averager` instance is created once, when
    # this function is defined, so it is shared by every estimator that does
    # not pass its own averager.
    assert estimator_type in ['ML', 'DV', 'KLD', 'JSD'], (
        "Wrong estimator_type %s" % estimator_type)
    super().__init__(train_state_spec=(), optimizer=optimizer, name=name)
    self._x_spec = x_spec
    self._y_spec = y_spec

    if model is None:
        # Default model: 'ML' encodes x only; the others score (x, y) pairs.
        if estimator_type == 'ML':
            model = TFAEncodingNetwork(
                name="MIEstimator",
                input_tensor_spec=x_spec,
                fc_layer_params=fc_layers,
                preprocessing_combiner=NestConcatenate(axis=-1))
        else:
            model = EncodingNetwork(
                name="MIEstimator",
                input_tensor_spec=[x_spec, y_spec],
                fc_layer_params=fc_layers,
                last_layer_size=1)
    self._model = model
    self._type = estimator_type

    if sampler in ('buffer', 'double_buffer'):
        use_x_buffer = sampler == 'double_buffer'
        if use_x_buffer:
            self._x_buffer = DataBuffer(x_spec, capacity=buffer_size)
        self._y_buffer = DataBuffer(y_spec, capacity=buffer_size)
        if use_x_buffer:
            self._sampler = self._double_buffer_sampler
        else:
            self._sampler = self._buffer_sampler
    elif sampler == 'shuffle':
        self._sampler = self._shuffle_sampler
    elif sampler == 'shift':
        self._sampler = self._shift_sampler
    else:
        raise TypeError("Wrong type for sampler %s" % sampler)

    if estimator_type == 'DV':
        self._mean_averager = averager
    elif estimator_type == 'ML':
        assert isinstance(y_spec, tf.TensorSpec), (
            "Currently, 'ML' does not support nested y_spec: %s" % y_spec)
        assert tensor_spec.is_continuous(y_spec), (
            "Currently, 'ML' does not support discreted y_spec: %s" % y_spec)
        y_dim = y_spec.shape[-1]
        # Both heads start with zero kernels, so they initially output only
        # their bias.
        self._delta_loc_layer = tf.keras.layers.Dense(
            y_dim,
            kernel_initializer=tf.initializers.Zeros(),
            bias_initializer=tf.initializers.Zeros(),
            name='delta_loc_layer')
        # log(e - 1) is the input at which softplus equals 1.
        # NOTE(review): presumably the scale is passed through a softplus
        # elsewhere -- confirm.
        scale_bias = math.log(math.e - 1)
        self._delta_scale_layer = tf.keras.layers.Dense(
            y_dim,
            kernel_initializer=tf.initializers.Zeros(),
            bias_initializer=tf.keras.initializers.Constant(value=scale_bias),
            name='delta_scale_layer')