def time_step_spec(self):
    """Return spec for ActionTimeStep."""
    return ActionTimeStep(
        step_type=tf.TensorSpec((), tf.int32),
        reward=tf.TensorSpec((), tf.float32),
        discount=tf.TensorSpec((), tf.float32),
        observation=self.observation_spec,
        prev_action=self.action_spec,
        env_id=tf.TensorSpec((), tf.int32))
def __init__(self,
             num_of_skills,
             feature_spec,
             hidden_size=256,
             reward_adapt_speed=8.0,
             encoding_net: Network = None,
             discriminator_net: Network = None,
             name="DIAYNAlgorithm"):
    """Create a DIAYNAlgorithm.

    Args:
        num_of_skills (int): number of skills
        feature_spec (TensorSpec): spec of the latent feature which
            encoding_net extracts from the observation; nested specs are
            not supported
        hidden_size (int|tuple): size of hidden layer(s). If
            discriminator_net is None, a default discriminator_net with
            this hidden_size will be used.
        reward_adapt_speed (float): how fast to adapt the reward
            normalizer. Roughly speaking, the statistics for the
            normalization are calculated mostly based on the most recent
            T/speed samples, where T is the total number of samples.
        encoding_net (Network): network for encoding observation into a
            latent feature specified by feature_spec. Its input is the
            same as the input of this algorithm.
        discriminator_net (Network): network for predicting the skill
            labels based on the observation.
        name (str): name of the algorithm
    """
    skill_spec = tf.TensorSpec((num_of_skills, ))
    super().__init__(train_state_spec=skill_spec, name=name)
    flat_feature_spec = tf.nest.flatten(feature_spec)
    assert len(flat_feature_spec) == 1, \
        "DIAYNAlgorithm doesn't support nested feature_spec"
    self._num_skills = num_of_skills
    self._encoding_net = encoding_net
    if isinstance(hidden_size, int):
        hidden_size = (hidden_size, )
    if discriminator_net is None:
        discriminator_net = EncodingNetwork(
            name="discriminator_net",
            input_tensor_spec=feature_spec,
            fc_layer_params=hidden_size,
            last_layer_size=self._num_skills,
            last_kernel_initializer=tf.initializers.Zeros())
    self._discriminator_net = discriminator_net
    self._reward_normalizer = ScalarAdaptiveNormalizer(
        speed=reward_adapt_speed)
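# Usage sketch (illustrative, not from the source): constructing a
# DIAYNAlgorithm with the default discriminator. `example_feature_spec`
# and the skill count of 4 are assumed values; with encoding_net=None,
# the observation itself is expected to match feature_spec.
example_feature_spec = tf.TensorSpec((10, ), tf.float32)
example_diayn = DIAYNAlgorithm(
    num_of_skills=4,
    feature_spec=example_feature_spec,
    hidden_size=(64, 64))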
def _prepare_specs(self, algorithm):
    """Prepare the tensor spec of the training info collected by the driver."""
    time_step_spec = self._env.time_step_spec()
    action_distribution_param_spec = tf.nest.map_structure(
        lambda spec: spec.input_params_spec,
        algorithm.action_distribution_spec)
    policy_step = algorithm.train_step(self.get_initial_time_step(),
                                       self._initial_state)
    info_spec = tf.nest.map_structure(
        lambda t: tf.TensorSpec(t.shape[1:], t.dtype), policy_step.info)
    self._training_info_spec = make_training_info(
        action_distribution=action_distribution_param_spec,
        action=self._env.action_spec(),
        step_type=time_step_spec.step_type,
        reward=time_step_spec.reward,
        discount=time_step_spec.discount,
        info=info_spec)
def __init__(self,
             observation_spec,
             num_of_goals,
             name="RandomCategoricalGoalGenerator"):
    """Create a RandomCategoricalGoalGenerator.

    Args:
        observation_spec (nested TensorSpec): representing the
            observations.
        num_of_goals (int): total number of goals the agent can sample
            from.
        name (str): name of the algorithm
    """
    goal_spec = tf.TensorSpec((num_of_goals, ))
    train_state_spec = GoalState(goal=goal_spec)
    super().__init__(
        observation_spec=observation_spec,
        action_spec=tensor_spec.BoundedTensorSpec(
            shape=(num_of_goals, ),
            dtype=tf.float32,
            minimum=0.,
            maximum=1.),
        train_state_spec=train_state_spec,
        name=name)
    self._num_of_goals = num_of_goals
    self._p_goal = tf.ones(self._num_of_goals)
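# Usage sketch (illustrative): a goal generator over 5 discrete goals.
# Goals are float vectors of length num_of_goals (presumably one-hot
# samples from the uniform categorical defined by `_p_goal` above),
# matching the BoundedTensorSpec action_spec. `example_obs_spec` is an
# assumed observation spec.
example_obs_spec = tf.TensorSpec((8, ), tf.float32)
example_goal_gen = RandomCategoricalGoalGenerator(
    observation_spec=example_obs_spec, num_of_goals=5)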
def __init__(self,
             batch_size,
             observation_spec,
             action_spec,
             soi_spec,
             soc_spec,
             split_observation_fn: Callable,
             network: Network = None,
             mi_r_scale=5000.0,
             hidden_size=128,
             buffer_size=100,
             n_objects=1,
             name="MISCAlgorithm"):
    """Create a MISCAlgorithm.

    Args:
        batch_size (int): batch size
        observation_spec (tf.TensorSpec): spec of the observation
        action_spec (tf.TensorSpec): spec of the action
        soi_spec (tf.TensorSpec): spec of the states of interest
        soc_spec (tf.TensorSpec): spec of the states of context
        split_observation_fn (Callable): function for splitting the
            observation. Its input is the observation and action
            concatenated; its outputs are the context states and the
            states of interest.
        network (Network): network for estimating mutual information (MI)
        mi_r_scale (float): scale factor of MI estimation
        hidden_size (int): number of hidden units in neural nets
        buffer_size (int): capacity of the data buffer storing the
            trajectories for training the Mutual Information Neural
            Estimator
        n_objects (int): number of objects for estimating the mutual
            information reward
        name (str): the algorithm name, "MISCAlgorithm"
    """
    super(MISCAlgorithm, self).__init__(
        train_state_spec=[observation_spec, action_spec], name=name)
    assert isinstance(observation_spec, tf.TensorSpec), \
        "does not support nested observation_spec"
    assert isinstance(action_spec, tf.TensorSpec), \
        "does not support nested action_spec"
    if network is None:
        network = EncodingNetwork(
            input_tensor_spec=[soc_spec, soi_spec],
            fc_layer_params=(hidden_size, ),
            activation_fn='relu',
            last_layer_size=1,
            last_activation_fn='tanh')
    self._network = network
    self._traj_spec = tf.TensorSpec(
        shape=[batch_size] + [
            observation_spec.shape.as_list()[0] +
            action_spec.shape.as_list()[0]
        ],
        dtype=observation_spec.dtype)
    self._buffer_size = buffer_size
    self._buffer = DataBuffer(self._traj_spec, capacity=self._buffer_size)
    self._mi_r_scale = mi_r_scale
    self._n_objects = n_objects
    self._split_observation_fn = split_observation_fn
    self._batch_size = batch_size
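# Illustrative split_observation_fn (an assumption, not from the source):
# per the docstring above, it receives the concatenated
# [observation, action] tensor and returns (context states, states of
# interest). Here the first `soc_dim` columns are treated as context and
# the remainder as states of interest; both names are hypothetical.
def example_split_observation_fn(obs_and_action, soc_dim=6):
    soc = obs_and_action[..., :soc_dim]   # states of context
    soi = obs_and_action[..., soc_dim:]   # states of interest
    return soc, soi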
def create_ac_algorithm(env,
                        actor_fc_layers=(200, 100),
                        value_fc_layers=(200, 100),
                        encoding_conv_layers=(),
                        encoding_fc_layers=(),
                        use_rnns=False,
                        use_icm=False,
                        learning_rate=5e-5,
                        algorithm_class=ActorCriticAlgorithm,
                        loss_class=ActorCriticLoss,
                        debug_summaries=False):
    """Create a simple ActorCriticAlgorithm.

    Args:
        env (TFEnvironment): A TFEnvironment
        actor_fc_layers (list[int]): fc layer sizes for the actor network
        value_fc_layers (list[int]): fc layer sizes for the value network
        encoding_conv_layers (list[int]): convolution layer parameters
            for the encoding network
        encoding_fc_layers (list[int]): fc layer sizes for the encoding
            network
        use_rnns (bool): True if RNNs should be used
        use_icm (bool): True if an intrinsic curiosity module should be
            used
        learning_rate (float): learning rate
        algorithm_class (type): class of the algorithm. Can be
            ActorCriticAlgorithm or PPOAlgorithm.
        loss_class (type): the class of the loss. The signature of its
            constructor: loss_class(action_spec, debug_summaries)
        debug_summaries (bool): True if debug summaries should be created
    """
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            env.observation_spec(),
            env.action_spec(),
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=None)
        value_net = ValueRnnNetwork(
            env.observation_spec(),
            input_fc_layer_params=value_fc_layers,
            output_fc_layer_params=None)
    else:
        actor_net = ActorDistributionNetwork(
            env.observation_spec(),
            env.action_spec(),
            fc_layer_params=actor_fc_layers)
        value_net = ValueNetwork(
            env.observation_spec(), fc_layer_params=value_fc_layers)
    encoding_net = None
    if encoding_fc_layers or encoding_conv_layers:
        encoding_net = EncodingNetwork(
            input_tensor_spec=env.observation_spec(),
            conv_layer_params=encoding_conv_layers,
            fc_layer_params=encoding_fc_layers)
    icm = None
    if use_icm:
        feature_spec = env.observation_spec()
        if encoding_net:
            feature_spec = tf.TensorSpec((encoding_fc_layers[-1], ),
                                         dtype=tf.float32)
        icm = ICMAlgorithm(
            env.action_spec(), feature_spec, encoding_net=encoding_net)
    algorithm = algorithm_class(
        action_spec=env.action_spec(),
        actor_network=actor_net,
        value_network=value_net,
        intrinsic_curiosity_module=icm,
        loss_class=loss_class,
        optimizer=optimizer,
        debug_summaries=debug_summaries)
    return algorithm
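# Usage sketch (illustrative): build the default actor-critic algorithm
# with an intrinsic curiosity module. `env` is assumed to be a
# TFEnvironment created elsewhere; the layer sizes and learning rate are
# assumed values.
algorithm = create_ac_algorithm(
    env,
    actor_fc_layers=(100, 50),
    value_fc_layers=(100, 50),
    use_icm=True,
    learning_rate=1e-4)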
def __init__(self,
             target_net: Network,
             predictor_net: Network,
             encoder_net: Network = None,
             reward_adapt_speed=None,
             observation_adapt_speed=None,
             observation_spec=None,
             learning_rate=None,
             clip_value=-1.0,
             stacked_frames=True,
             name="RNDAlgorithm"):
    """
    Args:
        target_net (Network): the random fixed network that generates
            target state embeddings to be fitted
        predictor_net (Network): the trainable network that predicts
            target embeddings. Given enough training data, predictor_net
            will eventually reproduce target_net's outputs.
        encoder_net (Network): a shared network that encodes observation
            to embeddings before they are input to `target_net` or
            `predictor_net`; its parameters are not trainable
        reward_adapt_speed (float): speed for adaptively normalizing
            intrinsic rewards; if None, no normalizer is used
        observation_adapt_speed (float): speed for adaptively normalizing
            observations. Only used if `observation_spec` is not None.
        observation_spec (TensorSpec): the observation tensor spec; used
            for creating an adaptive observation normalizer
        learning_rate (float): the learning rate for the prediction cost;
            if None, a global learning rate will be used
        clip_value (float): if positive, the rewards will be clipped to
            [-clip_value, clip_value]; only used for reward normalization
        stacked_frames (bool): whether the input observation has stacked
            frames. If True, only the last frame is kept for RND to make
            predictions on, as suggested by the original paper
            (Burda et al. 2019). For Atari games, this flag is usually
            True (`frame_stacking==4`).
        name (str): name of the algorithm
    """
    optimizer = None
    if learning_rate is not None:
        optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
    super(RNDAlgorithm, self).__init__(
        train_state_spec=(), optimizer=optimizer, name=name)
    self._encoder_net = encoder_net
    self._target_net = target_net  # fixed
    self._predictor_net = predictor_net  # trainable
    if reward_adapt_speed is not None:
        self._reward_normalizer = ScalarAdaptiveNormalizer(
            speed=reward_adapt_speed)
        self._reward_clip_value = clip_value
    else:
        self._reward_normalizer = None
    self._stacked_frames = stacked_frames
    if stacked_frames and (observation_spec is not None):
        # Assuming stacking in the last dim, we only keep the last frame.
        shape = observation_spec.shape
        new_shape = shape[:-1] + (1, )
        observation_spec = tf.TensorSpec(
            shape=new_shape, dtype=observation_spec.dtype)
    # The paper also suggests normalizing observations, because the
    # original observation subspace might be small and the target network
    # would yield random embeddings that are indistinguishable.
    self._observation_normalizer = None
    if observation_adapt_speed is not None:
        assert observation_spec is not None, \
            "Observation normalizer requires its input tensor spec!"
        self._observation_normalizer = AdaptiveNormalizer(
            tensor_spec=observation_spec, speed=observation_adapt_speed)
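# Usage sketch (illustrative): RND with flat 32-dim observations and no
# frame stacking. The EncodingNetwork arguments mirror their usage in
# the snippets above; `obs_spec` and all sizes are assumed values.
obs_spec = tf.TensorSpec((32, ), tf.float32)
rnd = RNDAlgorithm(
    target_net=EncodingNetwork(
        input_tensor_spec=obs_spec, fc_layer_params=(128, )),
    predictor_net=EncodingNetwork(
        input_tensor_spec=obs_spec, fc_layer_params=(128, )),
    reward_adapt_speed=8.0,
    observation_adapt_speed=8.0,
    observation_spec=obs_spec,
    learning_rate=1e-4,
    stacked_frames=False)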