def observe(self, exp):
    """Record a dequeued batch of experience for later training.

    `exp` tensors arrive shaped
    [learn_queue_cap, unroll_length, env_batch_size, ...]; axes 1 and 2
    are swapped and then merged (via `flatten_once`) so that each
    environment contributes one row of the stored batch.
    """

    def _swap_time_env(tensor):
        # Swap unroll_length <-> env_batch_size before flattening.
        return common.transpose2(tensor, 1, 2)

    transposed = tf.nest.map_structure(_swap_time_env, exp)
    # Merge the (num_envs, env_batch_size) leading axes into one batch axis.
    self._experience = tf.nest.map_structure(flatten_once, transposed)
    if self._batch_size is None:
        # Lazily infer the batch size from the first observed batch.
        self._batch_size = self._experience.step_type.shape[0]
def train_step(self, time_step: ActionTimeStep, state, calc_intrinsic_reward=True):
    """Run one MISC step: estimate MI and (optionally) an intrinsic reward.

    Args:
        time_step (ActionTimeStep): input time_step data
        state (tuple): state for MISC (previous observation,
            previous previous action)
        calc_intrinsic_reward (bool): if False, only return the losses
    Returns:
        AlgorithmStep:
            outputs: empty tuple ()
            state: list of [observation, prev_action] fed to the next step
            info (MISCInfo): contains the intrinsic reward (or () when
                ``calc_intrinsic_reward`` is False)
    Raises:
        ValueError: if ``self._n_objects`` is not 1 or 2.
    """
    # BUG FIX: the original assigned the observation to `feature` and then
    # read the never-defined `feature_state` (NameError at trace time).
    feature_state = time_step.observation
    prev_action = time_step.prev_action
    feature = tf.concat([feature_state, prev_action], axis=-1)
    prev_feature = tf.concat(state, axis=-1)

    feature_reshaped = tf.expand_dims(feature, axis=1)
    prev_feature_reshaped = tf.expand_dims(prev_feature, axis=1)
    # Pair (previous, current) features along the new time axis.
    feature_pair = tf.concat([prev_feature_reshaped, feature_reshaped], 1)
    feature_reshaped_tran = transpose2(feature_reshaped, 1, 0)

    if calc_intrinsic_reward:
        # Only grow the replay buffer when actually collecting rewards.
        self._buffer.add_batch(feature_reshaped_tran)

    if self._n_objects < 2:
        obs_tau_excludes_goal, obs_tau_achieved_goal = \
            self._split_observation_fn(feature_pair)
        loss = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal)
    elif self._n_objects == 2:
        obs_tau_excludes_goal, obs_tau_achieved_goal_1, obs_tau_achieved_goal_2 \
            = self._split_observation_fn(feature_pair)
        loss_1 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_1)
        loss_2 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_2)
        loss = loss_1 + loss_2
    else:
        # ROBUSTNESS: the original fell through and hit a NameError on
        # `loss`; fail loudly with a meaningful message instead.
        raise ValueError(
            "Unsupported number of objects: %s" % self._n_objects)

    intrinsic_reward = ()
    if calc_intrinsic_reward:
        # Scale/normalize the MISC intrinsic reward into [0, 1] per term.
        if self._n_objects < 2:
            intrinsic_reward = tf.clip_by_value(self._mi_r_scale * loss, 0, 1)
        elif self._n_objects == 2:
            intrinsic_reward = tf.clip_by_value(
                self._mi_r_scale * loss_1, 0,
                1) + 1 * tf.clip_by_value(self._mi_r_scale * loss_2, 0, 1)

    return AlgorithmStep(
        outputs=(),
        state=[feature_state, prev_action],
        info=MISCInfo(reward=intrinsic_reward))
def calc_loss(self, info: MISCInfo):
    """Compute the MISC training loss from replay-buffer samples.

    Samples feature trajectories from the internal buffer, estimates
    mutual information with MINE, and returns the negated mean estimate
    as a scalar loss (minimizing it maximizes the MI estimate).
    """
    sampled = self._buffer.get_batch(batch_size=self._buffer_size)
    sampled_tran = transpose2(sampled, 1, 0)
    # The splitter is applied to the same tensor in every branch, so it
    # can be invoked once up front.
    split = self._split_observation_fn(sampled_tran)

    if self._n_objects < 2:
        obs_excludes_goal, obs_achieved_goal = split
        mi_estimate = self._mine(obs_excludes_goal, obs_achieved_goal)
    elif self._n_objects == 2:
        obs_excludes_goal, obs_achieved_goal_1, obs_achieved_goal_2 = split
        mi_estimate = (
            self._mine(obs_excludes_goal, obs_achieved_goal_1)
            + self._mine(obs_excludes_goal, obs_achieved_goal_2))

    # Negate: the MINE output is an MI lower bound to be maximized.
    return LossInfo(scalar_loss=tf.reduce_mean(-mi_estimate))
def get_training_exps(self):
    """Drain the learning queue and package it as training experience.

    Returns:
        exp (Experience): the dequeued experience, batch-major per
            environment.
        env_id (tf.tensor): if not None, has the shape of (`num_envs`).
            Each element indicates which batched env the data come from.
        steps (int): how many environment steps this batch of exps contain
    """
    raw = self._tfq.learn_queue.dequeue_all()
    batch = make_learning_batch(*raw)
    # Convert the queued batch into the experience format.
    exp = make_experience(batch.time_step, batch.policy_step,
                          batch.act_dist_param)

    def _swap_time_env(tensor):
        # Make the exp batch-major for each environment.
        return common.transpose2(tensor, 1, 2)

    exp = tf.nest.map_structure(_swap_time_env, exp)

    num_envs, unroll_length, env_batch_size = (
        batch.time_step.observation.shape[:3])
    steps = num_envs * unroll_length * env_batch_size
    return exp, batch.env_id, steps
def _make_time_major(nest):
    """Transpose every tensor in `nest` so the time dim is axis 0."""

    def _swap_batch_time(tensor):
        return common.transpose2(tensor, 0, 1)

    return tf.nest.map_structure(_swap_batch_time, nest)