Example #1
 def observe(self, exp):
     # The shape is [learn_queue_cap, unroll_length, env_batch_size, ...]
     exp = tf.nest.map_structure(lambda e: common.transpose2(e, 1, 2), exp)
     # merge the first two dims (num_envs, env_batch_size) into one batch dim
     self._experience = tf.nest.map_structure(flatten_once, exp)
     if self._batch_size is None:
         self._batch_size = self._experience.step_type.shape[0]
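`common.transpose2` and `flatten_once` come from the surrounding codebase; a minimal sketch of what they are assumed to do (swap two axes, and merge the leading two dims into a single batch dim):

    import tensorflow as tf

    def transpose2(x, dim1, dim2):
        """Swap axes `dim1` and `dim2` of `x` (sketch of common.transpose2)."""
        perm = list(range(len(x.shape)))
        perm[dim1], perm[dim2] = perm[dim2], perm[dim1]
        return tf.transpose(x, perm)

    def flatten_once(t):
        """Merge the first two dims, e.g. (num_envs, env_batch_size, ...)
        -> (num_envs * env_batch_size, ...)."""
        return tf.reshape(t, [-1] + list(t.shape[2:]))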
Example #2
    def train_step(self,
                   time_step: ActionTimeStep,
                   state,
                   calc_intrinsic_reward=True):
        """
        Args:
            time_step (ActionTimeStep): input time_step data
            state (tuple): state for MISC (previous observation,
                previous previous action)
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            AlgorithmStep:
                outputs: empty tuple ()
                state: tuple of observation and previous action
                info (MISCInfo): the intrinsic reward
        """
        feature_state = time_step.observation
        prev_action = time_step.prev_action
        feature = tf.concat([feature_state, prev_action], axis=-1)
        prev_feature = tf.concat(state, axis=-1)

        feature_reshaped = tf.expand_dims(feature, axis=1)
        prev_feature_reshaped = tf.expand_dims(prev_feature, axis=1)
        feature_pair = tf.concat([prev_feature_reshaped, feature_reshaped], 1)
        feature_reshaped_tran = transpose2(feature_reshaped, 1, 0)

        def add_batch():
            self._buffer.add_batch(feature_reshaped_tran)

        if calc_intrinsic_reward:
            add_batch()

        if self._n_objects < 2:
            obs_tau_excludes_goal, obs_tau_achieved_goal = \
                self._split_observation_fn(feature_pair)
            loss = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal)
        elif self._n_objects == 2:
            (obs_tau_excludes_goal, obs_tau_achieved_goal_1,
             obs_tau_achieved_goal_2) = self._split_observation_fn(feature_pair)
            loss_1 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_1)
            loss_2 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_2)
            loss = loss_1 + loss_2

        intrinsic_reward = ()
        if calc_intrinsic_reward:
            # scale/normalize the MISC intrinsic reward
            if self._n_objects < 2:
                intrinsic_reward = tf.clip_by_value(
                    self._mi_r_scale * loss, 0, 1)
            elif self._n_objects == 2:
                intrinsic_reward = (
                    tf.clip_by_value(self._mi_r_scale * loss_1, 0, 1) +
                    tf.clip_by_value(self._mi_r_scale * loss_2, 0, 1))

        return AlgorithmStep(
            outputs=(), state=[feature_state, prev_action],
            info=MISCInfo(reward=intrinsic_reward))
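`self._split_observation_fn` is passed into the algorithm from outside. Below is a hypothetical single-object version, assuming the first `AGENT_DIM` features exclude the goal and the rest are the achieved-goal features (`AGENT_DIM` is made up for illustration):

    import tensorflow as tf

    AGENT_DIM = 6  # assumed width of the goal-excluding part of the feature

    def split_observation_fn(feature_pair):
        # feature_pair: [batch, 2, feature_dim]; the slicing layout is an assumption.
        obs_tau_excludes_goal = feature_pair[..., :AGENT_DIM]
        obs_tau_achieved_goal = feature_pair[..., AGENT_DIM:]
        return obs_tau_excludes_goal, obs_tau_achieved_goal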
Example #3
    def calc_loss(self, info: MISCInfo):
        feature_tau_sampled = self._buffer.get_batch(
            batch_size=self._buffer_size)
        feature_tau_sampled_tran = transpose2(feature_tau_sampled, 1, 0)
        if self._n_objects < 2:
            obs_tau_excludes_goal, obs_tau_achieved_goal = (
                self._split_observation_fn(feature_tau_sampled_tran))
            loss = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal)
        elif self._n_objects == 2:
            (obs_tau_excludes_goal, obs_tau_achieved_goal_1,
             obs_tau_achieved_goal_2
             ) = self._split_observation_fn(feature_tau_sampled_tran)
            loss_1 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_1)
            loss_2 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_2)
            loss = loss_1 + loss_2

        neg_loss = -loss
        neg_loss_scalar = tf.reduce_mean(neg_loss)
        return LossInfo(scalar_loss=neg_loss_scalar)
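`self._buffer` is only required to support `add_batch()` (used in `train_step` above) and `get_batch()`. A minimal sketch of such a buffer, assuming uniform sampling; the class used in the actual codebase may differ:

    import tensorflow as tf

    class SimpleBuffer:
        """Keeps every added batch and samples uniformly from their concatenation."""

        def __init__(self):
            self._batches = []

        def add_batch(self, batch):
            self._batches.append(batch)

        def get_batch(self, batch_size):
            data = tf.concat(self._batches, axis=0)
            idx = tf.random.uniform(
                [batch_size], maxval=tf.shape(data)[0], dtype=tf.int32)
            return tf.gather(data, idx)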
Example #4
    def get_training_exps(self):
        """
        Get training experiences from the learning queue

        Returns:
            exp (Experience): the experiences constructed from the dequeued
                learning batch
            env_id (tf.tensor): if not None, it has the shape (`num_envs`,). Each
                element of `env_id` indicates which batched env the data comes from.
            steps (int): how many environment steps this batch of exps contains
        """
        batch = make_learning_batch(*self._tfq.learn_queue.dequeue_all())
        # convert the batch to the experience format
        exp = make_experience(batch.time_step, batch.policy_step,
                              batch.act_dist_param)
        # make the exp batch major for each environment
        exp = tf.nest.map_structure(lambda e: common.transpose2(e, 1, 2), exp)
        num_envs, unroll_length, env_batch_size \
            = batch.time_step.observation.shape[:3]
        steps = num_envs * unroll_length * env_batch_size
        return exp, batch.env_id, steps
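A small illustration of the reshaping done above, with assumed sizes: the dequeued observation is `[num_envs, unroll_length, env_batch_size, ...]`, `transpose2(e, 1, 2)` swaps the unroll and env-batch dims, and `steps` counts every environment step in the batch:

    import tensorflow as tf

    obs = tf.zeros([4, 8, 16, 3])                       # [num_envs, unroll_length, env_batch_size, obs_dim]
    obs_batch_major = tf.transpose(obs, [0, 2, 1, 3])   # -> [4, 16, 8, 3]
    steps = 4 * 8 * 16                                  # 512 environment steps in this batch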
Example #5
 def _make_time_major(nest):
     """Put the time dim to axis=0."""
     return tf.nest.map_structure(lambda x: common.transpose2(x, 0, 1),
                                  nest)
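A usage sketch of the helper, assuming a nest of batch-major tensors shaped `[batch, time, ...]` (the standalone `make_time_major` below mirrors what `_make_time_major` does with `common.transpose2`):

    import tensorflow as tf

    def make_time_major(nest):
        """Swap axes 0 and 1 of every tensor in the nest."""
        def _swap(x):
            perm = [1, 0] + list(range(2, len(x.shape)))
            return tf.transpose(x, perm)
        return tf.nest.map_structure(_swap, nest)

    batch_major = {'obs': tf.zeros([32, 10, 4]), 'reward': tf.zeros([32, 10])}
    time_major = make_time_major(batch_major)
    # time_major['obs'].shape == (10, 32, 4); time_major['reward'].shape == (10, 32)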