Example #1
    def train_step(self, inputs, state=None):
        """Train one step.

        Args:
            inputs (tuple): tuple of (inputs, target)
            state (nested Tensor): network state for `decoder`

        Returns:
            AlgorithmStep with the following fields:
            outputs: decoding result
            state: rnn state
            info: loss of decoding

        """
        input, target = inputs
        pred, state = self._decoder(input, network_state=state)
        assert pred.shape == target.shape
        loss = self._loss(target, pred)

        if len(loss.shape) > 1:
            # reduce to (B,)
            reduce_dims = list(range(1, len(loss.shape)))
            loss = tf.reduce_sum(loss, axis=reduce_dims)
        return AlgorithmStep(outputs=pred,
                             state=state,
                             info=LossInfo(loss=self._loss_weight * loss))
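
A minimal sketch (not part of the original code) of the reduction performed above, assuming TF2 eager execution and made-up tensor values: a per-timestep loss of shape (B, T) is summed over every non-batch dimension so that the loss passed to LossInfo has shape (B,).

    import tensorflow as tf

    # Hypothetical per-timestep loss: batch size B=2, T=3 timesteps.
    loss = tf.constant([[0.1, 0.2, 0.3],
                        [1.0, 1.0, 1.0]])
    if len(loss.shape) > 1:
        reduce_dims = list(range(1, len(loss.shape)))
        loss = tf.reduce_sum(loss, axis=reduce_dims)
    print(loss.numpy())  # -> approximately [0.6 3.]
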
Example #2
    def train_step(self, inputs, loss_func, batch_size=None, state=None):
        """
        Args:
            inputs (nested Tensor): if None, the output is generated only from
                noise.
            loss_func (Callable): loss_func([outputs, inputs])
                (loss_func(outputs) if inputs is None) returns a Tensor with
                shape [batch_size] as a loss for optimizing the generator
            batch_size (int): batch size. Must be provided if inputs is None.
                It is ignored if inputs is not None.
            state: not used
        Returns:
            AlgorithmStep:
                outputs: Tensor with shape (batch_size, dim)
                info: LossInfo
        """
        outputs, gen_inputs = self._predict(inputs, batch_size)
        loss, grad = self._grad_func(inputs, outputs, loss_func)
        loss_propagated = tf.reduce_sum(
            tf.stop_gradient(grad) * outputs, axis=-1)

        mi_loss = ()
        if self._mi_estimator is not None:
            mi_step = self._mi_estimator.train_step([gen_inputs, outputs])
            mi_loss = mi_step.info.loss
            loss_propagated += self._mi_weight * mi_loss

        return AlgorithmStep(
            outputs=outputs,
            state=(),
            info=LossInfo(
                loss=loss_propagated,
                extra=GeneratorLossInfo(generator=loss, mi_estimator=mi_loss)))
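
The loss_propagated line is a surrogate-loss trick: the gradient of sum(stop_gradient(grad) * outputs) with respect to outputs is exactly grad, so back-propagating the surrogate pushes the externally computed gradient into the generator parameters. A self-contained check, with made-up tensors and assuming TF2 eager execution:

    import tensorflow as tf

    outputs = tf.Variable([[1.0, 2.0], [3.0, 4.0]])  # hypothetical generator outputs
    grad = tf.constant([[0.5, -1.0], [2.0, 0.0]])    # externally supplied gradient

    with tf.GradientTape() as tape:
        loss_propagated = tf.reduce_sum(tf.stop_gradient(grad) * outputs, axis=-1)
        total = tf.reduce_sum(loss_propagated)

    # d(total)/d(outputs) equals `grad` element-wise.
    print(tape.gradient(total, outputs).numpy())
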
Example #3
    def train_step(self, distribution):
        """Train step.

        Args:
            distribution (nested Distribution): action distribution from the
                policy.
        Returns:
            AlgorithmStep. `info` field is LossInfo, other fields are empty.
        """
        entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
            distribution, self._action_spec)
        alpha_loss = self._log_alpha * tf.stop_gradient(entropy -
                                                        self._target_entropy)
        alpha = tf.stop_gradient(tf.exp(self._log_alpha))
        loss = alpha_loss
        entropy_loss = -entropy

        # Joint loss for optimizing alpha and entropy. The effect of alpha_loss
        # is to increase alpha when entropy is lower than target and decrease
        # alpha when entropy is larger than target. alpha * entropy_for_gradient
        # is to encourage higher action entropy.
        loss -= alpha * entropy_for_gradient

        return AlgorithmStep(
            outputs=(),
            state=(),
            info=LossInfo(
                loss,
                extra=EntropyTargetLossInfo(
                    alpha_loss=alpha_loss, entropy_loss=entropy_loss)))
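
As a sanity check on the sign convention (writing sg for stop_gradient, H for the entropy and H* for self._target_entropy): the only term in the loss that depends on log α is alpha_loss, so

\[
\frac{\partial L}{\partial \log\alpha}
  = \frac{\partial}{\partial \log\alpha}\Big(\log\alpha \cdot \mathrm{sg}(H - H^{*})\Big)
  = H - H^{*},
\]

and a gradient-descent update log α ← log α − η (H − H*) increases α when H < H* and decreases it when H > H*, exactly as the comment in the code describes; the remaining term −α·entropy_for_gradient only back-propagates into the policy distribution.
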
Example #4
    def train_step(self, inputs, state, calc_intrinsic_reward=True):
        """
        Args:
            inputs (tuple): observation
            state (tuple):  empty tuple ()
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            TrainStep:
                outputs: empty tuple ()
                state: empty tuple ()
                info: RNDInfo
        """
        observation, _ = inputs
        if self._observation_normalizer is not None:
            observation = self._observation_normalizer.normalize(observation)

        pred_embedding, _ = self._predictor_net(observation)
        target_embedding, _ = self._target_net(observation)

        loss = 0.5 * tf.reduce_mean(
            tf.square(pred_embedding - tf.stop_gradient(target_embedding)),
            axis=-1)

        intrinsic_reward = ()
        if calc_intrinsic_reward:
            intrinsic_reward = tf.stop_gradient(loss)
            intrinsic_reward = self._reward_normalizer.normalize(
                intrinsic_reward)

        return AlgorithmStep(outputs=(),
                             state=(),
                             info=RNDInfo(reward=intrinsic_reward,
                                          loss=LossInfo(loss=loss)))
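
In formula form (a restatement of the code above, not additional material): writing f̂ for the trainable predictor network, f for the fixed random target network, sg for stop_gradient and d for the embedding dimension,

\[
L(o) = \frac{1}{2d}\sum_{i=1}^{d}\big(\hat f_i(o) - \mathrm{sg}(f_i(o))\big)^2,
\qquad
r = \mathrm{normalize}\big(\mathrm{sg}(L(o))\big),
\]

so the intrinsic reward is the normalized prediction error of the predictor, which is largest for observations unlike those the predictor has been trained on.
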
Example #5
    def train_step(self,
                   time_step: ActionTimeStep,
                   state,
                   calc_intrinsic_reward=True):
        """
        Args:
            time_step (ActionTimeStep): input time_step data
            state (tuple): state for MISC (the previous observation and the
                action that preceded it)
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            TrainStep:
                outputs: empty tuple ()
                state: tuple of observation and previous action
                info (MISCInfo):
        """
        feature_state = time_step.observation
        prev_action = time_step.prev_action
        feature = tf.concat([feature_state, prev_action], axis=-1)
        prev_feature = tf.concat(state, axis=-1)

        feature_reshaped = tf.expand_dims(feature, axis=1)
        prev_feature_reshaped = tf.expand_dims(prev_feature, axis=1)
        feature_pair = tf.concat([prev_feature_reshaped, feature_reshaped], 1)
        feature_reshaped_tran = transpose2(feature_reshaped, 1, 0)

        def add_batch():
            self._buffer.add_batch(feature_reshaped_tran)

        if calc_intrinsic_reward:
            add_batch()

        if self._n_objects < 2:
            (obs_tau_excludes_goal,
             obs_tau_achieved_goal) = self._split_observation_fn(feature_pair)
            loss = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal)
        elif self._n_objects == 2:
            (obs_tau_excludes_goal, obs_tau_achieved_goal_1,
             obs_tau_achieved_goal_2) = self._split_observation_fn(feature_pair)
            loss_1 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_1)
            loss_2 = self._mine(obs_tau_excludes_goal, obs_tau_achieved_goal_2)
            loss = loss_1 + loss_2

        intrinsic_reward = ()
        if calc_intrinsic_reward:
            # scale/normalize the MISC intrinsic reward
            if self._n_objects < 2:
                intrinsic_reward = tf.clip_by_value(self._mi_r_scale * loss, 0,
                                                    1)
            elif self._n_objects == 2:
                intrinsic_reward = tf.clip_by_value(
                    self._mi_r_scale * loss_1, 0,
                    1) + 1 * tf.clip_by_value(self._mi_r_scale * loss_2, 0, 1)

        return AlgorithmStep(
            outputs=(),
            state=[feature_state, prev_action],
            info=MISCInfo(reward=intrinsic_reward))
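
In terms of the values computed above (λ standing for self._mi_r_scale), the intrinsic reward is, for the single-object and two-object cases respectively,

\[
r = \mathrm{clip}(\lambda \cdot \text{loss},\, 0,\, 1),
\qquad
r = \mathrm{clip}(\lambda \cdot \text{loss}_1,\, 0,\, 1) + \mathrm{clip}(\lambda \cdot \text{loss}_2,\, 0,\, 1),
\]

i.e. the MINE estimates are scaled and clipped to [0, 1] before being used as a reward.
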
Example #6
    def train_step(self, time_step: ActionTimeStep, state):
        """
        Args:
            time_step (ActionTimeStep): input data for dynamics learning
            state (Tensor): state for dynamics learning (previous observation)
        Returns:
            TrainStep:
                outputs: empty tuple ()
                state (DynamicsState): state for training
                info (DynamicsInfo):
        """
        return AlgorithmStep(outputs=(), state=(), info=())
Example #7
    def predict(self, time_step: ActionTimeStep, state: DynamicsState):
        """Predict the next observation given the current time_step.

        The next step is predicted using the prev_action from time_step and
        the feature from state.
        """
        action = self._encode_action(time_step.prev_action)
        obs = state.feature
        forward_delta, network_state = self._dynamics_network(
            inputs=[obs, action], network_state=state.network)
        forward_pred = obs + forward_delta
        state = state._replace(feature=forward_pred, network=network_state)
        return AlgorithmStep(outputs=forward_pred, state=state, info=())
Example #8
    def train_step(self,
                   time_step: ActionTimeStep,
                   state,
                   calc_intrinsic_reward=True):
        """
        Args:
            time_step (ActionTimeStep): input time_step data, where the
                observation is the skill-augmented observation
            state (Tensor): state for DIAYN (previous skill)
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            TrainStep:
                outputs: empty tuple ()
                state: skill
                info (DIAYNInfo):
        """
        observations_aug = time_step.observation
        step_type = time_step.step_type
        observation, skill = observations_aug
        prev_skill = state

        feature = observation
        if self._encoding_net is not None:
            feature, _ = self._encoding_net(observation)

        skill_pred, _ = self._discriminator_net(inputs=feature)

        skill_discriminate_loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=prev_skill, logits=skill_pred)

        valid_masks = tf.cast(
            tf.not_equal(step_type, StepType.FIRST), tf.float32)
        skill_discriminate_loss = skill_discriminate_loss * valid_masks

        intrinsic_reward = ()

        if calc_intrinsic_reward:
            # use negative cross-entropy as reward
            # neglect neg-prior term as it is constant
            intrinsic_reward = tf.stop_gradient(-skill_discriminate_loss)
            intrinsic_reward = self._reward_normalizer.normalize(
                intrinsic_reward)

        return AlgorithmStep(
            outputs=(),
            state=skill,
            info=DIAYNInfo(
                reward=intrinsic_reward,
                loss=LossInfo(
                    loss=skill_discriminate_loss,
                    extra=dict(
                        skill_discriminate_loss=skill_discriminate_loss))))
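
Assuming prev_skill is a one-hot encoding of the current skill z (an assumption; the encoding is not shown in this snippet), the cross entropy above equals −log q_φ(z | s), so up to the reward normalization and the dropped −log p(z) constant mentioned in the comment,

\[
r = \mathrm{sg}\big(\log q_\phi(z \mid s)\big),
\]

which rewards states from which the discriminator can recognize the skill that produced them.
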
Example #9
    def train_step(self, inputs, state=None):
        """Perform training on one batch of inputs.

        Args:
            inputs (tuple(Tensor, Tensor)): tuple of x and y
            state: not used
        Returns:
            AlgorithmStep
                outputs (Tensor): shape=[batch_size], its mean is the estimated
                    MI
                state: not used
                info (LossInfo): info.loss is the loss
        """
        x, y = inputs
        num_outer_dims = get_outer_rank(x, self._x_spec)
        batch_squash = BatchSquash(num_outer_dims)
        x = batch_squash.flatten(x)
        y = batch_squash.flatten(y)
        x1, y1 = self._sampler(x, y)

        log_ratio = self._model([x, y])[0]
        t1 = self._model([x1, y1])[0]

        if self._type == 'DV':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mean = tf.stop_gradient(tf.reduce_mean(ratio))
            if self._mean_averager:
                self._mean_averager.update(mean)
                unbiased_mean = tf.stop_gradient(self._mean_averager.get())
            else:
                unbiased_mean = mean
            # estimated MI = reduce_mean(mi)
            # ratio/mean-1 does not contribute to the final estimated MI, since
            # mean(ratio/mean-1) = 0. We add it so that we can have an estimation
            # of the variance of the MI estimator
            mi = log_ratio - (tf.math.log(mean) + ratio / mean - 1)
            loss = ratio / unbiased_mean - log_ratio
        elif self._type == 'KLD':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mi = log_ratio - ratio + 1
            loss = -mi
        elif self._type == 'JSD':
            mi = -tf.nn.softplus(-log_ratio) - tf.nn.softplus(t1) + math.log(4)
            loss = -mi

        mi = batch_squash.unflatten(mi)
        loss = batch_squash.unflatten(loss)

        return AlgorithmStep(outputs=mi,
                             state=(),
                             info=LossInfo(loss, extra=()))
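
For reference, the 'DV' branch follows the Donsker-Varadhan lower bound used by MINE: with T the critic (self._model), (x, y) drawn from the joint and (x1, y1) from the product of marginals produced by the sampler,

\[
I(X;Y) \;\ge\; \mathbb{E}_{p(x,y)}\big[T(x,y)\big]
  \;-\; \log \mathbb{E}_{p(x)\,p(y)}\big[e^{T(x,y)}\big].
\]

The extra ratio/mean − 1 term in mi has zero mean, as the comment says, and is added only so that the batch variance of mi reflects the variance of the estimator.
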
Example #10
    def predict(self, inputs, batch_size=None, state=None):
        """Generate outputs given inputs.

        Args:
            inputs (nested Tensor): if None, the output is generated only from
                noise.
            batch_size (int): batch size. Must be provided if inputs is None.
                It is ignored if inputs is not None.
            state: not used
        Returns:
            AlgorithmStep: outputs with shape (batch_size, output_dim)
        """
        outputs, _ = self._predict(inputs, batch_size, training=False)
        return AlgorithmStep(outputs=outputs, state=(), info=())
Example #11
    def train_step(self,
                   time_step: ActionTimeStep,
                   state,
                   calc_intrinsic_reward=True):
        """
        Args:
            time_step (ActionTimeStep): input time_step data for ICM
            state (Tensor): state for ICM (previous observation)
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            TrainStep:
                outputs: empty tuple ()
                state: observation
                info (ICMInfo):
        """
        feature = time_step.observation
        prev_action = time_step.prev_action

        if self._encoding_net is not None:
            feature, _ = self._encoding_net(feature)
        prev_feature = state
        prev_action = self._encode_action(prev_action)

        forward_pred, _ = self._forward_net(
            inputs=[tf.stop_gradient(prev_feature), prev_action])
        forward_loss = 0.5 * tf.reduce_mean(
            tf.square(tf.stop_gradient(feature) - forward_pred), axis=-1)

        action_pred, _ = self._inverse_net(inputs=[prev_feature, feature])

        if tensor_spec.is_discrete(self._action_spec):
            inverse_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=prev_action, logits=action_pred)
        else:
            inverse_loss = 0.5 * tf.reduce_mean(
                tf.square(prev_action - action_pred), axis=-1)

        intrinsic_reward = ()
        if calc_intrinsic_reward:
            intrinsic_reward = tf.stop_gradient(forward_loss)
            intrinsic_reward = self._reward_normalizer.normalize(
                intrinsic_reward)

        return AlgorithmStep(
            outputs=(),
            state=feature,
            info=ICMInfo(reward=intrinsic_reward,
                         loss=LossInfo(loss=forward_loss + inverse_loss,
                                       extra=dict(forward_loss=forward_loss,
                                                  inverse_loss=inverse_loss))))
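
Written out (the bar denotes a mean over the feature or action dimension, φ the optional encoding network, f the forward model and g the inverse model):

\[
L_{\text{fwd}} = \tfrac{1}{2}\,\overline{\big(\mathrm{sg}(\phi_t) - f(\mathrm{sg}(\phi_{t-1}),\, a_{t-1})\big)^2},
\qquad
L_{\text{inv}} = \mathrm{CE}\big(a_{t-1},\, g(\phi_{t-1}, \phi_t)\big)
  \ \text{or}\ \tfrac{1}{2}\,\overline{\big(a_{t-1} - g(\phi_{t-1}, \phi_t)\big)^2},
\]

and the intrinsic reward is the (normalized) stop-gradient of L_fwd: the agent is rewarded where its forward model predicts poorly.
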
Example #12
    def train_step(self, inputs, state: MBPState):
        """Train one step.

        Args:
            inputs (tuple): a tuple of (observation, action)
            state (MBPState): state for training
        Returns:
            AlgorithmStep:
                outputs: the latent vector
                state (MBPState): the updated state
                info (LossInfo): decoding loss plus the KL divergence
        """
        observation, _ = inputs
        latent_vector, kld, next_state = self.encode_step(inputs, state)

        # TODO: decoder for action
        decoder_loss = self.decode_step(latent_vector, observation)

        return AlgorithmStep(
            outputs=latent_vector,
            state=next_state,
            info=LossInfo(loss=self._loss_weight * (decoder_loss.loss + kld),
                          extra=MBPLossInfo(decoder=decoder_loss, vae=kld)))
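
The combined objective has the usual VAE shape (this is just a reading of the code): with w = self._loss_weight,

\[
L = w\,\big(L_{\text{decoder}} + D_{\mathrm{KL}}\big),
\]

a weighted sum of the reconstruction loss returned by decode_step and the KL term returned by encode_step.
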
Example #13
    def train_step(self,
                   time_step: ActionTimeStep,
                   state,
                   calc_intrinsic_reward=True):
        """
        Args:
            time_step (ActionTimeStep): input time_step data
            state (tuple):  empty tuple ()
            calc_intrinsic_reward (bool): if False, only return the losses
        Returns:
            TrainStep:
                outputs: empty tuple ()
                state: empty tuple ()
                info: ICMInfo
        """
        observation = time_step.observation

        if self._stacked_frames:
            # Assuming stacking in the last dim, we only keep the last frame.
            observation = observation[..., -1:]

        if self._observation_normalizer is not None:
            observation = self._observation_normalizer.normalize(observation)

        if self._encoder_net is not None:
            observation = tf.stop_gradient(self._encoder_net(observation)[0])

        pred_embedding, _ = self._predictor_net(observation)
        target_embedding, _ = self._target_net(observation)

        loss = tf.reduce_sum(
            tf.square(pred_embedding - tf.stop_gradient(target_embedding)),
            axis=-1)

        intrinsic_reward = ()
        if calc_intrinsic_reward:
            intrinsic_reward = tf.stop_gradient(loss)
            if self._reward_normalizer:
                intrinsic_reward = self._reward_normalizer.normalize(
                    intrinsic_reward, clip_value=self._reward_clip_value)

        return AlgorithmStep(
            outputs=(),
            state=(),
            info=ICMInfo(reward=intrinsic_reward, loss=LossInfo(loss=loss)))
Example #14
    def train_step(self, distribution, step_type):
        """Train step.

        Args:
            distribution (nested Distribution): action distribution from the
                policy.
            step_type: step type of the time steps; stored in the returned
                `info`.
        Returns:
            AlgorithmStep. `info` field is EntropyTargetInfo, other fields are
                empty.
        """
        entropy, entropy_for_gradient = dist_utils.entropy_with_fallback(
            distribution, self._action_spec)
        return AlgorithmStep(
            outputs=(),
            state=(),
            info=EntropyTargetInfo(
                step_type=step_type,
                loss=LossInfo(
                    loss=-entropy_for_gradient,
                    extra=EntropyTargetLossInfo(entropy_loss=-entropy))))
Example #15
    def train_step(self, inputs, state=None):
        """Train one step.

        Args:
            inputs (tuple): tuple of (inputs, target)
            state (nested Tensor): network state for `decoder`

        Returns:
            AlgorithmStep with the following fields:
            outputs: decoding result
            state: rnn state
            info: loss of decoding

        """
        input, target = inputs
        pred, state = self._decoder(input, network_state=state)
        assert pred.shape == target.shape
        loss = self._loss(target, pred)
        return AlgorithmStep(outputs=pred,
                             state=state,
                             info=LossInfo(self._loss_weight * loss, extra=()))
Example #16
    def train_step(self, inputs, state):
        """
        Args:
            inputs (tuple): observation and previous action
        Returns:
            TrainStep:
                outputs: intrinsic reward
                state:
                info:
        """
        feature, prev_action = inputs
        if self._encoding_net is not None:
            feature, _ = self._encoding_net(feature)
        prev_feature = state
        prev_action = self._encode_action(prev_action)

        forward_pred, _ = self._forward_net(
            inputs=[tf.stop_gradient(prev_feature), prev_action])
        forward_loss = 0.5 * tf.reduce_mean(
            tf.square(tf.stop_gradient(feature) - forward_pred), axis=-1)

        action_pred, _ = self._inverse_net(inputs=[prev_feature, feature])

        if tensor_spec.is_discrete(self._action_spec):
            inverse_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=prev_action, logits=action_pred)
        else:
            inverse_loss = 0.5 * tf.reduce_mean(
                tf.square(prev_action - action_pred), axis=-1)

        intrinsic_reward = tf.stop_gradient(forward_loss)
        intrinsic_reward = self._reward_normalizer.normalize(intrinsic_reward)

        return AlgorithmStep(outputs=intrinsic_reward,
                             state=feature,
                             info=LossInfo(loss=forward_loss + inverse_loss,
                                           extra=ICMLossInfo(
                                               forward_loss=forward_loss,
                                               inverse_loss=inverse_loss)))
Example #17
    def train_step(self, time_step: ActionTimeStep, state: DynamicsState):
        """
        Args:
            time_step (ActionTimeStep): input data for dynamics learning
            state (DynamicsState): state for dynamics learning (previous observation)
        Returns:
            TrainStep:
                outputs: empty tuple ()
                state (DynamicsState): state for training
                info (DynamicsInfo):
        """
        feature = time_step.observation
        dynamics_step = self.predict(time_step, state)
        forward_pred = dynamics_step.outputs
        forward_loss = 0.5 * tf.reduce_mean(
            tf.square(feature - forward_pred), axis=-1)

        info = DynamicsInfo(
            loss=LossInfo(
                loss=forward_loss, extra=dict(forward_loss=forward_loss)))
        state = DynamicsState(feature=feature)

        return AlgorithmStep(outputs=(), state=state, info=info)
Example #18
    def _ml_step(self, x, y, y_distribution):
        pmi = self._ml_pmi(x, y, y_distribution)
        return AlgorithmStep(outputs=pmi, state=(), info=LossInfo(loss=-pmi))
Example #19
    def train_step(self, inputs, y_distribution=None, state=None):
        """Perform training on one batch of inputs.

        Args:
            inputs (tuple(nested Tensor, nested Tensor)): tuple of x and y
            y_distribution (nested tfp.distributions.Distribution): distribution
                for the marginal distribution of y. If None, will use the
                sampling method `sampler` provided at constructor to generate
                the samples for the marginal distribution of Y.
            state: not used
        Returns:
            AlgorithmStep
                outputs (Tensor): shape=[batch_size]; its mean is the estimated
                    MI for estimators 'ML', 'DV' and 'KLD', and the estimated
                    Jensen-Shannon divergence for estimator 'JSD'
                state: not used
                info (LossInfo): info.loss is the loss
        """
        x, y = inputs

        if self._type == 'ML':
            return self._ml_step(x, y, y_distribution)

        num_outer_dims = get_outer_rank(x, self._x_spec)
        batch_squash = BatchSquash(num_outer_dims)
        x = batch_squash.flatten(x)
        y = batch_squash.flatten(y)
        if y_distribution is None:
            x1, y1 = self._sampler(x, y)
        else:
            x1 = x
            y1 = y_distribution.sample()
            y1 = batch_squash.flatten(y1)

        log_ratio = self._model([x, y])[0]
        t1 = self._model([x1, y1])[0]

        if self._type == 'DV':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mean = tf.stop_gradient(tf.reduce_mean(ratio))
            if self._mean_averager:
                self._mean_averager.update(mean)
                unbiased_mean = tf.stop_gradient(self._mean_averager.get())
            else:
                unbiased_mean = mean
            # estimated MI = reduce_mean(mi)
            # ratio/mean-1 does not contribute to the final estimated MI, since
            # mean(ratio/mean-1) = 0. We add it so that we can have an estimation
            # of the variance of the MI estimator
            mi = log_ratio - (tf.math.log(mean) + ratio / mean - 1)
            loss = ratio / unbiased_mean - log_ratio
        elif self._type == 'KLD':
            ratio = tf.math.exp(tf.minimum(t1, 20))
            mi = log_ratio - ratio + 1
            loss = -mi
        elif self._type == 'JSD':
            mi = -tf.nn.softplus(-log_ratio) - tf.nn.softplus(t1) + math.log(4)
            loss = -mi
        mi = batch_squash.unflatten(mi)
        loss = batch_squash.unflatten(loss)

        return AlgorithmStep(
            outputs=mi, state=(), info=LossInfo(loss, extra=()))
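
The 'JSD' branch corresponds to the Jensen-Shannon estimator (as the docstring notes, its mean estimates a JS divergence rather than MI): with T the critic,

\[
\widehat{\mathrm{JSD}} = \mathbb{E}_{p(x,y)}\big[-\mathrm{softplus}(-T(x,y))\big]
  \;-\; \mathbb{E}_{p(x)\,p(y)}\big[\mathrm{softplus}(T(x,y))\big] \;+\; \log 4,
\]

which is exactly mi = -softplus(-log_ratio) - softplus(t1) + log(4), averaged over the batch.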