Example #1
    def postprocess_trajectory(self, policy, sample_batch, tf_sess=None):
        """Calculates phi values (obs, obs', and predicted obs') and ri.

        Also calculates forward and inverse losses and updates the curiosity
        module on the provided batch using our optimizer.
        """
        # Push both observations through feature net to get both phis.
        phis, _ = self.model._curiosity_feature_net({
            SampleBatch.OBS:
            torch.cat([
                torch.from_numpy(sample_batch[SampleBatch.OBS]),
                torch.from_numpy(sample_batch[SampleBatch.NEXT_OBS])
            ])
        })
        phi, next_phi = torch.chunk(phis, 2)
        actions_tensor = torch.from_numpy(
            sample_batch[SampleBatch.ACTIONS]).long().to(policy.device)

        # Predict next phi with forward model.
        predicted_next_phi = self.model._curiosity_forward_fcnet(
            torch.cat(
                [phi, one_hot(actions_tensor, self.action_space).float()],
                dim=-1))

        # Forward loss term (predicted phi', given phi and action vs actually
        # observed phi').
        forward_l2_norm_squared = 0.5 * torch.sum(
            torch.pow(predicted_next_phi - next_phi, 2.0), dim=-1)
        forward_loss = torch.mean(forward_l2_norm_sqared)

        # Scale intrinsic reward by eta hyper-parameter.
        sample_batch[SampleBatch.REWARDS] = \
            sample_batch[SampleBatch.REWARDS] + \
            self.eta * forward_l2_norm_squared.detach().cpu().numpy()

        # Inverse loss term (predicted action that led from phi to phi' vs
        # actual action taken).
        phi_cat_next_phi = torch.cat([phi, next_phi], dim=-1)
        dist_inputs = self.model._curiosity_inverse_fcnet(phi_cat_next_phi)
        action_dist = TorchCategorical(dist_inputs, self.model) if \
            isinstance(self.action_space, Discrete) else \
            TorchMultiCategorical(
                dist_inputs, self.model, self.action_space.nvec)
        # Neg log(p); p=probability of observed action given the inverse-NN
        # predicted action distribution.
        inverse_loss = -action_dist.logp(actions_tensor)
        inverse_loss = torch.mean(inverse_loss)

        # Calculate the ICM loss.
        loss = (1.0 - self.beta) * inverse_loss + self.beta * forward_loss
        # Perform an optimizer step.
        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()

        # Return the postprocessed sample batch (with the corrected rewards).
        return sample_batch
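
For orientation only (not part of the extracted example): the Curiosity module above is normally switched on through RLlib's exploration_config. The sketch below assumes Ray's documented Curiosity options (eta, beta, feature_dim, sub_exploration); exact key names can differ between Ray versions, and "CartPole-v1" is just a placeholder environment.

# Hedged sketch: enabling the Curiosity exploration module in RLlib.
from ray.rllib.agents.ppo import PPOTrainer  # assumes the Ray 1.x trainer API

config = {
    "env": "CartPole-v1",      # placeholder; ICM is aimed at sparse-reward envs
    "framework": "torch",
    "exploration_config": {
        "type": "Curiosity",
        "eta": 1.0,            # scales the intrinsic reward added to REWARDS
        "beta": 0.2,           # trades off forward vs. inverse loss
        "feature_dim": 288,    # size of the phi embeddings
        "sub_exploration": {"type": "StochasticSampling"},
    },
}
trainer = PPOTrainer(config=config)
trainer.train()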
Example #2
    def custom_loss(self, policy_loss, loss_inputs):
        """Calculates a custom loss on top of the given policy_loss(es).

        Args:
            policy_loss (List[TensorType]): The list of already calculated
                policy losses (as many as there are optimizers).
            loss_inputs: Struct of np.ndarrays holding the
                entire train batch.

        Returns:
            List[TensorType]: The altered list of policy losses. In case the
                custom loss should have its own optimizer, make sure the
                returned list is one larger than the incoming policy_loss list.
                In case you simply want to mix in the custom loss into the
                already calculated policy losses, return a list of altered
                policy losses (as done in this example below).
        """
        # Get the next batch from our input files.
        batch = self.reader.next()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(
            torch.from_numpy(batch["obs"]).float().to(policy_loss[0].device),
            self.obs_space,
            tensorlib="torch",
        )
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = TorchCategorical(logits, self.model_config)
        imitation_loss = torch.mean(
            -action_dist.logp(
                torch.from_numpy(batch["actions"]).to(policy_loss[0].device)
            )
        )
        self.imitation_loss_metric = imitation_loss.item()
        self.policy_loss_metric = np.mean([loss.item() for loss in policy_loss])

        # Add the imitation loss to each already calculated policy loss term.
        # Alternatively (if custom loss has its own optimizer):
        # return policy_loss + [10 * self.imitation_loss]
        return [loss_ + 10 * imitation_loss for loss_ in policy_loss]
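
For context (not part of the extracted example), such a custom-loss model is usually registered with the ModelCatalog and referenced through the trainer's model config. The class name CustomLossModel and the "input_files" key below are placeholders for whatever the surrounding example defines.

# Hedged sketch: wiring a model whose custom_loss() mixes in an imitation loss.
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("custom_loss_model", CustomLossModel)  # placeholder class

config = {
    "framework": "torch",
    "model": {
        "custom_model": "custom_loss_model",
        # Forwarded to the model; the reader in custom_loss() consumes it.
        "custom_model_config": {"input_files": "/path/to/offline_data"},
    },
}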
Example #3
File: curiosity.py  Project: yonkshi/ray
    def get_exploration_loss(self, policy_loss, train_batch: SampleBatchType):
        """Adds the loss for the inverse and forward models to policy_loss.
        """
        batch_size = train_batch[SampleBatch.OBS].shape[0]
        phis, _ = self.model._curiosity_feature_net({
            SampleBatch.OBS: torch.cat(
                [
                    train_batch[SampleBatch.OBS],
                    train_batch[SampleBatch.NEXT_OBS]
                ],
                dim=0)
        })
        phi, next_phi = phis[:batch_size], phis[batch_size:]
        # Inverse loss term (predicted action that led from phi to phi' vs
        # actual action taken).
        phi_next_phi = torch.cat([phi, next_phi], dim=-1)
        dist_inputs = self.model._curiosity_inverse_fcnet(phi_next_phi)
        action_dist = TorchCategorical(dist_inputs, self.model)
        # Neg log(p); p=probability of observed action given the inverse-NN
        # predicted action distribution.
        inverse_loss = -action_dist.logp(train_batch[SampleBatch.ACTIONS])
        inverse_loss = torch.mean(inverse_loss)

        # Forward loss term (predicted phi', given phi and one-hot action, vs
        # the actually observed phi').
        predicted_next_phi = self.model._curiosity_forward_fcnet(
            torch.cat(
                [
                    phi,
                    F.one_hot(
                        train_batch[SampleBatch.ACTIONS].long(),
                        num_classes=self.action_space.n).float()
                ],
                dim=-1))
        forward_loss = torch.mean(0.5 * torch.sum(
            torch.pow(predicted_next_phi - next_phi, 2.0), dim=-1))

        # Append our loss to the policy loss(es).
        return policy_loss + [
            (1.0 - self.beta) * inverse_loss + self.beta * forward_loss
        ]
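
Stripped of the RLlib plumbing, the forward/inverse combination computed above can be sketched in plain PyTorch with toy shapes; the two Linear layers below stand in for the curiosity forward and inverse heads and are purely illustrative.

# Hedged, self-contained sketch of the ICM loss with toy dimensions.
import torch
import torch.nn.functional as F

batch, feat_dim, num_actions, beta = 32, 16, 4, 0.2
phi = torch.randn(batch, feat_dim)        # phi(s)
next_phi = torch.randn(batch, feat_dim)   # phi(s')
actions = torch.randint(num_actions, (batch,))

forward_net = torch.nn.Linear(feat_dim + num_actions, feat_dim)  # stand-in head
inverse_net = torch.nn.Linear(2 * feat_dim, num_actions)         # stand-in head

# Forward model: predict phi(s') from phi(s) and the one-hot action.
pred_next_phi = forward_net(
    torch.cat([phi, F.one_hot(actions, num_actions).float()], dim=-1))
forward_loss = torch.mean(
    0.5 * torch.sum((pred_next_phi - next_phi) ** 2, dim=-1))

# Inverse model: predict the action from (phi(s), phi(s')); cross-entropy
# equals the mean negative log-likelihood used above.
logits = inverse_net(torch.cat([phi, next_phi], dim=-1))
inverse_loss = F.cross_entropy(logits, actions)

icm_loss = (1.0 - beta) * inverse_loss + beta * forward_loss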
Example #4
    def custom_loss(self, policy_loss, loss_inputs):
        # Create a new input reader per worker.
        reader = JsonReader(self.input_files)
        input_ops = reader.tf_input_ops()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(
            tf.cast(input_ops["obs"], tf.float32), self.obs_space)
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        # This snippet builds a TF graph (JsonReader.tf_input_ops, tf.cast), so
        # the TF Categorical distribution and tf.reduce_mean are used here.
        action_dist = Categorical(logits, self.model_config)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss
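
Not part of the original listing: a minimal look at the offline data that JsonReader feeds into custom_loss(). The directory path is a placeholder; the files are usually produced by running a trainer with RLlib's "output" option pointed at that directory.

# Hedged sketch: reading offline experience batches the way custom_loss() does.
from ray.rllib.offline import JsonReader

reader = JsonReader("/tmp/expert-data")   # placeholder directory of *.json batch files
batch = reader.next()                     # SampleBatch with "obs", "actions", ...
print(batch["obs"].shape, batch["actions"].shape)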