Example #1
    def postprocess_trajectory(self, policy, sample_batch, tf_sess=None):
        """Calculates phi values (obs, obs', and predicted obs') and ri.

        Also calculates forward and inverse losses and updates the curiosity
        module on the provided batch using our optimizer.
        """
        # Push both observations through feature net to get both phis.
        phis, _ = self.model._curiosity_feature_net({
            SampleBatch.OBS:
            torch.cat([
                torch.from_numpy(sample_batch[SampleBatch.OBS]),
                torch.from_numpy(sample_batch[SampleBatch.NEXT_OBS])
            ])
        })
        phi, next_phi = torch.chunk(phis, 2)
        actions_tensor = torch.from_numpy(
            sample_batch[SampleBatch.ACTIONS]).long().to(policy.device)

        # Predict next phi with forward model.
        predicted_next_phi = self.model._curiosity_forward_fcnet(
            torch.cat(
                [phi, one_hot(actions_tensor, self.action_space).float()],
                dim=-1))

        # Forward loss term (predicted phi', given phi and action vs actually
        # observed phi').
        forward_l2_norm_squared = 0.5 * torch.sum(
            torch.pow(predicted_next_phi - next_phi, 2.0), dim=-1)
        forward_loss = torch.mean(forward_l2_norm_squared)

        # Scale intrinsic reward by eta hyper-parameter.
        sample_batch[SampleBatch.REWARDS] = \
            sample_batch[SampleBatch.REWARDS] + \
            self.eta * forward_l2_norm_squared.detach().cpu().numpy()

        # Inverse loss term (predicted action that led from phi to phi' vs.
        # actual action taken).
        phi_cat_next_phi = torch.cat([phi, next_phi], dim=-1)
        dist_inputs = self.model._curiosity_inverse_fcnet(phi_cat_next_phi)
        action_dist = TorchCategorical(dist_inputs, self.model) if \
            isinstance(self.action_space, Discrete) else \
            TorchMultiCategorical(
                dist_inputs, self.model, self.action_space.nvec)
        # Neg log(p); p=probability of observed action given the inverse-NN
        # predicted action distribution.
        inverse_loss = -action_dist.logp(actions_tensor)
        inverse_loss = torch.mean(inverse_loss)

        # Calculate the ICM loss.
        loss = (1.0 - self.beta) * inverse_loss + self.beta * forward_loss
        # Perform an optimizer step.
        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()

        # Return the postprocessed sample batch (with the corrected rewards).
        return sample_batch
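
The intrinsic reward added above is just eta times the per-sample forward prediction error, 0.5 * ||predicted phi(s') - phi(s')||^2. A minimal stand-alone sketch of that arithmetic, with made-up feature vectors and an assumed eta (not RLlib's actual Curiosity module):

import torch

# Hypothetical feature embeddings phi(s') and the forward net's prediction
# of phi(s'), shaped (batch, feature_dim).
next_phi = torch.tensor([[0.1, 0.2], [0.3, 0.4]])
predicted_next_phi = torch.tensor([[0.0, 0.2], [0.3, 0.1]])
eta = 1.0  # assumed intrinsic-reward scale

# Per-sample forward error: 0.5 * ||predicted_next_phi - next_phi||^2.
forward_l2_norm_squared = 0.5 * torch.sum(
    (predicted_next_phi - next_phi) ** 2, dim=-1)
intrinsic_reward = eta * forward_l2_norm_squared  # added to the env rewards
print(intrinsic_reward)  # tensor([0.0050, 0.0450])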
Example #2
    def custom_loss(self, policy_loss, loss_inputs):
        """Calculates a custom loss on top of the given policy_loss(es).

        Args:
            policy_loss (List[TensorType]): The list of already calculated
                policy losses (as many as there are optimizers).
            loss_inputs: Struct of np.ndarrays holding the
                entire train batch.

        Returns:
            List[TensorType]: The altered list of policy losses. In case the
                custom loss should have its own optimizer, make sure the
                returned list is one larger than the incoming policy_loss list.
                In case you simply want to mix in the custom loss into the
                already calculated policy losses, return a list of altered
                policy losses (as done in this example below).
        """
        # Get the next batch from our input files.
        batch = self.reader.next()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(
            torch.from_numpy(batch["obs"]).float().to(policy_loss[0].device),
            self.obs_space,
            tensorlib="torch",
        )
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = TorchCategorical(logits, self.model_config)
        imitation_loss = torch.mean(
            -action_dist.logp(
                torch.from_numpy(batch["actions"]).to(policy_loss[0].device)
            )
        )
        self.imitation_loss_metric = imitation_loss.item()
        self.policy_loss_metric = np.mean([loss.item() for loss in policy_loss])

        # Add the imitation loss to each already calculated policy loss term.
        # Alternatively (if custom loss has its own optimizer):
        # return policy_loss + [10 * self.imitation_loss]
        return [loss_ + 10 * imitation_loss for loss_ in policy_loss]
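
The docstring's return contract (a same-length list to mix the custom loss in, or a one-element-longer list to give it its own optimizer) can be illustrated with a toy snippet; the tensors here are placeholders, not real policy losses:

import torch

policy_loss = [torch.tensor(1.5)]   # one loss per existing optimizer
imitation_loss = torch.tensor(0.2)  # hypothetical custom loss

# (a) Mix the custom loss into the existing policy losses (same list length).
mixed = [loss_ + 10 * imitation_loss for loss_ in policy_loss]

# (b) Keep it separate so it can get its own optimizer (list grows by one).
separate = policy_loss + [10 * imitation_loss]

assert len(mixed) == len(policy_loss)
assert len(separate) == len(policy_loss) + 1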
Example #3
    def get_exploration_loss(self, policy_loss, train_batch: SampleBatchType):
        """Adds the loss for the inverse and forward models to policy_loss.
        """
        batch_size = train_batch[SampleBatch.OBS].shape[0]
        phis, _ = self.model._curiosity_feature_net({
            SampleBatch.OBS: torch.cat(
                [
                    train_batch[SampleBatch.OBS],
                    train_batch[SampleBatch.NEXT_OBS]
                ],
                dim=0)
        })
        phi, next_phi = phis[:batch_size], phis[batch_size:]
        # Inverse loss term (predicted action that led from phi to phi' vs.
        # actual action taken).
        phi_next_phi = torch.cat([phi, next_phi], dim=-1)
        dist_inputs = self.model._curiosity_inverse_fcnet(phi_next_phi)
        action_dist = TorchCategorical(dist_inputs, self.model)
        # Neg log(p); p=probability of observed action given the inverse-NN
        # predicted action distribution.
        inverse_loss = -action_dist.logp(train_batch[SampleBatch.ACTIONS])
        inverse_loss = torch.mean(inverse_loss)

        # Forward loss term (predicted phi', given phi and one-hot action, vs.
        # actually observed phi'); weighted with beta below.
        predicted_next_phi = self.model._curiosity_forward_fcnet(
            torch.cat(
                [
                    phi,
                    F.one_hot(
                        train_batch[SampleBatch.ACTIONS].long(),
                        num_classes=self.action_space.n).float()
                ],
                dim=-1))
        forward_loss = torch.mean(0.5 * torch.sum(
            torch.pow(predicted_next_phi - next_phi, 2.0), dim=-1))

        # Append our loss to the policy loss(es).
        return policy_loss + [
            (1.0 - self.beta) * inverse_loss + self.beta * forward_loss
        ]
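
The forward model's input in Examples #1 and #3 is the concatenation of phi(s) with a one-hot encoding of the discrete action. A shape-only sketch with made-up dimensions (feature_dim=4, three actions):

import torch
import torch.nn.functional as F

batch_size, feature_dim, num_actions = 2, 4, 3
phi = torch.randn(batch_size, feature_dim)  # stand-in for phi(s)
actions = torch.tensor([0, 2])              # discrete actions taken

# Forward-model input: [phi(s), one_hot(a)] -> (batch, feature_dim + num_actions).
forward_in = torch.cat(
    [phi, F.one_hot(actions, num_classes=num_actions).float()], dim=-1)
print(forward_in.shape)  # torch.Size([2, 7])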
Example #4
    def compute_actions_from_input_dict(
        self,
        input_dict: Dict[str, TensorType],
        explore: bool = None,
        timestep: Optional[int] = None,
        **kwargs,
    ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:

        obs_batch = input_dict[SampleBatch.OBS]
        state_batches = []
        i = 0
        while f"state_in_{i}" in input_dict:
            state_batches.append(input_dict[f"state_in_{i}"])
            i += 1

        explore = explore if explore is not None else self.config["explore"]
        obs_batch, action_mask, _ = self._unpack_observation(obs_batch)
        # We need to ensure we do not use the env global state
        # to compute actions

        # Compute actions
        with torch.no_grad():
            q_values, hiddens = _mac(
                self.model,
                torch.as_tensor(obs_batch,
                                dtype=torch.float,
                                device=self.device),
                [
                    torch.as_tensor(
                        np.array(s), dtype=torch.float, device=self.device)
                    for s in state_batches
                ],
            )
            avail = torch.as_tensor(action_mask,
                                    dtype=torch.float,
                                    device=self.device)
            masked_q_values = q_values.clone()
            masked_q_values[avail == 0.0] = -float("inf")
            masked_q_values_folded = torch.reshape(
                masked_q_values, [-1] + list(masked_q_values.shape)[2:])
            actions, _ = self.exploration.get_exploration_action(
                action_distribution=TorchCategorical(masked_q_values_folded),
                timestep=timestep,
                explore=explore,
            )
            actions = (torch.reshape(
                actions,
                list(masked_q_values.shape)[:-1]).cpu().numpy())
            hiddens = [s.cpu().numpy() for s in hiddens]

        return tuple(actions.transpose([1, 0])), hiddens, {}
Example #5
    def custom_loss(self, policy_loss, loss_inputs):
        # Create a new input reader per worker.
        reader = JsonReader(self.input_files)
        batch = reader.next()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(
            torch.from_numpy(batch["obs"]).float(),
            self.obs_space,
            tensorlib="torch")
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = TorchCategorical(logits, self.model_config)
        self.policy_loss = policy_loss
        self.imitation_loss = torch.mean(
            -action_dist.logp(torch.from_numpy(batch["actions"])))
        return policy_loss + 10 * self.imitation_loss
Example #6
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        explore=None,
                        timestep=None,
                        **kwargs):
        explore = explore if explore is not None else self.config["explore"]
        obs_batch, action_mask, _ = self._unpack_observation(obs_batch)
        # We need to ensure we do not use the env global state
        # to compute actions

        # Compute actions
        with torch.no_grad():
            q_values, hiddens = _mac(
                self.model,
                torch.as_tensor(obs_batch,
                                dtype=torch.float,
                                device=self.device),
                [
                    torch.as_tensor(
                        np.array(s), dtype=torch.float, device=self.device)
                    for s in state_batches
                ],
            )
            avail = torch.as_tensor(action_mask,
                                    dtype=torch.float,
                                    device=self.device)
            masked_q_values = q_values.clone()
            masked_q_values[avail == 0.0] = -float("inf")
            masked_q_values_folded = torch.reshape(
                masked_q_values, [-1] + list(masked_q_values.shape)[2:])
            actions, _ = self.exploration.get_exploration_action(
                action_distribution=TorchCategorical(masked_q_values_folded),
                timestep=timestep,
                explore=explore,
            )
            actions = (torch.reshape(
                actions,
                list(masked_q_values.shape)[:-1]).cpu().numpy())
            hiddens = [s.cpu().numpy() for s in hiddens]

        return tuple(actions.transpose([1, 0])), hiddens, {}
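
The masking trick used in Examples #4 and #6 relies on softmax assigning zero probability to -inf logits, so unavailable actions can be chosen neither by sampling nor by the greedy argmax. A small illustration with plain torch ops instead of RLlib's TorchCategorical:

import torch

# Toy Q-values for four actions; actions 1 and 3 are unavailable.
q_values = torch.tensor([[1.0, 2.0, 0.5, 3.0]])
action_mask = torch.tensor([[1.0, 0.0, 1.0, 0.0]])

masked_q = q_values.clone()
masked_q[action_mask == 0.0] = -float("inf")

# softmax(-inf) == 0, so masked actions can never be sampled ...
probs = torch.softmax(masked_q, dim=-1)
print(probs)  # masked entries are exactly 0.0

# ... and the greedy argmax can never pick them either.
print(masked_q.argmax(dim=-1))  # tensor([0])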
Example #7
    def compute_actions(
        self,
        *,
        input_dict,
        explore=True,
        timestep=None,
        episodes=None,
        is_training=False,
        **kwargs
    ) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorStructType]]:
        if timestep is None:
            timestep = self.global_timestep
        # Compute the Q-values for each possible action, using our Q-value
        # network.
        q_vals = self._compute_q_values(self.model,
                                        input_dict[SampleBatch.OBS],
                                        is_training=is_training)
        # Use a Categorical distribution for the exploration component.
        # This way, it may either sample stochastically (e.g. when using SoftQ)
        # or deterministically/greedily (e.g. when using EpsilonGreedy).
        distribution = TorchCategorical(q_vals, self.model)
        # Call the exploration component's `get_exploration_action` method to
        # explore, if necessary.
        actions, logp = self.exploration.get_exploration_action(
            action_distribution=distribution,
            timestep=timestep,
            explore=explore)
        # Return (exploration) actions, state_outs (empty list), and extra outs.
        return (
            actions,
            [],
            {
                "q_values": q_vals,
                SampleBatch.ACTION_LOGP: logp,
                SampleBatch.ACTION_PROB: torch.exp(logp),
                SampleBatch.ACTION_DIST_INPUTS: q_vals,
            },
        )
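
The comment above notes that the same Categorical over Q-values supports both stochastic (SoftQ-style) and greedy (EpsilonGreedy-style) action selection. A tiny plain-torch sketch of the two readings of the same logits, with an assumed temperature:

import torch

q_vals = torch.tensor([[0.1, 2.0, 0.3]])
temperature = 1.0  # assumed SoftQ temperature

# Stochastic (SoftQ-like): sample from softmax(Q / temperature).
dist = torch.distributions.Categorical(logits=q_vals / temperature)
sampled_action = dist.sample()

# Deterministic/greedy (EpsilonGreedy's exploit branch): argmax over Q.
greedy_action = q_vals.argmax(dim=-1)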
Example #8
    def set_temperature_and_get_args(self, temperature, inputs):
        action_dist_class = TorchCategorical
        action_distribution = TorchCategorical(
            inputs, self.softqschedule.model, temperature=1.0)
        self.softqschedule.temperature = temperature
        return action_distribution, action_dist_class

    def _a2_distribution(self, a1):
        a1_vec = torch.unsqueeze(a1.float(), 1)
        _, a2_logits = self.model.action_module(self.inputs, a1_vec)
        a2_dist = TorchCategorical(a2_logits)
        return a2_dist

    def _a1_distribution(self):
        BATCH = self.inputs.shape[0]
        zeros = torch.zeros((BATCH, 1)).to(self.inputs.device)
        a1_logits, _ = self.model.action_module(self.inputs, zeros)
        a1_dist = TorchCategorical(a1_logits)
        return a1_dist

    def logp(self, actions):
        a1, a2 = actions[:, 0], actions[:, 1]
        a1_vec = torch.unsqueeze(a1.float(), 1)
        a1_logits, a2_logits = self.model.action_module(self.inputs, a1_vec)
        return (TorchCategorical(a1_logits).logp(a1) +
                TorchCategorical(a2_logits).logp(a2))
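
The logp method above factorizes the two-part action autoregressively: log p(a1, a2) = log p(a1) + log p(a2 | a1). Sampling would follow the same chain; a sketch of such a method, assuming the same action_module interface (the real distribution class may implement sampling differently):

    def sample(self):
        # Draw a1 first, then condition a2's logits on the sampled a1.
        a1_dist = self._a1_distribution()
        a1 = a1_dist.sample()
        a2_dist = self._a2_distribution(a1)
        a2 = a2_dist.sample()
        return torch.stack([a1, a2], dim=-1)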
Example #12
    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["eager"] = True
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps.
        train_batch = {
            SampleBatch.CUR_OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True])
        }

        # tf.
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()

        # Post-process (calculate simple (non-GAE) advantages) and attach to
        # train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch = pg.post_process_advantages(policy, train_batch)
        # Check Advantage values.
        check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        results = pg.pg_tf_loss(policy,
                                policy.model,
                                dist_class=Categorical,
                                train_batch=train_batch)

        # Calculate expected results.
        expected_logits = fc(
            fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
               vars[1].numpy()), vars[2].numpy(), vars[3].numpy())
        expected_logp = Categorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS])
        expected_loss = -np.mean(
            expected_logp * train_batch[Postprocessing.ADVANTAGES])
        check(results.numpy(), expected_loss, decimals=4)

        # Torch.
        config["use_pytorch"] = True
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        train_batch = policy._lazy_tensor_dict(train_batch)
        results = pg.pg_torch_loss(policy,
                                   policy.model,
                                   dist_class=TorchCategorical,
                                   train_batch=train_batch)
        expected_logits = policy.model.last_output()
        expected_logp = TorchCategorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS])
        expected_loss = -np.mean(
            expected_logp.detach().numpy() *
            train_batch[Postprocessing.ADVANTAGES].numpy())
        check(results.detach().numpy(), expected_loss, decimals=4)
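
The expected advantages in this test's comment are plain discounted reward-to-go sums with gamma = 0.99 (no value baseline, no GAE); a few lines of numpy reproduce them:

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 1.0, 1.0])

# Discounted reward-to-go: A_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
advantages = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    advantages[t] = running

print(advantages)  # [2.9701 1.99   1.    ]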