Example #1
    def policy(self, states):
        # Build the network input according to the trainer type.
        if isinstance(self.trainer, DQNTrainer):
            # Discrete-action DQN scores all actions from the state alone.
            input = states
        elif isinstance(self.trainer, ParametricDQNTrainer):
            # Parametric DQN scores (state, action) pairs: tile one-hot actions
            # and repeat each state once per action, then concatenate.
            num_actions = len(self.trainer.action_normalization_parameters)
            actions = np.eye(num_actions, dtype=np.float32)
            actions = np.tile(actions, reps=(len(states), 1))
            states = np.repeat(states, repeats=num_actions, axis=0)
            input = np.hstack((states, actions))
        else:
            raise NotImplementedError("Invalid trainer passed to GymPredictor")
        q_scores = self.trainer.internal_prediction(input)
        if isinstance(self.trainer, DQNTrainer):
            assert q_scores.shape[0] == 1
            q_scores = q_scores[0]
        # Temperature softmax over the Q-values; fall back to a uniform
        # distribution when the result is degenerate (NaNs or all-tiny probabilities).
        q_scores_softmax = softmax(
            torch.from_numpy(q_scores.reshape(1, -1)), self.trainer.rl_temperature
        ).numpy()[0]
        if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
            q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
        # Return both the greedy action and a softmax-sampled (exploratory) action.
        policies = [
            np.argmax(q_scores),
            np.random.choice(q_scores.shape[0], p=q_scores_softmax),
        ]
        return policies
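
The exploration step at the end of the method can be exercised in isolation. The sketch below is a minimal, self-contained approximation: softmax_with_temperature is a hypothetical stand-in for the softmax helper used above (assumed to divide the scores by the temperature before normalizing), and the uniform fallback mirrors the NaN / near-zero guard in the method.

    import numpy as np
    import torch

    def softmax_with_temperature(scores, temperature):
        # Stand-in for the softmax helper above: scale by the temperature,
        # then normalize along the action dimension.
        return torch.nn.functional.softmax(scores / temperature, dim=1)

    q_scores = np.array([1.0, 3.0, 2.0], dtype=np.float32)
    probs = softmax_with_temperature(
        torch.from_numpy(q_scores.reshape(1, -1)), 0.35
    ).numpy()[0]
    if np.isnan(probs).any() or np.max(probs) < 1e-3:
        probs[:] = 1.0 / probs.shape[0]

    # np.random.choice is strict about probabilities summing to 1, so
    # renormalize in float64 before sampling.
    probs = probs.astype(np.float64)
    probs /= probs.sum()

    greedy_action = np.argmax(q_scores)                            # exploit
    sampled_action = np.random.choice(q_scores.shape[0], p=probs)  # explore
    print(greedy_action, sampled_action)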
Example #2
    def policy(self, states):
        # Same logic as Example #1, except the input is packed as a list or
        # tuple and unpacked into internal_prediction, and the action count
        # comes from self.action_dim instead of the normalization parameters.
        if isinstance(self.trainer, DQNTrainer):
            input = [states]
        elif isinstance(self.trainer, ParametricDQNTrainer):
            num_actions = self.action_dim
            actions = np.eye(num_actions, dtype=np.float32)
            actions = np.tile(actions, reps=(len(states), 1))
            states = np.repeat(states, repeats=num_actions, axis=0)
            input = (states, actions)
        else:
            raise NotImplementedError("Invalid trainer passed to GymPredictor")
        q_scores = self.trainer.internal_prediction(*input)
        if isinstance(self.trainer, DQNTrainer):
            assert q_scores.shape[0] == 1
            q_scores = q_scores[0]
        # Temperature softmax with the same uniform-distribution fallback as above.
        q_scores_softmax = softmax(
            torch.from_numpy(q_scores.reshape(1, -1)), self.trainer.rl_temperature
        ).numpy()[0]
        if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
            q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
        policies = [
            np.argmax(q_scores),
            np.random.choice(q_scores.shape[0], p=q_scores_softmax),
        ]
        return policies
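
The parametric branch in both examples pairs every state with every one-hot action before scoring. The toy NumPy snippet below (hypothetical shapes, not taken from the original code) shows what that tiling produces.

    import numpy as np

    states = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)  # 2 states, dim 2
    num_actions = 3

    actions = np.eye(num_actions, dtype=np.float32)                # (3, 3) one-hot rows
    actions = np.tile(actions, reps=(len(states), 1))              # (6, 3): identity stacked per state
    tiled_states = np.repeat(states, repeats=num_actions, axis=0)  # (6, 2): each state repeated 3x in a row

    pairs = np.hstack((tiled_states, actions))                     # (6, 5) state-action inputs
    print(pairs.shape)  # (6, 5)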
Example #3
    def train(self, training_samples: TrainingDataPage):

        if self.minibatch == 0:
            # Assume that the tensors are the right shape after the first minibatch
            assert (
                training_samples.states.shape[0] == self.minibatch_size
            ), "Invalid shape: " + str(training_samples.states.shape)
            assert training_samples.actions.shape == torch.Size(
                [self.minibatch_size, len(self._actions)]
            ), "Invalid shape: " + str(training_samples.actions.shape)
            assert training_samples.rewards.shape == torch.Size(
                [self.minibatch_size, 1]
            ), "Invalid shape: " + str(training_samples.rewards.shape)
            assert (
                training_samples.next_states.shape == training_samples.states.shape
            ), "Invalid shape: " + str(training_samples.next_states.shape)
            assert (
                training_samples.not_terminal.shape == training_samples.rewards.shape
            ), "Invalid shape: " + str(training_samples.not_terminal.shape)
            if training_samples.possible_next_actions_mask is not None:
                assert (
                    training_samples.possible_next_actions_mask.shape
                    == training_samples.actions.shape
                ), "Invalid shape: " + str(
                    training_samples.possible_next_actions_mask.shape
                )
            if training_samples.propensities is not None:
                assert (
                    training_samples.propensities.shape
                    == training_samples.rewards.shape
                ), "Invalid shape: " + str(training_samples.propensities.shape)
            if training_samples.metrics is not None:
                assert (
                    training_samples.metrics.shape[0] == self.minibatch_size
                ), "Invalid shape: " + str(training_samples.metrics.shape)

        boosted_rewards = self.boost_rewards(training_samples.rewards,
                                             training_samples.actions)

        self.minibatch += 1
        states = training_samples.states.detach().requires_grad_(True)
        actions = training_samples.actions
        rewards = boosted_rewards
        discount_tensor = torch.full(training_samples.time_diffs.shape,
                                     self.gamma).type(self.dtype)
        not_done_mask = training_samples.not_terminal

        if self.use_seq_num_diff_as_time_diff:
            time_diff = training_samples.time_diffs / self.time_diff_unit_length
            discount_tensor = discount_tensor.pow(time_diff)

        all_next_q_values, all_next_q_values_target = self.get_detached_q_values(
            training_samples.next_states)

        if self.bcq:
            # Batch constrained q-learning
            on_policy_actions = self.bcq_imitator(training_samples.next_states)
            on_policy_action_probs = softmax(on_policy_actions, temperature=1)
            filter_values = (
                on_policy_action_probs /
                on_policy_action_probs.max(keepdim=True, dim=1)[0])
            action_on_policy = (filter_values >=
                                self.bcq_drop_threshold).float()
            training_samples.possible_next_actions_mask *= action_on_policy
        if self.maxq_learning:
            # Compute max a' Q(s', a') over all possible actions using target network
            next_q_values, max_q_action_idxs = self.get_max_q_values_with_target(
                all_next_q_values,
                all_next_q_values_target,
                training_samples.possible_next_actions_mask,
            )
        else:
            # SARSA
            next_q_values, max_q_action_idxs = self.get_max_q_values_with_target(
                all_next_q_values,
                all_next_q_values_target,
                training_samples.next_actions,
            )

        filtered_next_q_vals = next_q_values * not_done_mask

        if self.minibatch < self.reward_burnin:
            target_q_values = rewards
        else:
            target_q_values = rewards + (discount_tensor *
                                         filtered_next_q_vals)

        # Get Q-value of action taken
        all_q_values = self.q_network(states)
        self.all_action_scores = all_q_values.detach()
        q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

        loss = self.q_network_loss(q_values, target_q_values)
        self.loss = loss.detach()

        self.q_network_optimizer.zero_grad()
        loss.backward()
        if self.gradient_handler:
            self.gradient_handler(self.q_network.parameters())
        if self.clip_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.q_network.parameters(),
                                           self.clip_grad_norm)
        self.q_network_optimizer.step()

        if self.minibatch < self.reward_burnin:
            # Reward burnin: force target network
            self._soft_update(self.q_network, self.q_network_target, 1.0)
        else:
            # Use the soft update rule to update target network
            self._soft_update(self.q_network, self.q_network_target, self.tau)

        bcq_loss = None
        if self.bcq:
            # Batch constrained q-learning
            action_preds = self.bcq_imitator(states)
            imitator_loss = torch.nn.CrossEntropyLoss()
            # Classification label is index of action with value 1
            bcq_loss = imitator_loss(action_preds,
                                     torch.max(actions, dim=1)[1])
            self.bcq_imitator_optimizer.zero_grad()
            bcq_loss.backward()
            self.bcq_imitator_optimizer.step()

        logged_action_idxs = actions.argmax(dim=1, keepdim=True)
        reward_loss, model_rewards, model_propensities = self.calculate_cpes(
            training_samples,
            states,
            logged_action_idxs,
            max_q_action_idxs,
            discount_tensor,
            not_done_mask,
        )

        self.loss_reporter.report(
            td_loss=self.loss,
            imitator_loss=bcq_loss,
            reward_loss=reward_loss,
            logged_actions=logged_action_idxs,
            logged_propensities=training_samples.propensities,
            logged_rewards=rewards,
            logged_values=None,  # Compute at end of each epoch for CPE
            model_propensities=model_propensities,
            model_rewards=model_rewards,
            model_values=self.all_action_scores,
            model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
            model_action_idxs=self.get_max_q_values(
                self.all_action_scores, training_samples.possible_actions_mask
            )[1],
        )
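
The training step calls self._soft_update but the helper itself is not part of the excerpt. A common implementation is Polyak averaging of the online network into the target network; the sketch below is an assumption about what such a helper typically looks like, not the repository's code. With tau=1.0 it reduces to a hard copy, which matches the reward-burnin branch above.

    import torch

    def soft_update(network, target_network, tau):
        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        # tau = 1.0 copies the online weights outright (reward burn-in);
        # a small tau (e.g. self.tau) moves the target slowly toward the online net.
        with torch.no_grad():
            for param, target_param in zip(
                network.parameters(), target_network.parameters()
            ):
                target_param.copy_(tau * param + (1.0 - tau) * target_param)

    # Usage, assuming the attribute names from the excerpt:
    # soft_update(self.q_network, self.q_network_target, self.tau)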