    def gather_static_shared_stats(
            self, evaluation_dataset_as_transitions: List[Transition],
            batch_size: int, reward_model: Architecture,
            network_keys: List) -> None:
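        """
        Gather the statistics from the evaluation dataset that do not depend on the
        policy being evaluated: reward model predictions, logged rewards, taken actions
        and the behavior policy's probabilities for those actions. The results are stored
        on the instance so they can be shared across evaluation runs.
        """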
        all_reward_model_rewards = []
        all_old_policy_probs = []
        all_rewards = []
        all_actions = []

        for i in range(
                math.ceil(len(evaluation_dataset_as_transitions) /
                          batch_size)):
            batch = evaluation_dataset_as_transitions[i * batch_size:(i + 1) *
                                                      batch_size]
            batch_for_inference = Batch(batch)

            all_reward_model_rewards.append(
                reward_model.predict(batch_for_inference.states(network_keys)))
            all_rewards.append(batch_for_inference.rewards())
            all_actions.append(batch_for_inference.actions())
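            # probability that the behavior (old) policy assigned to each taken action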
            all_old_policy_probs.append(
                batch_for_inference.info('all_action_probabilities')[
                    range(len(batch_for_inference.actions())),
                    batch_for_inference.actions()])

        self.all_reward_model_rewards = np.concatenate(
            all_reward_model_rewards, axis=0)
        self.all_old_policy_probs = np.concatenate(all_old_policy_probs,
                                                   axis=0)
        self.all_rewards = np.concatenate(all_rewards, axis=0)
        self.all_actions = np.concatenate(all_actions, axis=0)

        # mark that the static shared data was collected and is ready to be used
        self.is_gathered_static_shared_data = True
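The arrays gathered above depend only on the logged data and the reward model, not on the policy being evaluated. As an illustrative sketch (not part of the library), a direct-method style value estimate could be computed from all_reward_model_rewards together with a hypothetical policy_probs array of shape (N, num_actions) holding the evaluated policy's per-action probabilities:

import numpy as np

def direct_method_estimate(all_reward_model_rewards: np.ndarray,
                           policy_probs: np.ndarray) -> float:
    # Expected reward of each transition under the evaluated policy, according
    # to the reward model, averaged over the whole evaluation dataset.
    per_transition_values = np.sum(policy_probs * all_reward_model_rewards, axis=1)
    return float(np.mean(per_transition_values))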
Example #2
    def _prepare_ope_shared_stats(dataset_as_transitions: List[Transition],
                                  batch_size: int, reward_model: Architecture,
                                  q_network: Architecture,
                                  network_keys: List) -> OpeSharedStats:
        """
        Do the preparations needed for different estimators.
        Some of the calculations are shared, so we centralize all the work here.

        :param dataset_as_transitions: The evaluation dataset in the form of transitions.
        :param batch_size: The batch size to use.
        :param reward_model: A reward model to be used by the DR (doubly robust) estimator.
        :param q_network: The Q network whose policy we evaluate.
        :param network_keys: The network keys used for feeding the neural networks.
        :return: An OpeSharedStats tuple holding the aggregated statistics.
        """
        # IPS
        all_reward_model_rewards, all_policy_probs, all_old_policy_probs = [], [], []
        all_v_values_reward_model_based, all_v_values_q_model_based, all_rewards, all_actions = [], [], [], []

        for i in range(math.ceil(len(dataset_as_transitions) / batch_size)):
            batch = dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
            batch_for_inference = Batch(batch)

            all_reward_model_rewards.append(
                reward_model.predict(batch_for_inference.states(network_keys)))

            # We always use the first Q head to calculate the OPEs; this might change in the future.
            # For instance, for a bootstrapped DQN this means only the first Q head is used for the OPEs.
            q_values, sm_values = q_network.predict(
                batch_for_inference.states(network_keys),
                outputs=[
                    q_network.output_heads[0].q_values,
                    q_network.output_heads[0].softmax
                ])

            all_policy_probs.append(sm_values)
            all_v_values_reward_model_based.append(
                np.sum(all_policy_probs[-1] * all_reward_model_rewards[-1],
                       axis=1))
            all_v_values_q_model_based.append(
                np.sum(all_policy_probs[-1] * q_values, axis=1))
            all_rewards.append(batch_for_inference.rewards())
            all_actions.append(batch_for_inference.actions())
            all_old_policy_probs.append(
                batch_for_inference.info('all_action_probabilities')[
                    range(len(batch_for_inference.actions())),
                    batch_for_inference.actions()])

            for j, t in enumerate(batch):
                t.update_info({
                    'q_value': q_values[j],
                    'softmax_policy_prob': all_policy_probs[-1][j],
                    'v_value_q_model_based': all_v_values_q_model_based[-1][j],
                })

        all_reward_model_rewards = np.concatenate(all_reward_model_rewards,
                                                  axis=0)
        all_policy_probs = np.concatenate(all_policy_probs, axis=0)
        all_v_values_reward_model_based = np.concatenate(
            all_v_values_reward_model_based, axis=0)
        all_rewards = np.concatenate(all_rewards, axis=0)
        all_actions = np.concatenate(all_actions, axis=0)
        all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)

        # probabilities of the taken actions under the evaluated policy, and the
        # per-transition importance sampling ratios w.r.t. the behavior (old) policy
        new_policy_prob = all_policy_probs[np.arange(all_actions.shape[0]),
                                           all_actions]
        rho_all_dataset = new_policy_prob / all_old_policy_probs

        return OpeSharedStats(all_reward_model_rewards, all_policy_probs,
                              all_v_values_reward_model_based, all_rewards,
                              all_actions, all_old_policy_probs,
                              new_policy_prob, rho_all_dataset)
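Given the returned OpeSharedStats, the basic importance-sampling estimators follow directly from rho_all_dataset and all_rewards. The sketch below is only a minimal illustration of those standard formulas, assuming the fields computed above; it is not the library's own estimator code:

import numpy as np

def ips_estimates(rho_all_dataset: np.ndarray, all_rewards: np.ndarray):
    # Ordinary importance sampling: reweight each logged reward by the ratio of
    # the evaluated policy's to the behavior policy's action probability.
    ips = float(np.mean(rho_all_dataset * all_rewards))
    # Weighted (self-normalized) importance sampling: dividing by the sum of the
    # ratios trades a small bias for lower variance.
    wis = float(np.sum(rho_all_dataset * all_rewards) / np.sum(rho_all_dataset))
    return ips, wis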
Example #3
    def train_policy_network(self, dataset, epochs):
        loss = []
        for j in range(epochs):
            loss = {
                'total_loss': [],
                'policy_losses': [],
                'unclipped_grads': [],
                'fetch_result': []
            }
            #shuffle(dataset)
            batch_size = self.ap.network_wrappers['actor'].batch_size
            for i in range(len(dataset) // batch_size):
                batch = Batch(dataset[i * batch_size:(i + 1) * batch_size])

                network_keys = self.ap.network_wrappers[
                    'actor'].input_embedders_parameters.keys()

                advantages = batch.info('advantage')
                actions = batch.actions()
                if not isinstance(self.spaces.action, DiscreteActionSpace) \
                        and len(actions.shape) == 1:
                    actions = np.expand_dims(actions, -1)

                # get old policy probabilities and distribution
                old_policy = force_list(
                    self.networks['actor'].target_network.predict(
                        batch.states(network_keys)))

                # calculate gradients and apply on both the local policy network and on the global policy network
                fetches = [
                    self.networks['actor'].online_network.output_heads[0].kl_divergence,
                    self.networks['actor'].online_network.output_heads[0].entropy
                ]

                inputs = copy.copy(batch.states(network_keys))
                inputs['output_0_0'] = actions

                # old_policy_distribution needs to be represented as a list, because in the event of discrete controls,
                # it has just a mean. otherwise, it has both a mean and standard deviation
                for input_index, input in enumerate(old_policy):
                    inputs['output_0_{}'.format(input_index + 1)] = input

                total_loss, policy_losses, unclipped_grads, fetch_result =\
                    self.networks['actor'].online_network.accumulate_gradients(
                        inputs, [advantages], additional_fetches=fetches)

                self.networks['actor'].apply_gradients_to_online_network()
                if isinstance(self.ap.task_parameters,
                              DistributedTaskParameters):
                    self.networks['actor'].apply_gradients_to_global_network()

                self.networks[
                    'actor'].online_network.reset_accumulated_gradients()

                loss['total_loss'].append(total_loss)
                loss['policy_losses'].append(policy_losses)
                loss['unclipped_grads'].append(unclipped_grads)
                loss['fetch_result'].append(fetch_result)

                self.unclipped_grads.add_sample(unclipped_grads)

            for key in loss.keys():
                loss[key] = np.mean(loss[key], 0)

            if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
                curr_learning_rate = self.networks[
                    'critic'].online_network.get_variable_value(
                        self.ap.learning_rate)
                self.curr_learning_rate.add_sample(curr_learning_rate)
            else:
                curr_learning_rate = self.ap.network_wrappers[
                    'critic'].learning_rate

            # log training parameters
            screen.log_dict(OrderedDict([
                ("Surrogate loss", loss['policy_losses'][0]),
                ("KL divergence", loss['fetch_result'][0]),
                ("Entropy", loss['fetch_result'][1]),
                ("training epoch", j),
                ("learning_rate", curr_learning_rate)
            ]), prefix="Policy training")

        self.total_kl_divergence_during_training_process = loss[
            'fetch_result'][0]
        self.entropy.add_sample(loss['fetch_result'][1])
        self.kl_divergence.add_sample(loss['fetch_result'][0])
        return loss['total_loss']
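The loop above feeds advantages, taken actions and the old policy's outputs into the actor's output head and lets that head compute the surrogate loss, so the exact objective is not visible here. As a hedged reference only, one common surrogate for this kind of actor update is the clipped likelihood-ratio objective; the NumPy sketch below assumes hypothetical arrays of per-action probabilities under the new and old policies:

import numpy as np

def clipped_surrogate_loss(new_probs: np.ndarray, old_probs: np.ndarray,
                           advantages: np.ndarray, clip_eps: float = 0.2) -> float:
    # Likelihood ratio between the updated policy and the policy that generated the data.
    ratio = new_probs / old_probs
    # Clipping keeps a single update from moving the policy too far from the old one.
    clipped_ratio = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    # A PPO-style update maximizes the pessimistic surrogate; negate it to obtain a loss.
    return float(-np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages)))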