    def improve_reward_model(self, epochs: int):
        """
        Train a reward model to be used by the doubly-robust estimator

        :param epochs: The total number of epochs to use for training a reward model
        :return: None
        """
        batch_size = self.ap.network_wrappers['reward_model'].batch_size
        network_keys = self.ap.network_wrappers[
            'reward_model'].input_embedders_parameters.keys()

        # the reward model is fitted from the training dataset
        for epoch in range(epochs):
            loss = 0
            total_transitions_processed = 0
            for i, batch in enumerate(
                    self.call_memory('get_shuffled_data_generator',
                                     batch_size)):
                batch = Batch(batch)
                current_rewards_prediction_for_all_actions = self.networks[
                    'reward_model'].online_network.predict(
                        batch.states(network_keys))
                current_rewards_prediction_for_all_actions[
                    range(batch.size), batch.actions()] = batch.rewards()
                loss += self.networks['reward_model'].train_and_sync_networks(
                    batch.states(network_keys),
                    current_rewards_prediction_for_all_actions)[0]
                total_transitions_processed += batch.size

            log = OrderedDict()
            log['Epoch'] = epoch
            log['loss'] = loss / total_transitions_processed
            screen.log_dict(log, prefix='Training Reward Model')
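
The target construction above relies on NumPy fancy indexing: the model's own per-action predictions are copied and only the entries for the actions actually taken are overwritten with the observed rewards, so the regression loss is non-zero only for the taken actions. A minimal standalone sketch of that trick (illustrative arrays, not part of the agent):

import numpy as np

# predicted per-action rewards for a batch of 3 states and 2 actions
predictions = np.array([[0.1, 0.2],
                        [0.3, 0.4],
                        [0.5, 0.6]])
actions = np.array([1, 0, 1])        # action taken in each transition
rewards = np.array([1.0, 0.0, 0.5])  # observed reward for that action

targets = predictions.copy()
targets[np.arange(len(actions)), actions] = rewards
# targets == [[0.1, 1.0], [0.0, 0.4], [0.5, 0.5]]
# regressing the network towards `targets` only penalizes the taken-action entries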
Example #2
    def get_reward_model_loss(self, batch: Batch):
        network_keys = self.ap.network_wrappers[
            'reward_model'].input_embedders_parameters.keys()
        current_rewards_prediction_for_all_actions = self.networks[
            'reward_model'].online_network.predict(batch.states(network_keys))
        current_rewards_prediction_for_all_actions[
            range(batch.size), batch.actions()] = batch.rewards()

        return self.networks['reward_model'].train_and_sync_networks(
            batch.states(network_keys),
            current_rewards_prediction_for_all_actions)[0]
    def gather_static_shared_stats(
            self, evaluation_dataset_as_transitions: List[Transition],
            batch_size: int, reward_model: Architecture,
            network_keys: List) -> None:
        all_reward_model_rewards = []
        all_old_policy_probs = []
        all_rewards = []
        all_actions = []

        for i in range(
                math.ceil(len(evaluation_dataset_as_transitions) /
                          batch_size)):
            batch = evaluation_dataset_as_transitions[i * batch_size:(i + 1) *
                                                      batch_size]
            batch_for_inference = Batch(batch)

            all_reward_model_rewards.append(
                reward_model.predict(batch_for_inference.states(network_keys)))
            all_rewards.append(batch_for_inference.rewards())
            all_actions.append(batch_for_inference.actions())
            all_old_policy_probs.append(
                batch_for_inference.info('all_action_probabilities')[
                    range(len(batch_for_inference.actions())),
                    batch_for_inference.actions()])

        self.all_reward_model_rewards = np.concatenate(
            all_reward_model_rewards, axis=0)
        self.all_old_policy_probs = np.concatenate(all_old_policy_probs,
                                                   axis=0)
        self.all_rewards = np.concatenate(all_rewards, axis=0)
        self.all_actions = np.concatenate(all_actions, axis=0)

        # mark that the static shared data has been collected and is ready to be used
        self.is_gathered_static_shared_data = True
Example #4
    def train_value_network(self, dataset, epochs):
        loss = []
        batch = Batch(dataset)
        network_keys = self.ap.network_wrappers[
            'critic'].input_embedders_parameters.keys()

        # * Found not to have any impact *
        # add a timestep to the observation
        # current_states_with_timestep = self.concat_state_and_timestep(dataset)

        mix_fraction = self.ap.algorithm.value_targets_mix_fraction
        total_returns = batch.n_step_discounted_rewards(True)
        for j in range(epochs):
            curr_batch_size = batch.size
            if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
                curr_batch_size = self.ap.network_wrappers['critic'].batch_size
            for i in range(batch.size // curr_batch_size):
                # split to batches for first order optimization techniques
                current_states_batch = {
                    k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
                    for k, v in batch.states(network_keys).items()
                }
                total_return_batch = total_returns[
                    i * curr_batch_size:(i + 1) * curr_batch_size]
                old_policy_values = force_list(
                    self.networks['critic'].target_network.predict(
                        current_states_batch).squeeze())
                if self.networks[
                        'critic'].online_network.optimizer_type != 'LBFGS':
                    targets = total_return_batch
                else:
                    current_values = self.networks[
                        'critic'].online_network.predict(current_states_batch)
                    targets = current_values * (
                        1 - mix_fraction) + total_return_batch * mix_fraction

                inputs = copy.copy(current_states_batch)
                for input_index, input in enumerate(old_policy_values):
                    name = 'output_0_{}'.format(input_index)
                    if name in self.networks['critic'].online_network.inputs:
                        inputs[name] = input

                value_loss = self.networks[
                    'critic'].online_network.accumulate_gradients(
                        inputs, targets)

                self.networks['critic'].apply_gradients_to_online_network()
                if isinstance(self.ap.task_parameters,
                              DistributedTaskParameters):
                    self.networks['critic'].apply_gradients_to_global_network()
                self.networks[
                    'critic'].online_network.reset_accumulated_gradients()

                loss.append([value_loss[0]])
        loss = np.mean(loss, 0)
        return loss
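
When the critic is optimized with LBFGS, the targets above are a convex combination of the current value predictions and the empirical discounted returns, which limits how far a single full-batch update can pull the critic. A hedged sketch of just that mixing step (the function name and arrays are illustrative):

import numpy as np

def mixed_value_targets(current_values: np.ndarray,
                        total_returns: np.ndarray,
                        mix_fraction: float) -> np.ndarray:
    # move the value targets only part of the way towards the empirical returns
    return current_values * (1.0 - mix_fraction) + total_returns * mix_fraction

# with mix_fraction=0.1 the targets stay close to the current predictions
print(mixed_value_targets(np.array([1.0, 2.0]), np.array([3.0, 0.0]), 0.1))
# -> [1.2 1.8]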
Example #5
    def fill_advantages(self, batch):
        batch = Batch(batch)
        network_keys = self.ap.network_wrappers[
            'critic'].input_embedders_parameters.keys()

        # * Found not to have any impact *
        # current_states_with_timestep = self.concat_state_and_timestep(batch)

        current_state_values = self.networks['critic'].online_network.predict(
            batch.states(network_keys)).squeeze()
        total_returns = batch.n_step_discounted_rewards()
        # calculate advantages
        advantages = []
        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            advantages = total_returns - current_state_values
        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            episode_start_idx = 0
            advantages = np.array([])
            # current_state_values[batch.game_overs()] = 0
            for idx, game_over in enumerate(batch.game_overs()):
                if game_over:
                    # get advantages for the rollout
                    value_bootstrapping = np.zeros((1, ))
                    rollout_state_values = np.append(
                        current_state_values[episode_start_idx:idx + 1],
                        value_bootstrapping)

                    rollout_advantages, _ = \
                        self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
                                                                     rollout_state_values)
                    episode_start_idx = idx + 1
                    advantages = np.append(advantages, rollout_advantages)
        else:
            screen.warning(
                "WARNING: The requested policy gradient rescaler is not available"
            )

        # standardize
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        # TODO: this will be problematic with a shared memory
        for transition, advantage in zip(self.memory.transitions, advantages):
            transition.info['advantage'] = advantage

        self.action_advantages.add_sample(advantages)
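
The GAE branch delegates the per-rollout computation to get_general_advantage_estimation_values; a minimal reference implementation of generalized advantage estimation for a single rollout is sketched below (the function name and the discount/lambda arguments are assumptions, not the agent's actual signature):

import numpy as np

def gae_advantages(rewards: np.ndarray, values: np.ndarray,
                   discount: float = 0.99, gae_lambda: float = 0.95) -> np.ndarray:
    # `values` has one more entry than `rewards`: the bootstrap value appended above
    deltas = rewards + discount * values[1:] - values[:-1]
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages

# a 3-step rollout with a terminal bootstrap value of 0, as in the loop above
print(gae_advantages(np.array([1.0, 0.0, 1.0]), np.array([0.5, 0.4, 0.3, 0.0])))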
Example #6
    @staticmethod
    def _prepare_ope_shared_stats(dataset_as_transitions: List[Transition],
                                  batch_size: int, reward_model: Architecture,
                                  q_network: Architecture,
                                  network_keys: List) -> OpeSharedStats:
        """
        Do the preparations needed for different estimators.
        Some of the calculations are shared, so we centralize all the work here.

        :param dataset_as_transitions: The evaluation dataset in the form of transitions.
        :param batch_size: The batch size to use.
        :param reward_model: A reward model to be used by the doubly-robust (DR) estimator.
        :param q_network: The Q network whose policy we evaluate.
        :param network_keys: The network keys used for feeding the neural networks.
        :return: An OpeSharedStats object holding the precomputed statistics.
        """
        # IPS
        all_reward_model_rewards, all_policy_probs, all_old_policy_probs = [], [], []
        all_v_values_reward_model_based, all_v_values_q_model_based, all_rewards, all_actions = [], [], [], []

        for i in range(math.ceil(len(dataset_as_transitions) / batch_size)):
            batch = dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
            batch_for_inference = Batch(batch)

            all_reward_model_rewards.append(
                reward_model.predict(batch_for_inference.states(network_keys)))

            # We always use the first Q head to calculate the OPEs; this might change in the future.
            # For instance, for a bootstrapped agent this means only the first QHead is used for the OPEs.
            q_values, sm_values = q_network.predict(
                batch_for_inference.states(network_keys),
                outputs=[
                    q_network.output_heads[0].q_values,
                    q_network.output_heads[0].softmax
                ])

            all_policy_probs.append(sm_values)
            all_v_values_reward_model_based.append(
                np.sum(all_policy_probs[-1] * all_reward_model_rewards[-1],
                       axis=1))
            all_v_values_q_model_based.append(
                np.sum(all_policy_probs[-1] * q_values, axis=1))
            all_rewards.append(batch_for_inference.rewards())
            all_actions.append(batch_for_inference.actions())
            all_old_policy_probs.append(
                batch_for_inference.info('all_action_probabilities')[
                    range(len(batch_for_inference.actions())),
                    batch_for_inference.actions()])

            for j, t in enumerate(batch):
                t.update_info({
                    'q_value': q_values[j],
                    'softmax_policy_prob': all_policy_probs[-1][j],
                    'v_value_q_model_based': all_v_values_q_model_based[-1][j],
                })

        all_reward_model_rewards = np.concatenate(all_reward_model_rewards,
                                                  axis=0)
        all_policy_probs = np.concatenate(all_policy_probs, axis=0)
        all_v_values_reward_model_based = np.concatenate(
            all_v_values_reward_model_based, axis=0)
        all_rewards = np.concatenate(all_rewards, axis=0)
        all_actions = np.concatenate(all_actions, axis=0)
        all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)

        # generate model probabilities
        new_policy_prob = all_policy_probs[np.arange(all_actions.shape[0]),
                                           all_actions]
        rho_all_dataset = new_policy_prob / all_old_policy_probs

        return OpeSharedStats(all_reward_model_rewards, all_policy_probs,
                              all_v_values_reward_model_based, all_rewards,
                              all_actions, all_old_policy_probs,
                              new_policy_prob, rho_all_dataset)
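
With these shared statistics, the standard one-step off-policy estimators become a few lines of NumPy. A hedged sketch of inverse propensity scoring (IPS) and the doubly-robust (DR) estimate built from the same quantities (these helpers are illustrative, not the OpeSharedStats API):

import numpy as np

def ips_estimate(rho: np.ndarray, rewards: np.ndarray) -> float:
    # importance-weight the logged rewards by the new/old policy probability ratio
    return float(np.mean(rho * rewards))

def dr_estimate(rho: np.ndarray, rewards: np.ndarray,
                model_rewards_taken_action: np.ndarray,
                v_values_reward_model_based: np.ndarray) -> float:
    # direct-method baseline plus an importance-weighted correction of the
    # reward model's error on the actions that were actually taken
    correction = rho * (rewards - model_rewards_taken_action)
    return float(np.mean(v_values_reward_model_based + correction))

# usage with the arrays computed above (illustrative):
# model_r_taken = all_reward_model_rewards[np.arange(len(all_actions)), all_actions]
# dr = dr_estimate(rho_all_dataset, all_rewards, model_r_taken, all_v_values_reward_model_based)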
Example #7
    def train_policy_network(self, dataset, epochs):
        loss = []
        for j in range(epochs):
            loss = {
                'total_loss': [],
                'policy_losses': [],
                'unclipped_grads': [],
                'fetch_result': []
            }
            # shuffle(dataset)
            batch_size = self.ap.network_wrappers['actor'].batch_size
            for i in range(len(dataset) // batch_size):
                batch = Batch(dataset[i * batch_size:(i + 1) * batch_size])

                network_keys = self.ap.network_wrappers[
                    'actor'].input_embedders_parameters.keys()

                advantages = batch.info('advantage')
                actions = batch.actions()
                if not isinstance(self.spaces.action,
                                  DiscreteActionSpace) and len(
                                      actions.shape) == 1:
                    actions = np.expand_dims(actions, -1)

                # get old policy probabilities and distribution
                old_policy = force_list(
                    self.networks['actor'].target_network.predict(
                        batch.states(network_keys)))

                # calculate gradients and apply on both the local policy network and on the global policy network
                fetches = [
                    self.networks['actor'].online_network.output_heads[0].kl_divergence,
                    self.networks['actor'].online_network.output_heads[0].entropy
                ]

                inputs = copy.copy(batch.states(network_keys))
                inputs['output_0_0'] = actions

                # old_policy_distribution needs to be represented as a list, because in the event of discrete controls,
                # it has just a mean. otherwise, it has both a mean and standard deviation
                for input_index, input in enumerate(old_policy):
                    inputs['output_0_{}'.format(input_index + 1)] = input

                total_loss, policy_losses, unclipped_grads, fetch_result =\
                    self.networks['actor'].online_network.accumulate_gradients(
                        inputs, [advantages], additional_fetches=fetches)

                self.networks['actor'].apply_gradients_to_online_network()
                if isinstance(self.ap.task_parameters,
                              DistributedTaskParameters):
                    self.networks['actor'].apply_gradients_to_global_network()

                self.networks[
                    'actor'].online_network.reset_accumulated_gradients()

                loss['total_loss'].append(total_loss)
                loss['policy_losses'].append(policy_losses)
                loss['unclipped_grads'].append(unclipped_grads)
                loss['fetch_result'].append(fetch_result)

                self.unclipped_grads.add_sample(unclipped_grads)

            for key in loss.keys():
                loss[key] = np.mean(loss[key], 0)

            if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
                curr_learning_rate = self.networks[
                    'critic'].online_network.get_variable_value(
                        self.ap.learning_rate)
                self.curr_learning_rate.add_sample(curr_learning_rate)
            else:
                curr_learning_rate = self.ap.network_wrappers[
                    'critic'].learning_rate

            # log training parameters
            screen.log_dict(OrderedDict([
                ("Surrogate loss", loss['policy_losses'][0]),
                ("KL divergence", loss['fetch_result'][0]),
                ("Entropy", loss['fetch_result'][1]),
                ("training epoch", j),
                ("learning_rate", curr_learning_rate)
            ]), prefix="Policy training")

        self.total_kl_divergence_during_training_process = loss[
            'fetch_result'][0]
        self.entropy.add_sample(loss['fetch_result'][1])
        self.kl_divergence.add_sample(loss['fetch_result'][0])
        return loss['total_loss']
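
The actual policy loss lives in the actor's output head; as a rough illustration of the likelihood-ratio surrogate that such heads optimize, here is a generic sketch (this is not Coach's head implementation, and it omits any clipping or KL regularization the head may add):

import numpy as np

def surrogate_loss(new_log_probs: np.ndarray, old_log_probs: np.ndarray,
                   advantages: np.ndarray) -> float:
    # likelihood-ratio policy gradient surrogate: maximize E[pi_new/pi_old * A],
    # returned negated so it can be minimized by a gradient-descent optimizer
    ratio = np.exp(new_log_probs - old_log_probs)
    return float(-np.mean(ratio * advantages))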
Example #8
    def improve_reward_model(self, epochs: int):
        """
        Train both a reward model to be used by the doubly-robust estimator, and a model (an NN imitation model or kNN trees) to be used by BCQ to decide which actions to drop

        :param epochs: The total number of epochs to use for training a reward model
        :return: None
        """

        # we assume that these are drawn from the reward model's parameters
        batch_size = self.ap.network_wrappers['reward_model'].batch_size
        network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()

        # if using an NN to decide which actions to drop, we'll train it here
        if isinstance(self.ap.algorithm.action_drop_method_parameters, NNImitationModelParameters):
            total_epochs = max(epochs, self.ap.algorithm.action_drop_method_parameters.imitation_model_num_epochs)
        else:
            total_epochs = epochs

        for epoch in range(total_epochs):
            # this is fitted from the training dataset
            reward_model_loss = 0
            imitation_model_loss = 0
            total_transitions_processed = 0
            for i, batch in enumerate(self.call_memory('get_shuffled_training_data_generator', batch_size)):
                batch = Batch(batch)

                # reward model
                if epoch < epochs:
                    reward_model_loss += self.get_reward_model_loss(batch)

                # imitation model
                if isinstance(self.ap.algorithm.action_drop_method_parameters, NNImitationModelParameters) and \
                        epoch < self.ap.algorithm.action_drop_method_parameters.imitation_model_num_epochs:
                    target_actions = np.zeros((batch.size, len(self.spaces.action.actions)))
                    target_actions[range(batch.size), batch.actions()] = 1
                    imitation_model_loss += self.networks['imitation_model'].train_and_sync_networks(
                        batch.states(network_keys), target_actions)[0]

                total_transitions_processed += batch.size

            log = OrderedDict()
            log['Epoch'] = epoch

            if reward_model_loss:
                log['Reward Model Loss'] = reward_model_loss / total_transitions_processed
            if imitation_model_loss:
                log['Imitation Model Loss'] = imitation_model_loss / total_transitions_processed

            screen.log_dict(log, prefix='Training Batch RL Models')

        # if using a kNN-based model, we'll initialize and build it here.
        # initialization cannot be moved to the constructor as we don't have the agent's spaces initialized yet.
        if isinstance(self.ap.algorithm.action_drop_method_parameters, KNNParameters):
            knn_size = self.ap.algorithm.action_drop_method_parameters.knn_size
            if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state:
                self.knn_trees = [AnnoyDictionary(
                    dict_size=knn_size,
                    key_width=int(self.networks['reward_model'].online_network.state_embedding.shape[-1]),
                    batch_size=knn_size)
                    for _ in range(len(self.spaces.action.actions))]
            else:
                self.knn_trees = [AnnoyDictionary(
                    dict_size=knn_size,
                    key_width=self.spaces.state['observation'].shape[0],
                    batch_size=knn_size)
                    for _ in range(len(self.spaces.action.actions))]

            for i, knn_tree in enumerate(self.knn_trees):
                state_embeddings = self.embedding([transition.state for transition in self.memory.transitions
                                if transition.action == i])
                knn_tree.add(
                    keys=state_embeddings,
                    values=np.expand_dims(np.zeros(state_embeddings.shape[0]), axis=1))

            for knn_tree in self.knn_trees:
                knn_tree._rebuild_index()

            self.average_dist = [[dist[0] for dist in knn_tree._get_k_nearest_neighbors_indices(
                keys=self.embedding([transition.state for transition in self.memory.transitions]),
                k=1)[0]] for knn_tree in self.knn_trees]
            self.average_dist = sum([x for l in self.average_dist for x in l])  # flatten and sum
            self.average_dist /= len(self.memory.transitions)
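
Once self.average_dist is available, the kNN trees can be used at evaluation time to mask out actions whose nearest stored state embedding is far from the current state, relative to that average distance. A hedged sketch of such a filter; the threshold factor and the nearest_neighbor_distance helper are assumptions, not Coach's AnnoyDictionary API:

import numpy as np

def allowed_actions_mask(state_embedding, knn_trees, average_dist,
                         threshold_factor=2.0):
    # True for actions whose kNN tree contains a sufficiently close state embedding
    mask = np.zeros(len(knn_trees), dtype=bool)
    for action, tree in enumerate(knn_trees):
        # hypothetical helper: distance from the query embedding to its nearest stored key
        nearest_dist = tree.nearest_neighbor_distance(state_embedding)
        mask[action] = nearest_dist <= threshold_factor * average_dist
    return mask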