def gather_static_shared_stats(self,
                               evaluation_dataset_as_transitions: List[Transition],
                               batch_size: int,
                               reward_model: Architecture,
                               network_keys: List) -> None:
    all_reward_model_rewards = []
    all_old_policy_probs = []
    all_rewards = []
    all_actions = []

    for i in range(math.ceil(len(evaluation_dataset_as_transitions) / batch_size)):
        batch = evaluation_dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
        batch_for_inference = Batch(batch)

        all_reward_model_rewards.append(reward_model.predict(batch_for_inference.states(network_keys)))
        all_rewards.append(batch_for_inference.rewards())
        all_actions.append(batch_for_inference.actions())

        # gather, per transition, the behavior policy's probability of the action that was actually taken
        all_old_policy_probs.append(batch_for_inference.info('all_action_probabilities')
                                    [range(len(batch_for_inference.actions())),
                                     batch_for_inference.actions()])

    self.all_reward_model_rewards = np.concatenate(all_reward_model_rewards, axis=0)
    self.all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)
    self.all_rewards = np.concatenate(all_rewards, axis=0)
    self.all_actions = np.concatenate(all_actions, axis=0)

    # mark that the static shared data was collected and is ready to be used
    self.is_gathered_static_shared_data = True
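
# Illustration only -- not part of the original module. A minimal, self-contained sketch of
# the fancy-indexing used above to build all_old_policy_probs: indexing a
# [batch_size, num_actions] probability matrix with (range(batch_size), actions) gathers,
# per transition, the behavior policy's probability of the logged action. The numbers
# below are made up purely for illustration.
def _demo_gather_taken_action_probs():
    all_action_probabilities = np.array([[0.7, 0.2, 0.1],    # transition 0
                                         [0.1, 0.6, 0.3]])   # transition 1
    actions = np.array([0, 2])                               # logged action per transition
    old_policy_probs = all_action_probabilities[range(len(actions)), actions]
    assert np.allclose(old_policy_probs, [0.7, 0.3])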
def _prepare_ope_shared_stats(dataset_as_transitions: List[Transition],
                              batch_size: int,
                              reward_model: Architecture,
                              q_network: Architecture,
                              network_keys: List) -> OpeSharedStats:
    """
    Do the preparations needed for the different estimators.
    Some of the calculations are shared, so we centralize all the work here.

    :param dataset_as_transitions: The evaluation dataset in the form of transitions.
    :param batch_size: The batch size to use.
    :param reward_model: A reward model to be used by DR.
    :param q_network: The Q network whose policy we evaluate.
    :param network_keys: The network keys used for feeding the neural networks.
    :return: An OpeSharedStats bundle holding the statistics shared by the estimators.
    """
    # accumulate per-batch statistics shared by the different estimators (e.g. IPS, DR)
    all_reward_model_rewards, all_policy_probs, all_old_policy_probs = [], [], []
    all_v_values_reward_model_based, all_v_values_q_model_based, all_rewards, all_actions = [], [], [], []

    for i in range(math.ceil(len(dataset_as_transitions) / batch_size)):
        batch = dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
        batch_for_inference = Batch(batch)

        all_reward_model_rewards.append(reward_model.predict(batch_for_inference.states(network_keys)))

        # we always use the first Q head to calculate OPEs. might want to change this in the future.
        # for instance, for a bootstrapped network this means we always use the first QHead to calculate the OPEs.
        q_values, sm_values = q_network.predict(batch_for_inference.states(network_keys),
                                                outputs=[q_network.output_heads[0].q_values,
                                                         q_network.output_heads[0].softmax])

        all_policy_probs.append(sm_values)
        all_v_values_reward_model_based.append(np.sum(all_policy_probs[-1] * all_reward_model_rewards[-1], axis=1))
        all_v_values_q_model_based.append(np.sum(all_policy_probs[-1] * q_values, axis=1))
        all_rewards.append(batch_for_inference.rewards())
        all_actions.append(batch_for_inference.actions())

        # gather the behavior policy's probability of the action that was actually taken
        all_old_policy_probs.append(batch_for_inference.info('all_action_probabilities')
                                    [range(len(batch_for_inference.actions())),
                                     batch_for_inference.actions()])

        for j, t in enumerate(batch):
            t.update_info({
                'q_value': q_values[j],
                'softmax_policy_prob': all_policy_probs[-1][j],
                'v_value_q_model_based': all_v_values_q_model_based[-1][j],
            })

    all_reward_model_rewards = np.concatenate(all_reward_model_rewards, axis=0)
    all_policy_probs = np.concatenate(all_policy_probs, axis=0)
    all_v_values_reward_model_based = np.concatenate(all_v_values_reward_model_based, axis=0)
    all_rewards = np.concatenate(all_rewards, axis=0)
    all_actions = np.concatenate(all_actions, axis=0)
    all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)

    # generate the evaluated policy's probabilities for the logged actions, and the
    # per-sample importance weights (rho) between the new and the behavior policies
    new_policy_prob = all_policy_probs[np.arange(all_actions.shape[0]), all_actions]
    rho_all_dataset = new_policy_prob / all_old_policy_probs

    return OpeSharedStats(all_reward_model_rewards, all_policy_probs, all_v_values_reward_model_based,
                          all_rewards, all_actions, all_old_policy_probs, new_policy_prob, rho_all_dataset)
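
# Illustration only -- not part of the original module. A minimal sketch of how the shared
# stats returned above typically feed bandit-style OPE estimators: IPS reweights the logged
# rewards by rho, the direct method (DM) averages the reward-model-based values, and doubly
# robust (DR) combines the two. This is the textbook form of the estimators, and it assumes
# OpeSharedStats exposes its fields under the same names passed to its constructor; the
# actual downstream estimator code may differ.
def _demo_bandit_ope_estimates(shared_stats):
    # IPS: importance-weighted average of the logged rewards
    ips = np.mean(shared_stats.rho_all_dataset * shared_stats.all_rewards)

    # DM: average of the reward model's value of the evaluated policy
    dm = np.mean(shared_stats.all_v_values_reward_model_based)

    # DR: DM baseline plus an importance-weighted correction on the taken actions
    taken_action_rewards = shared_stats.all_reward_model_rewards[
        np.arange(shared_stats.all_actions.shape[0]), shared_stats.all_actions]
    dr = dm + np.mean(shared_stats.rho_all_dataset * (shared_stats.all_rewards - taken_action_rewards))

    return ips, dm, dr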
def train_policy_network(self, dataset, epochs):
    loss = []
    for j in range(epochs):
        loss = {
            'total_loss': [],
            'policy_losses': [],
            'unclipped_grads': [],
            'fetch_result': []
        }
        # shuffle(dataset)
        for i in range(len(dataset) // self.ap.network_wrappers['actor'].batch_size):
            batch = Batch(dataset[i * self.ap.network_wrappers['actor'].batch_size:
                                  (i + 1) * self.ap.network_wrappers['actor'].batch_size])

            network_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()

            advantages = batch.info('advantage')
            actions = batch.actions()
            if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
                actions = np.expand_dims(actions, -1)

            # get the old policy probabilities and distribution
            old_policy = force_list(self.networks['actor'].target_network.predict(batch.states(network_keys)))

            # calculate gradients and apply them to both the local policy network and the global policy network
            fetches = [self.networks['actor'].online_network.output_heads[0].kl_divergence,
                       self.networks['actor'].online_network.output_heads[0].entropy]

            inputs = copy.copy(batch.states(network_keys))
            inputs['output_0_0'] = actions

            # the old policy distribution needs to be represented as a list, because in the case of discrete
            # controls it has just a mean; otherwise, it has both a mean and a standard deviation
            for input_index, input in enumerate(old_policy):
                inputs['output_0_{}'.format(input_index + 1)] = input

            total_loss, policy_losses, unclipped_grads, fetch_result = \
                self.networks['actor'].online_network.accumulate_gradients(
                    inputs, [advantages], additional_fetches=fetches)

            self.networks['actor'].apply_gradients_to_online_network()
            if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                self.networks['actor'].apply_gradients_to_global_network()

            self.networks['actor'].online_network.reset_accumulated_gradients()

            loss['total_loss'].append(total_loss)
            loss['policy_losses'].append(policy_losses)
            loss['unclipped_grads'].append(unclipped_grads)
            loss['fetch_result'].append(fetch_result)

            self.unclipped_grads.add_sample(unclipped_grads)

        for key in loss.keys():
            loss[key] = np.mean(loss[key], 0)

        # the learning rate is read from the critic's network wrapper, which is assumed to
        # share the same learning rate schedule as the actor
        if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
            curr_learning_rate = self.networks['critic'].online_network.get_variable_value(self.ap.learning_rate)
            self.curr_learning_rate.add_sample(curr_learning_rate)
        else:
            curr_learning_rate = self.ap.network_wrappers['critic'].learning_rate

        # log training parameters
        screen.log_dict(
            OrderedDict([
                ("Surrogate loss", loss['policy_losses'][0]),
                ("KL divergence", loss['fetch_result'][0]),
                ("Entropy", loss['fetch_result'][1]),
                ("training epoch", j),
                ("learning_rate", curr_learning_rate)
            ]),
            prefix="Policy training"
        )

    self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
    self.entropy.add_sample(loss['fetch_result'][1])
    self.kl_divergence.add_sample(loss['fetch_result'][0])
    return loss['total_loss']
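
# Illustration only -- not part of the original module. A minimal numpy sketch of a PPO
# surrogate loss of the kind the actor head above is assumed to optimize. This shows the
# clipped variant (one common choice); the actual head in this repository may instead use
# a KL-penalized surrogate, which the KL divergence tracking above hints at. All argument
# names here are hypothetical.
def _demo_clipped_surrogate_loss(new_action_probs, old_action_probs, advantages, clip_epsilon=0.2):
    # probability ratio between the updated policy and the policy that collected the data
    ratio = new_action_probs / old_action_probs
    # clip the ratio so a single update cannot move the policy too far from the old one
    clipped_ratio = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
    # negate because the optimizer minimizes a loss while PPO maximizes the surrogate
    return -np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))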