def improve_reward_model(self, epochs: int):
    """
    Train a reward model to be used by the doubly-robust estimator

    :param epochs: The total number of epochs to use for training a reward model
    :return: None
    """
    batch_size = self.ap.network_wrappers['reward_model'].batch_size
    network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()

    # this is fitted from the training dataset
    for epoch in range(epochs):
        loss = 0
        total_transitions_processed = 0
        for i, batch in enumerate(self.call_memory('get_shuffled_data_generator', batch_size)):
            batch = Batch(batch)
            current_rewards_prediction_for_all_actions = self.networks['reward_model'].online_network.predict(
                batch.states(network_keys))
            current_rewards_prediction_for_all_actions[range(batch.size), batch.actions()] = batch.rewards()
            loss += self.networks['reward_model'].train_and_sync_networks(
                batch.states(network_keys), current_rewards_prediction_for_all_actions)[0]
            total_transitions_processed += batch.size

        log = OrderedDict()
        log['Epoch'] = epoch
        log['loss'] = loss / total_transitions_processed
        screen.log_dict(log, prefix='Training Reward Model')
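# A minimal sketch (plain numpy, hypothetical names and shapes) of the target construction used above and in
# get_reward_model_loss() below: the model's own per-action predictions are copied as targets, and only the
# entry of the action that was actually taken is overwritten with the observed reward, so the regression loss
# is driven solely by the logged (state, action, reward) triples.
import numpy as np

def make_reward_targets(predictions: np.ndarray, actions: np.ndarray, rewards: np.ndarray) -> np.ndarray:
    """predictions: (batch, num_actions) model output; actions, rewards: (batch,) logged data."""
    targets = predictions.copy()
    targets[np.arange(len(actions)), actions] = rewards  # only the taken action's entry carries an error signal
    return targets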
def get_reward_model_loss(self, batch: Batch):
    """
    Run a single training step of the reward model on the given batch and return the resulting loss.
    """
    network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()

    current_rewards_prediction_for_all_actions = self.networks['reward_model'].online_network.predict(
        batch.states(network_keys))
    current_rewards_prediction_for_all_actions[range(batch.size), batch.actions()] = batch.rewards()

    return self.networks['reward_model'].train_and_sync_networks(
        batch.states(network_keys), current_rewards_prediction_for_all_actions)[0]
def gather_static_shared_stats(self, evaluation_dataset_as_transitions: List[Transition],
                               batch_size: int, reward_model: Architecture,
                               network_keys: List) -> None:
    """
    Gather the statistics that depend only on the fixed evaluation dataset and the reward model
    (rewards, actions, behavior policy probabilities and reward model predictions), so they can be
    computed once and reused.
    """
    all_reward_model_rewards = []
    all_old_policy_probs = []
    all_rewards = []
    all_actions = []

    for i in range(math.ceil(len(evaluation_dataset_as_transitions) / batch_size)):
        batch = evaluation_dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
        batch_for_inference = Batch(batch)

        all_reward_model_rewards.append(reward_model.predict(batch_for_inference.states(network_keys)))
        all_rewards.append(batch_for_inference.rewards())
        all_actions.append(batch_for_inference.actions())
        all_old_policy_probs.append(batch_for_inference.info('all_action_probabilities')
                                    [range(len(batch_for_inference.actions())),
                                     batch_for_inference.actions()])

    self.all_reward_model_rewards = np.concatenate(all_reward_model_rewards, axis=0)
    self.all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)
    self.all_rewards = np.concatenate(all_rewards, axis=0)
    self.all_actions = np.concatenate(all_actions, axis=0)

    # mark that the static shared data was collected and is ready to be used
    self.is_gathered_static_shared_data = True
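# The 'all_action_probabilities' info field holds the behavior (logging) policy's full action distribution
# for each transition; the fancy indexing above keeps only the probability of the action that was actually
# taken. A small sketch of the same extraction, assuming a hypothetical (batch, num_actions) array:
import numpy as np

logged_probs = np.array([[0.7, 0.2, 0.1],
                         [0.1, 0.6, 0.3]])  # pi_behavior(a | s) for two transitions
taken_actions = np.array([0, 2])
old_policy_probs = logged_probs[np.arange(len(taken_actions)), taken_actions]  # -> [0.7, 0.3]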
def train_value_network(self, dataset, epochs):
    loss = []
    batch = Batch(dataset)
    network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

    # * Found not to have any impact *
    # add a timestep to the observation
    # current_states_with_timestep = self.concat_state_and_timestep(dataset)

    mix_fraction = self.ap.algorithm.value_targets_mix_fraction
    total_returns = batch.n_step_discounted_rewards(True)
    for j in range(epochs):
        curr_batch_size = batch.size
        if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
            curr_batch_size = self.ap.network_wrappers['critic'].batch_size
        for i in range(batch.size // curr_batch_size):
            # split to batches for first order optimization techniques
            current_states_batch = {
                k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
                for k, v in batch.states(network_keys).items()
            }
            total_return_batch = total_returns[i * curr_batch_size:(i + 1) * curr_batch_size]
            old_policy_values = force_list(self.networks['critic'].target_network.predict(
                current_states_batch).squeeze())
            if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
                targets = total_return_batch
            else:
                current_values = self.networks['critic'].online_network.predict(current_states_batch)
                targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction

            inputs = copy.copy(current_states_batch)
            for input_index, input in enumerate(old_policy_values):
                name = 'output_0_{}'.format(input_index)
                if name in self.networks['critic'].online_network.inputs:
                    inputs[name] = input

            value_loss = self.networks['critic'].online_network.accumulate_gradients(inputs, targets)

            self.networks['critic'].apply_gradients_to_online_network()
            if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                self.networks['critic'].apply_gradients_to_global_network()
            self.networks['critic'].online_network.reset_accumulated_gradients()

            loss.append([value_loss[0]])

    loss = np.mean(loss, 0)
    return loss
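# For the LBFGS branch above, the regression target is a convex mix of the critic's current prediction and
# the empirical discounted return, which keeps the value function from moving too far in a single full-batch
# update. A one-line numpy sketch of that mixing (value_targets_mix_fraction is the interpolation weight):
import numpy as np

def mixed_value_targets(current_values: np.ndarray, returns: np.ndarray, mix_fraction: float) -> np.ndarray:
    # mix_fraction = 0 keeps the current values, mix_fraction = 1 regresses fully onto the returns
    return current_values * (1 - mix_fraction) + returns * mix_fraction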
def fill_advantages(self, batch):
    batch = Batch(batch)
    network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

    # * Found not to have any impact *
    # current_states_with_timestep = self.concat_state_and_timestep(batch)

    current_state_values = self.networks['critic'].online_network.predict(
        batch.states(network_keys)).squeeze()
    total_returns = batch.n_step_discounted_rewards()

    # calculate advantages
    advantages = []
    if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
        advantages = total_returns - current_state_values
    elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
        # get bootstraps
        episode_start_idx = 0
        advantages = np.array([])
        # current_state_values[batch.game_overs()] = 0
        for idx, game_over in enumerate(batch.game_overs()):
            if game_over:
                # get advantages for the rollout
                value_bootstrapping = np.zeros((1,))
                rollout_state_values = np.append(current_state_values[episode_start_idx:idx + 1],
                                                 value_bootstrapping)

                rollout_advantages, _ = \
                    self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx + 1],
                                                                 rollout_state_values)
                episode_start_idx = idx + 1
                advantages = np.append(advantages, rollout_advantages)
    else:
        screen.warning("WARNING: The requested policy gradient rescaler is not available")

    # standardize
    advantages = (advantages - np.mean(advantages)) / np.std(advantages)

    # TODO: this will be problematic with a shared memory
    for transition, advantage in zip(self.memory.transitions, advantages):
        transition.info['advantage'] = advantage

    self.action_advantages.add_sample(advantages)
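# get_general_advantage_estimation_values() is called once per completed rollout above. A compact numpy sketch
# of the standard GAE recursion it is based on (gamma and lam are hypothetical parameter names here):
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),   A_t = sum_k (gamma * lam)^k * delta_{t+k}
import numpy as np

def gae(rewards: np.ndarray, state_values: np.ndarray, gamma: float = 0.99, lam: float = 0.95) -> np.ndarray:
    """rewards: (T,); state_values: (T + 1,), including the bootstrap value appended for the final state."""
    deltas = rewards + gamma * state_values[1:] - state_values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages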
def _prepare_ope_shared_stats(dataset_as_transitions: List[Transition], batch_size: int,
                              reward_model: Architecture, q_network: Architecture,
                              network_keys: List) -> OpeSharedStats:
    """
    Do the preparations needed for the different estimators.
    Some of the calculations are shared, so we centralize all the work here.

    :param dataset_as_transitions: The evaluation dataset in the form of transitions.
    :param batch_size: The batch size to use.
    :param reward_model: A reward model to be used by DR
    :param q_network: The Q network whose policy we evaluate.
    :param network_keys: The network keys used for feeding the neural networks.
    :return: the shared stats used by the off-policy evaluation estimators
    """
    # IPS
    all_reward_model_rewards, all_policy_probs, all_old_policy_probs = [], [], []
    all_v_values_reward_model_based, all_v_values_q_model_based, all_rewards, all_actions = [], [], [], []

    for i in range(math.ceil(len(dataset_as_transitions) / batch_size)):
        batch = dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
        batch_for_inference = Batch(batch)

        all_reward_model_rewards.append(reward_model.predict(batch_for_inference.states(network_keys)))

        # we always use the first Q head to calculate OPEs. might want to change this in the future.
        # for instance, this means that for bootstrapped DQN we always use the first QHead to calculate the OPEs.
        q_values, sm_values = q_network.predict(batch_for_inference.states(network_keys),
                                                outputs=[q_network.output_heads[0].q_values,
                                                         q_network.output_heads[0].softmax])

        all_policy_probs.append(sm_values)
        all_v_values_reward_model_based.append(np.sum(all_policy_probs[-1] * all_reward_model_rewards[-1], axis=1))
        all_v_values_q_model_based.append(np.sum(all_policy_probs[-1] * q_values, axis=1))
        all_rewards.append(batch_for_inference.rewards())
        all_actions.append(batch_for_inference.actions())
        all_old_policy_probs.append(batch_for_inference.info('all_action_probabilities')
                                    [range(len(batch_for_inference.actions())),
                                     batch_for_inference.actions()])

        for j, t in enumerate(batch):
            t.update_info({
                'q_value': q_values[j],
                'softmax_policy_prob': all_policy_probs[-1][j],
                'v_value_q_model_based': all_v_values_q_model_based[-1][j],
            })

    all_reward_model_rewards = np.concatenate(all_reward_model_rewards, axis=0)
    all_policy_probs = np.concatenate(all_policy_probs, axis=0)
    all_v_values_reward_model_based = np.concatenate(all_v_values_reward_model_based, axis=0)
    all_rewards = np.concatenate(all_rewards, axis=0)
    all_actions = np.concatenate(all_actions, axis=0)
    all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)

    # generate model probabilities
    new_policy_prob = all_policy_probs[np.arange(all_actions.shape[0]), all_actions]
    rho_all_dataset = new_policy_prob / all_old_policy_probs

    return OpeSharedStats(all_reward_model_rewards, all_policy_probs, all_v_values_reward_model_based,
                          all_rewards, all_actions, all_old_policy_probs, new_policy_prob, rho_all_dataset)
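# The shared statistics returned above are what the individual off-policy evaluation estimators consume.
# A hedged numpy sketch (not the exact implementation here) of the three standard bandit-style estimators
# they enable: inverse propensity scoring (IPS) using rho = pi_new / pi_old, the direct method (DM) based on
# the reward model, and the doubly-robust (DR) combination of the two.
import numpy as np

def ope_estimates(rho, rewards, v_reward_model_based, reward_model_rewards, actions):
    rhat_taken = reward_model_rewards[np.arange(len(actions)), actions]  # model reward for the logged action
    ips = np.mean(rho * rewards)                                         # importance-weighted logged rewards
    dm = np.mean(v_reward_model_based)                                   # model-based value of the new policy
    dr = np.mean(v_reward_model_based + rho * (rewards - rhat_taken))    # DM baseline + IPS-corrected residual
    return ips, dm, dr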
def train_policy_network(self, dataset, epochs):
    loss = []
    for j in range(epochs):
        loss = {
            'total_loss': [],
            'policy_losses': [],
            'unclipped_grads': [],
            'fetch_result': []
        }
        # shuffle(dataset)
        for i in range(len(dataset) // self.ap.network_wrappers['actor'].batch_size):
            batch = Batch(dataset[i * self.ap.network_wrappers['actor'].batch_size:
                                  (i + 1) * self.ap.network_wrappers['actor'].batch_size])

            network_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()

            advantages = batch.info('advantage')
            actions = batch.actions()
            if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
                actions = np.expand_dims(actions, -1)

            # get old policy probabilities and distribution
            old_policy = force_list(self.networks['actor'].target_network.predict(batch.states(network_keys)))

            # calculate gradients and apply on both the local policy network and on the global policy network
            fetches = [self.networks['actor'].online_network.output_heads[0].kl_divergence,
                       self.networks['actor'].online_network.output_heads[0].entropy]

            inputs = copy.copy(batch.states(network_keys))
            inputs['output_0_0'] = actions

            # old_policy_distribution needs to be represented as a list, because in the event of discrete controls,
            # it has just a mean. otherwise, it has both a mean and standard deviation
            for input_index, input in enumerate(old_policy):
                inputs['output_0_{}'.format(input_index + 1)] = input

            total_loss, policy_losses, unclipped_grads, fetch_result = \
                self.networks['actor'].online_network.accumulate_gradients(
                    inputs, [advantages], additional_fetches=fetches)

            self.networks['actor'].apply_gradients_to_online_network()
            if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                self.networks['actor'].apply_gradients_to_global_network()

            self.networks['actor'].online_network.reset_accumulated_gradients()

            loss['total_loss'].append(total_loss)
            loss['policy_losses'].append(policy_losses)
            loss['unclipped_grads'].append(unclipped_grads)
            loss['fetch_result'].append(fetch_result)

            self.unclipped_grads.add_sample(unclipped_grads)

        for key in loss.keys():
            loss[key] = np.mean(loss[key], 0)

        if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
            curr_learning_rate = self.networks['critic'].online_network.get_variable_value(self.ap.learning_rate)
            self.curr_learning_rate.add_sample(curr_learning_rate)
        else:
            curr_learning_rate = self.ap.network_wrappers['critic'].learning_rate

        # log training parameters
        screen.log_dict(
            OrderedDict([
                ("Surrogate loss", loss['policy_losses'][0]),
                ("KL divergence", loss['fetch_result'][0]),
                ("Entropy", loss['fetch_result'][1]),
                ("training epoch", j),
                ("learning_rate", curr_learning_rate)
            ]),
            prefix="Policy training"
        )

    self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
    self.entropy.add_sample(loss['fetch_result'][1])
    self.kl_divergence.add_sample(loss['fetch_result'][0])
    return loss['total_loss']
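# The old policy's distribution parameters are fed in as extra inputs ('output_0_1', 'output_0_2', ...) so the
# policy head can form a likelihood ratio against the online network's current distribution. A hedged sketch of
# a generic ratio-based (clipped) surrogate objective of that kind, for orientation only; it is not necessarily
# the exact loss implemented by the head used here.
import numpy as np

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, clip_eps=0.2):
    ratio = np.exp(new_log_probs - old_log_probs)            # pi_new(a|s) / pi_old(a|s)
    clipped = np.clip(ratio, 1 - clip_eps, 1 + clip_eps)
    # negative because we minimize the loss while maximizing the clipped surrogate objective
    return -np.mean(np.minimum(ratio * advantages, clipped * advantages))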
def improve_reward_model(self, epochs: int):
    """
    Train both a reward model to be used by the doubly-robust estimator, and a model (a neural imitation
    model or a kNN model) to be used for filtering actions in BCQ

    :param epochs: The total number of epochs to use for training a reward model
    :return: None
    """

    # we'll assume that these are drawn from the reward model parameters
    batch_size = self.ap.network_wrappers['reward_model'].batch_size
    network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()

    # if using a NN to decide which actions to drop, we'll train the NN here
    if isinstance(self.ap.algorithm.action_drop_method_parameters, NNImitationModelParameters):
        total_epochs = max(epochs, self.ap.algorithm.action_drop_method_parameters.imitation_model_num_epochs)
    else:
        total_epochs = epochs

    for epoch in range(total_epochs):
        # this is fitted from the training dataset
        reward_model_loss = 0
        imitation_model_loss = 0
        total_transitions_processed = 0
        for i, batch in enumerate(self.call_memory('get_shuffled_training_data_generator', batch_size)):
            batch = Batch(batch)

            # reward model
            if epoch < epochs:
                reward_model_loss += self.get_reward_model_loss(batch)

            # imitation model
            if isinstance(self.ap.algorithm.action_drop_method_parameters, NNImitationModelParameters) and \
                    epoch < self.ap.algorithm.action_drop_method_parameters.imitation_model_num_epochs:
                target_actions = np.zeros((batch.size, len(self.spaces.action.actions)))
                target_actions[range(batch.size), batch.actions()] = 1
                imitation_model_loss += self.networks['imitation_model'].train_and_sync_networks(
                    batch.states(network_keys), target_actions)[0]

            total_transitions_processed += batch.size

        log = OrderedDict()
        log['Epoch'] = epoch
        if reward_model_loss:
            log['Reward Model Loss'] = reward_model_loss / total_transitions_processed
        if imitation_model_loss:
            log['Imitation Model Loss'] = imitation_model_loss / total_transitions_processed
        screen.log_dict(log, prefix='Training Batch RL Models')

    # if using a kNN based model, we'll initialize and build it here.
    # initialization cannot be moved to the constructor as we don't have the agent's spaces initialized yet.
    if isinstance(self.ap.algorithm.action_drop_method_parameters, KNNParameters):
        knn_size = self.ap.algorithm.action_drop_method_parameters.knn_size
        if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state:
            self.knn_trees = [AnnoyDictionary(
                dict_size=knn_size,
                key_width=int(self.networks['reward_model'].online_network.state_embedding.shape[-1]),
                batch_size=knn_size) for _ in range(len(self.spaces.action.actions))]
        else:
            self.knn_trees = [AnnoyDictionary(
                dict_size=knn_size,
                key_width=self.spaces.state['observation'].shape[0],
                batch_size=knn_size) for _ in range(len(self.spaces.action.actions))]

        for i, knn_tree in enumerate(self.knn_trees):
            state_embeddings = self.embedding([transition.state for transition in self.memory.transitions
                                               if transition.action == i])
            knn_tree.add(
                keys=state_embeddings,
                values=np.expand_dims(np.zeros(state_embeddings.shape[0]), axis=1))

        for knn_tree in self.knn_trees:
            knn_tree._rebuild_index()

        self.average_dist = [[dist[0] for dist in knn_tree._get_k_nearest_neighbors_indices(
            keys=self.embedding([transition.state for transition in self.memory.transitions]),
            k=1)[0]] for knn_tree in self.knn_trees]
        self.average_dist = sum([x for l in self.average_dist for x in l])  # flatten and sum
        self.average_dist /= len(self.memory.transitions)
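# A hedged sketch (not the agent's actual method) of how the per-action kNN trees and the average
# nearest-neighbor distance computed above could be used for BCQ-style action filtering at decision time:
# an action is kept only if the current state embedding lies close enough to embeddings of states in which
# that action was actually taken in the batch dataset. `nearest_distance` is a hypothetical helper standing
# in for a query against the corresponding AnnoyDictionary.
import numpy as np

def in_distribution_actions(state_embedding: np.ndarray, knn_trees: list, average_dist: float,
                            nearest_distance, threshold_factor: float = 1.0) -> list:
    allowed = []
    for action, tree in enumerate(knn_trees):
        # hypothetical helper: distance from this state embedding to its nearest neighbor in the action's tree
        if nearest_distance(tree, state_embedding) <= threshold_factor * average_dist:
            allowed.append(action)
    return allowed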