def line_search(model, f, x, fullstep, expected_improve_full, get_kl_diff,
                max_kl, max_backtracks=20, accept_ratio=0.01):
    """Backtracking line search along ``fullstep`` starting from ``x``.

    Candidates ``x + 0.5**k * fullstep`` are tried for
    ``k = 0 .. max_backtracks - 1``. A candidate is accepted as soon as the
    loss strictly decreases and the mean KL reported by ``get_kl_diff`` is at
    most ``1.5 * max_kl`` (``max_kl`` plus a 50% tolerance).

    Args:
        model: object handed to ``torch_utils.set_flat_params_to`` for every
            candidate that is evaluated.
        f: callable invoked as ``f(True)``; the first element of the returned
            tensor is used as the loss value.
        x: flat parameter tensor the search starts from.
        fullstep: full update direction, same shape as ``x``.
        expected_improve_full: unused; kept for interface compatibility.
        get_kl_diff: callable ``get_kl_diff(x, x_new)`` returning the mean KL.
        max_kl: KL bound (before the 50% tolerance is added).
        max_backtracks: number of step fractions to try.
        accept_ratio: unused; kept for interface compatibility.

    Returns:
        ``(True, x_new)`` for the first accepted candidate, otherwise
        ``(False, x)``. The model is not reset here on failure: it keeps the
        last candidate that was written to it.
    """

    def _loss_value(result):
        # First element as a Python float. Works for 0-dim tensors (where
        # ``.data[0]`` is rejected by newer torch) and for 1-element tensors.
        return result.data.reshape(-1)[0].item()

    def _log_violations(improve, kl, new_loss, old_loss):
        if improve <= 0:
            logger.log("Violated because loss not improving. New loss: %f Old loss: %f" % (new_loss, old_loss))
        if kl > max_kl + tolerance:
            logger.log("Violated because kl bound does not hold. MaxKL: %f MeanKL: %f" % (max_kl, kl))

    fval = _loss_value(f(True))
    tolerance = max_kl * 0.5

    # Stay None until at least one candidate has actually been evaluated, so
    # the summary after the loop cannot hit an unbound name when every
    # candidate was skipped (all NaN) or max_backtracks is 0.
    fval_new = None
    actual_improve = None
    mean_kl = None

    for stepfrac in (.5 ** k for k in range(max_backtracks)):
        x_new = x + stepfrac * fullstep
        # check x_new for NAN, it happened when optimizing cartpole model
        # that somehow theta became NAN
        if bool(torch.isnan(x_new).any()):
            # NOTE(review): formatted into a single message because extra
            # positional arguments may be interpreted as logger.log options
            # -- confirm against the logger's signature.
            logger.log("we somehow got NAN in linesearch x=%s stepfrac=%s fullstep=%s"
                       % (x, stepfrac, fullstep))
            continue
        torch_utils.set_flat_params_to(model, x_new)
        fval_new = _loss_value(f(True))
        actual_improve = fval - fval_new
        mean_kl = get_kl_diff(x, x_new)
        if actual_improve > 0 and mean_kl <= max_kl + tolerance:
            return True, x_new
        logger.log("backtrack")
        _log_violations(actual_improve, mean_kl, fval_new, fval)

    # Final summary of the last evaluated candidate (if there was one).
    if actual_improve is not None:
        _log_violations(actual_improve, mean_kl, fval_new, fval)
    return False, x
def set_param_values(self, params):
    """Overwrite this object's parameters with the flat tensor ``params``.

    ``params`` must have 11 entries along its first dimension when
    ``self.trainMotor`` is truthy and 9 entries otherwise.

    Raises:
        ValueError: if the first dimension of ``params`` has another size.
    """
    expected_size = 11 if self.trainMotor else 9
    if params.shape[0] != expected_size:
        raise ValueError("something went wrong")
    torch_utils.set_flat_params_to(self, params)
def callback_fun(flat_params): nonlocal curr_itr torch_utils.set_flat_params_to(imitation_model, torch_utils.torch.from_numpy(flat_params)) # calculate the loss of the whole batch loss = - torch.mean(imitation_model.get_log_prob(input, output)) # weight decay for param in imitation_model.parameters(): loss += param.pow(2).sum() * self.l2_reg loss.backward() if isinstance(self.imitationModel, CartPoleModel): logger.record_tabular("theta", str(self.imitationModel.theta.detach().numpy())) logger.record_tabular("std", str(self.imitationModel.std.detach().numpy())) logger.record_tabular('Iteration', curr_itr) logger.record_tabular("loss", loss.item()) logger.dump_tabular(with_prefix=False) curr_itr += 1
def get_value_loss(flat_params):
    """Objective for the value-network fit: return ``(loss, gradient)``.

    Loads ``flat_params`` (a NumPy vector) into ``value_net``, computes the
    mean squared error against ``values_target`` plus an L2 penalty weighted
    by ``self.l2_reg``, and back-propagates it.

    Returns:
        The loss as a NumPy value and the flat gradient as a float64 array.
    """
    torch_utils.set_flat_params_to(
        value_net, torch_utils.torch.from_numpy(flat_params))

    # reset gradients left over from the previous evaluation
    for weight in value_net.parameters():
        if weight.grad is None:
            continue
        weight.grad.data.fill_(0)

    prediction_error = value_net(Variable(states)) - values_target
    value_loss = prediction_error.pow(2).mean()

    # weight decay
    for weight in value_net.parameters():
        value_loss += weight.pow(2).sum() * self.l2_reg
    value_loss.backward()

    # the mean already reduces the loss to a single value, so no indexing
    loss_value = value_loss.data.cpu().numpy()
    flat_grad = torch_utils.get_flat_grad_from(value_net.parameters())
    grad_value = flat_grad.data.cpu().numpy().astype(np.float64)
    return loss_value, grad_value
def get_negative_likelihood_loss(flat_params):
    """Mini-batch objective: return ``(loss, gradient)`` for ``flat_params``.

    Loads ``flat_params`` (a NumPy vector) into ``imitation_model``, draws a
    fresh random permutation of the data, and evaluates the negative mean
    log-probability on its first ``self.mini_batchsize`` samples plus an L2
    penalty weighted by ``self.l2_reg``.

    Returns:
        The loss as a NumPy value and the flat gradient as a float64 array.
    """
    torch_utils.set_flat_params_to(
        imitation_model, torch_utils.torch.from_numpy(flat_params))

    # reset gradients left over from the previous evaluation
    for weight in imitation_model.parameters():
        if weight.grad is None:
            continue
        weight.grad.data.fill_(0)

    indices = np.random.permutation(np.arange(total_batchsize))
    batch_inputs = input[indices[:self.mini_batchsize]]
    batch_targets = output[indices[:self.mini_batchsize]]
    loss = - torch.mean(imitation_model.get_log_prob(batch_inputs, batch_targets))

    # weight decay
    for weight in imitation_model.parameters():
        loss += weight.pow(2).sum() * self.l2_reg
    loss.backward()

    # the mean already reduces the loss to a single value, so no indexing
    loss_value = loss.detach().numpy()
    flat_grad = torch_utils.get_flat_grad_from(imitation_model.parameters())
    grad_value = flat_grad.detach().numpy().astype(np.float64)
    return loss_value, grad_value
def get_kl_diff(old_param, new_param):
    """Return the mean of ``exp(log_old) * (log_old - log_new)`` as NumPy.

    ``log_old`` / ``log_new`` are the log-probabilities ``policy_net`` assigns
    to ``(states, actions)`` under ``old_param`` and ``new_param``, clamped
    from below at ``log(1e-6)``. The parameters ``policy_net`` held on entry
    are written back before returning.
    """
    saved_params = torch_utils.get_flat_params_from(policy_net)

    def _clamped_log_prob(flat_params):
        # load the given parameters, then evaluate the clamped log-probs
        torch_utils.set_flat_params_to(policy_net, flat_params)
        raw_log_prob = policy_net.get_log_prob(
            Variable(states, volatile=True), Variable(actions))
        return torch.clamp(raw_log_prob, min=np.log(1e-6))

    with torch.no_grad():
        log_old_prob = _clamped_log_prob(old_param)
        log_new_prob = _clamped_log_prob(new_param)
        torch_utils.set_flat_params_to(policy_net, saved_params)
        weighted_diff = torch.exp(log_old_prob) * (log_old_prob - log_new_prob)
        return torch.mean(weighted_diff).numpy()
def summarize_and_plot_results(folderName, iteration, sample_n_traj,
                               policy_path, expert_data_path,
                               plot_trajectories=True):
    """Evaluate every trial in an experiment folder and optionally plot it.

    A sub-folder counts as a trial when it contains ``itr_0.pkl`` or
    ``itr_<iteration>.pkl``. For each trial the newest snapshot up to
    ``iteration`` is loaded, trajectories are sampled (or read from a cache
    file inside the trial folder), evaluation measures are computed, the
    displacement curves are dumped to ``displacements_<iteration>/`` and the
    long-format result frame is written to
    ``results_plot_frame_<iteration>.csv``.

    Args:
        folderName: experiment folder; ``str(folderName) + "/"`` is used.
        iteration: training iteration to evaluate.
        sample_n_traj: number of trajectories handed to ``TrajSampler``.
        policy_path: joblib file holding a dict with a ``'policy'`` entry.
        expert_data_path: joblib file with the fixed expert trajectories.
        plot_trajectories: when true, save a trajectory comparison figure
            to ``<trial>/cartpole_results.png``.

    Raises:
        ValueError: if the folder contains no trial (previously this ended
            in a NameError on the undefined column list).

    Note:
        The result columns come from ``variant.json`` of the first trial in
        sorted order; keys missing from another trial are stored as "None".
    """
    experiment_folder_name = str(folderName) + "/"
    eval_labels = [
        'avg_discounted_return', 'avg_undiscounted_return', 'avg_traj_length',
        'avg_displacement', 'avg_min_displacement', 'avg_boundaries_left',
        'avg_success_rate', 'avg_norm_displacement'
    ]
    print(experiment_folder_name)

    def _itr_file(trial_dir, itr, suffix):
        # <trial_dir>/itr_<itr><suffix>
        return os.path.join(trial_dir, "itr_" + str(itr) + suffix)

    def _has_snapshot(trial_dir):
        # a trial needs either the initial or the requested snapshot
        return (os.path.exists(_itr_file(trial_dir, 0, ".pkl"))
                or os.path.exists(_itr_file(trial_dir, iteration, ".pkl")))

    def _newest_itr_file(trial_dir, suffix, default):
        # newest existing file among iterations iteration..1, else default
        for itr in range(iteration, 0, -1):
            candidate = _itr_file(trial_dir, itr, suffix)
            if os.path.exists(candidate):
                return candidate
        return default

    def _observation_frame(paths, n_traj, columns):
        # one row per (trajectory, timestep); the angle in observation
        # column 2 is expanded into its sine and cosine
        frames = []
        for i in range(min(n_traj, len(paths))):
            observations = paths[i]["observations"]
            trajlen = observations.shape[0]
            modObservations = np.zeros((trajlen, len(columns)))
            modObservations[:, 0:2] = observations[:, 0:2]
            modObservations[:, 2] = np.sin(observations[:, 2])
            modObservations[:, 3] = np.cos(observations[:, 2])
            modObservations[:, 4] = observations[:, 3]
            frames.append(pd.DataFrame(np.concatenate([
                i * np.ones(trajlen)[:, np.newaxis],
                np.arange(trajlen)[:, np.newaxis],
                modObservations
            ], axis=1), columns=['traj', 'timestep'] + columns))
        return pd.concat(frames)

    # find all trials of the experiment
    trial_names = [
        fn for fn in sorted(os.listdir(experiment_folder_name))
        if _has_snapshot(os.path.join(experiment_folder_name, fn))
    ]
    if not trial_names:
        raise ValueError(
            "no trial with an itr_*.pkl snapshot found in " + experiment_folder_name)
    num_experiment_trial = len(trial_names)

    # read one variant.json to obtain the columns for the data frame
    first_variant = os.path.join(experiment_folder_name, trial_names[0]) + '/variant.json'
    with open(first_variant, 'r') as read_file:
        variant_keys = list(json.load(read_file).keys())
    col = ['experiment_itr'] + variant_keys + eval_labels

    # create an empty data frame for the results
    # NOTE(review): df is filled below but neither saved nor returned here.
    df = pd.DataFrame(index=np.arange(0, num_experiment_trial), columns=col)
    # long-format frame used for plotting
    num_eval_measures = len(eval_labels)
    df_plot = pd.DataFrame(
        index=np.arange(0, num_experiment_trial * num_eval_measures),
        columns=('expr_itr', 'variant', 'eval_type', 'value'))

    # per-iteration training statistics
    # NOTE(review): collected but not consumed inside this function.
    eval_measures_itr_label = ['NumTrajs', 'AverageReturn', 'Entropy']
    dataFramePlotItr = []

    displacement_dir = experiment_folder_name + "displacements_" + str(iteration) + "/"

    for expr_idx, fn in enumerate(trial_names):
        path = os.path.join(experiment_folder_name, fn)
        df_plot_idx = expr_idx * num_eval_measures

        # model of the requested iteration, else the newest one we have
        model_path = _newest_itr_file(
            path, ".pkl", default=os.path.join(path, "itr_0.pkl"))
        model_param_path = _newest_itr_file(path, "_model.pkl", default="empty")

        with open(path + '/variant.json', 'r') as read_file:
            variant = json.load(read_file)
        # Store the values as strings. Keys must be tested with ``in``:
        # ``hasattr`` on a dict looks at attributes, not keys, which turned
        # almost every value into "None".
        variant_values = [
            str(variant[key]) if key in variant else "None"
            for key in variant_keys
        ]

        if os.stat(path + '/progress.csv').st_size > 0:
            # only process csv if it is not empty
            unprocessed_df = pd.read_csv(path + '/progress.csv')
            temp_frames = []
            try:
                for label in eval_measures_itr_label:
                    temp_frame = pd.DataFrame(unprocessed_df[label])
                    temp_frame.columns = ['value']
                    temp_frame['eval_type'] = label
                    temp_frames.append(temp_frame)
                processed_df = pd.concat(temp_frames)
                processed_df['Iteration'] = unprocessed_df['Iteration']
                processed_df['expr_idx'] = expr_idx
                processed_df['variant'] = "v" + str(expr_idx)
                dataFramePlotItr.append(processed_df)
            except KeyError:
                # progress file lacks the expected columns: skip these
                # per-iteration statistics (deliberate best effort)
                pass

        # load policy
        policy = joblib.load(policy_path)['policy']
        policy.normalized_input = [False, False, False, False]
        policy.normalized_output = [True]

        shooting_experiment = False
        # load learned model
        model = joblib.load(model_path)
        if 'imitationModel' in model:
            imitation_env = model['imitationModel']
        elif 'imitationEnv' in model:
            imitation_env = model['imitationEnv']
        elif 'parameters' in model:
            learned_params = model['parameters'].squeeze(0)
            shooting_experiment = True
            # create a new environment and set the parameters to the learned ones
            from rllab.dynamic_models.cartpole_model import CartPoleModel
            imitation_env = CartPoleModel()
            # two zeros are appended for the variance entries, giving a
            # deterministic environment
            imitation_env.set_param_values(
                torch.from_numpy(np.concatenate([learned_params, np.zeros(2)])).float())

        if 'upperLevelPolicy' in model:
            upperLevelPolicy = model['upperLevelPolicy']
            torch_utils.set_flat_params_to(imitation_env, upperLevelPolicy.mean)
        elif not shooting_experiment:
            imitation_env.load_state_dict(torch.load(model_param_path))

        # load fixed expert trajectories
        expert_paths = joblib.load(expert_data_path)

        sampler = TrajSampler(policy, imitation_env, sample_n_traj, 500,
                              discount=0.995, useImitationPolicy=False,
                              useImitationEnv=False,
                              terminate_only_max_path=True)
        expert_processed_paths = sampler.process_samples(0, expert_paths)

        # reuse cached rollouts when both cache files exist
        cache_stem = os.path.join(
            path, "itr_" + str(iteration) + "_n_traj_" + str(sample_n_traj))
        paths_cache = cache_stem + "_generated_paths.pkl"
        processed_cache = cache_stem + "_generated_processed_paths.pkl"
        if os.path.exists(paths_cache) and os.path.exists(processed_cache):
            generated_paths = joblib.load(paths_cache)
            generated_processed_paths = joblib.load(processed_cache)
        else:
            sampler.start_worker()
            generated_paths = sampler.obtain_samples(0)
            generated_processed_paths = sampler.process_samples(0, generated_paths)
            # save the paths s.t. we don't need to do the rollouts every time
            joblib.dump(generated_paths, paths_cache, compress=3)
            joblib.dump(generated_processed_paths, processed_cache, compress=3)

        avg_discounted_return = sampler.calc_avg_discounted_return(
            generated_processed_paths)
        avg_undiscounted_return = sampler.calc_avg_undiscounted_return(
            generated_processed_paths)
        avg_traj_length = sampler.calc_avg_traj_length(generated_processed_paths)
        avg_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole")
        avg_displacement = np.mean(avg_displacements)
        avg_norm_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole",
            normalize=True)
        avg_norm_displacement = np.mean(avg_norm_displacements)
        avg_min_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole",
            "min")
        avg_min_displacement = np.mean(avg_min_displacements)
        avg_leave_boundaries = calc_leaving_boundaries_rate(
            generated_processed_paths)
        success_rate = calc_success_rate(generated_processed_paths)

        # same order as eval_labels
        values = [
            avg_discounted_return, avg_undiscounted_return, avg_traj_length,
            avg_displacement, avg_min_displacement, avg_leave_boundaries,
            success_rate, avg_norm_displacement
        ]

        os.makedirs(displacement_dir, exist_ok=True)
        all_displacements = {
            'avg_displacements': avg_displacements,
            'avg_min_displacements': avg_min_displacements,
            'avg_norm_displacements': avg_norm_displacements
        }
        joblib.dump(all_displacements, displacement_dir + fn + ".pkl")

        df.loc[expr_idx] = [expr_idx] + variant_values + values
        variant_string = "v" + str(expr_idx)
        for j, value in enumerate(values):
            df_plot.loc[df_plot_idx + j] = [
                expr_idx, variant_string, eval_labels[j], value
            ]

        # write results to folder for plotting (rewritten after every trial)
        df_plot.to_csv(experiment_folder_name + "results_plot_frame_" +
                       str(iteration) + ".csv", sep=',', encoding='utf-8')

        if not plot_trajectories:
            continue

        plot_n_traj = 20
        # NOTE(review): the skip check looks for no_cut_/cut_ images, while
        # the figure below is saved as <trial>/cartpole_results.png; nothing
        # in this function writes the files tested here.
        if os.path.exists(experiment_folder_name + "no_cut_" + str(iteration) + "/" + fn + ".png") and \
                os.path.exists(experiment_folder_name + "cut_" + str(iteration) + "/" + fn + ".png"):
            continue

        sns.set(context="notebook", style="darkgrid")
        title = ['x_pos', 'x_dot', 'sin theta', 'cos theta', 'theta_dot']
        fig, axes = plt.subplots(len(title), 1, tight_layout=True)
        fig.set_size_inches(11.7, 12.27)

        # merge all observations of all paths into one big data frame;
        # at most plot_n_traj trajectories per source, fewer if fewer exist
        print("building data frame")
        all_traj_true_env = _observation_frame(expert_paths, plot_n_traj, title)
        all_traj_true_env["model"] = "true environment"
        all_traj_learned_env = _observation_frame(generated_paths, plot_n_traj, title)
        all_traj_learned_env["model"] = "learned environment"
        all_traj = pd.concat([all_traj_true_env, all_traj_learned_env])
        print("finished building data frame")

        print("start plotting")
        for i, axis_title in enumerate(title):
            ax = sns.lineplot(x='timestep', y=axis_title, hue="model",
                              estimator=None, units='traj', data=all_traj,
                              ax=axes[i], legend=False)
            if i == 0:
                # add additional 2 dashed lines for the boundary
                ax.plot([0, 500], [3, 3], color='k', linestyle='--')
                ax.plot([0, 500], [-3, -3], color='k', linestyle='--')
            ax.set_xlim(-5, 505)
            ax.set_title(axis_title)
        plt.savefig(path + "/cartpole_results.png", bbox_inches='tight',
                    pad_inches=0.1)
        plt.close()
def summarize_and_plot_results(folderPath, iteration, sample_n_traj,
                               policy_path, expert_data_path,
                               plot_trajectories=True):
    """Evaluate every trial found next to ``folderPath``.

    The experiment folder is ``folderPath.parent``. A sub-folder counts as a
    trial when it contains ``itr_0.pkl`` or ``itr_<iteration>.pkl``. For each
    trial the newest snapshot up to ``iteration`` is loaded, trajectories are
    sampled (or read from a cache file inside the trial folder), evaluation
    measures are computed and the displacement curves are dumped to
    ``displacements_<iteration>/``.

    Args:
        folderPath: object with a ``parent`` attribute (e.g. a
            ``pathlib.Path``) naming the experiment folder.
        iteration: training iteration to evaluate.
        sample_n_traj: number of trajectories handed to ``TrajSampler``.
        policy_path: joblib file holding a dict with a ``'policy'`` entry.
        expert_data_path: joblib file with the fixed expert trajectories.
        plot_trajectories: unused in this function; kept for interface
            compatibility.

    Raises:
        ValueError: if the folder contains no trial (previously this ended
            in a NameError on the undefined column list).

    Note:
        The result columns come from ``variant.json`` of the first trial in
        sorted order; keys missing from another trial are stored as "None".
    """
    experiment_folder_name = str(folderPath.parent) + "/"
    print(experiment_folder_name)
    eval_labels = [
        'avg_discounted_return', 'avg_undiscounted_return', 'avg_traj_length',
        'avg_displacement', 'avg_min_displacement', 'avg_boundaries_left',
        'avg_success_rate', 'avg_norm_displacement'
    ]

    def _itr_file(trial_dir, itr, suffix):
        # <trial_dir>/itr_<itr><suffix>
        return os.path.join(trial_dir, "itr_" + str(itr) + suffix)

    def _has_snapshot(trial_dir):
        # a trial needs either the initial or the requested snapshot
        return (os.path.exists(_itr_file(trial_dir, 0, ".pkl"))
                or os.path.exists(_itr_file(trial_dir, iteration, ".pkl")))

    def _newest_itr_file(trial_dir, suffix, default):
        # newest existing file among iterations iteration..1, else default
        for itr in range(iteration, 0, -1):
            candidate = _itr_file(trial_dir, itr, suffix)
            if os.path.exists(candidate):
                return candidate
        return default

    # find all trials of the experiment
    trial_names = [
        fn for fn in sorted(os.listdir(experiment_folder_name))
        if _has_snapshot(os.path.join(experiment_folder_name, fn))
    ]
    if not trial_names:
        raise ValueError(
            "no trial with an itr_*.pkl snapshot found in " + experiment_folder_name)
    num_experiment_trial = len(trial_names)

    # read one variant.json to obtain the columns for the data frame
    first_variant = os.path.join(experiment_folder_name, trial_names[0]) + '/variant.json'
    with open(first_variant, 'r') as read_file:
        variant_keys = list(json.load(read_file).keys())
    col = ['experiment_itr'] + variant_keys + eval_labels

    # create an empty data frame for the results
    # NOTE(review): df and df_plot are filled below but neither saved nor
    # returned by this function.
    df = pd.DataFrame(index=np.arange(0, num_experiment_trial), columns=col)
    num_eval_measures = len(eval_labels)
    df_plot = pd.DataFrame(
        index=np.arange(0, num_experiment_trial * num_eval_measures),
        columns=('expr_itr', 'variant', 'eval_type', 'value'))

    # per-iteration training statistics
    # NOTE(review): collected but not consumed inside this function.
    eval_measures_itr_label = ['NumTrajs', 'AverageReturn', 'Entropy']
    dataFramePlotItr = []

    displacement_dir = experiment_folder_name + "displacements_" + str(iteration) + "/"

    for expr_idx, fn in enumerate(trial_names):
        path = os.path.join(experiment_folder_name, fn)
        df_plot_idx = expr_idx * num_eval_measures

        # model of the requested iteration, else the newest one we have
        model_path = _newest_itr_file(
            path, ".pkl", default=os.path.join(path, "itr_0.pkl"))
        model_param_path = _newest_itr_file(path, "_model.pkl", default="empty")
        print(model_path)
        print(model_param_path)

        with open(path + '/variant.json', 'r') as read_file:
            variant = json.load(read_file)
        # Store the values as strings. Keys must be tested with ``in``:
        # ``hasattr`` on a dict looks at attributes, not keys, which turned
        # almost every value into "None".
        variant_values = [
            str(variant[key]) if key in variant else "None"
            for key in variant_keys
        ]

        if os.stat(path + '/progress.csv').st_size > 0:
            # only process csv if it is not empty
            unprocessed_df = pd.read_csv(path + '/progress.csv')
            temp_frames = []
            try:
                for label in eval_measures_itr_label:
                    temp_frame = pd.DataFrame(unprocessed_df[label])
                    temp_frame.columns = ['value']
                    temp_frame['eval_type'] = label
                    temp_frames.append(temp_frame)
                processed_df = pd.concat(temp_frames)
                processed_df['Iteration'] = unprocessed_df['Iteration']
                processed_df['expr_idx'] = expr_idx
                processed_df['variant'] = "v" + str(expr_idx)
                dataFramePlotItr.append(processed_df)
            except KeyError:
                # progress file lacks the expected columns: skip these
                # per-iteration statistics (deliberate best effort)
                pass

        # load policy
        policy = joblib.load(policy_path)['policy']
        policy.normalized_input = [False, False, False, False]
        policy.normalized_output = [True]

        shooting_experiment = False
        # load learned model
        model = joblib.load(model_path)
        # NOTE(review): a snapshot that has neither 'imitationModel' nor
        # 'imitationEnv' raises KeyError here even if it has 'parameters'.
        # The original nesting of the 'parameters' check is not recoverable
        # from the collapsed source; it is treated as independent -- confirm.
        if 'imitationModel' in model:
            imitation_env = model['imitationModel']
        else:
            imitation_env = model['imitationEnv']
        if 'parameters' in model:
            learned_params = model['parameters'].squeeze(0)
            shooting_experiment = True
            # create a new environment and set the parameters to the learned ones
            from rllab.dynamic_models.cartpole_model import CartPoleModel
            imitation_env = CartPoleModel()
            # two zeros are appended for the variance entries, giving a
            # deterministic environment
            imitation_env.set_param_values(
                torch.from_numpy(np.concatenate([learned_params, np.zeros(2)])).float())

        if 'upperLevelPolicy' in model:
            upperLevelPolicy = model['upperLevelPolicy']
            print(
                "found upper level policy setting parameters to mean of upper level policy"
            )
            print("mean", upperLevelPolicy.mean)
            torch_utils.set_flat_params_to(imitation_env, upperLevelPolicy.mean)
        elif not shooting_experiment:
            imitation_env.load_state_dict(torch.load(model_param_path))
        print("theta", imitation_env.theta)
        print("std", imitation_env.std)

        # load fixed expert trajectories
        expert_paths = joblib.load(expert_data_path)

        sampler = TrajSampler(policy, imitation_env, sample_n_traj, 500,
                              discount=0.995, useImitationPolicy=False,
                              useImitationEnv=False,
                              terminate_only_max_path=True)
        expert_processed_paths = sampler.process_samples(0, expert_paths)

        # reuse cached rollouts when both cache files exist
        cache_stem = os.path.join(
            path, "itr_" + str(iteration) + "_n_traj_" + str(sample_n_traj))
        paths_cache = cache_stem + "_generated_paths.pkl"
        processed_cache = cache_stem + "_generated_processed_paths.pkl"
        if os.path.exists(paths_cache) and os.path.exists(processed_cache):
            generated_paths = joblib.load(paths_cache)
            generated_processed_paths = joblib.load(processed_cache)
        else:
            sampler.start_worker()
            generated_paths = sampler.obtain_samples(0)
            generated_processed_paths = sampler.process_samples(0, generated_paths)
            # save the paths s.t. we don't need to do the rollouts every time
            joblib.dump(generated_paths, paths_cache, compress=3)
            joblib.dump(generated_processed_paths, processed_cache, compress=3)

        avg_discounted_return = sampler.calc_avg_discounted_return(
            generated_processed_paths)
        avg_undiscounted_return = sampler.calc_avg_undiscounted_return(
            generated_processed_paths)
        avg_traj_length = sampler.calc_avg_traj_length(generated_processed_paths)
        avg_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole")
        avg_displacement = np.mean(avg_displacements)
        avg_norm_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole",
            normalize=True)
        avg_norm_displacement = np.mean(avg_norm_displacements)
        avg_min_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole",
            "min")
        avg_min_displacement = np.mean(avg_min_displacements)
        avg_leave_boundaries = calc_leaving_boundaries_rate(
            generated_processed_paths)
        success_rate = calc_success_rate(generated_processed_paths)

        # same order as eval_labels
        values = [
            avg_discounted_return, avg_undiscounted_return, avg_traj_length,
            avg_displacement, avg_min_displacement, avg_leave_boundaries,
            success_rate, avg_norm_displacement
        ]

        os.makedirs(displacement_dir, exist_ok=True)
        all_displacements = {
            'avg_displacements': avg_displacements,
            'avg_min_displacements': avg_min_displacements,
            'avg_norm_displacements': avg_norm_displacements
        }
        joblib.dump(all_displacements, displacement_dir + fn + ".pkl")

        df.loc[expr_idx] = [expr_idx] + variant_values + values
        variant_string = "v" + str(expr_idx)
        for j, value in enumerate(values):
            df_plot.loc[df_plot_idx + j] = [
                expr_idx, variant_string, eval_labels[j], value
            ]
def set_param_values(self, params):
    """Overwrite this object's parameters with the flat tensor ``params``.

    ``params`` must have exactly 5 entries along its first dimension.

    Raises:
        ValueError: if the first dimension of ``params`` has another size.
    """
    expected_size = 5
    if params.shape[0] != expected_size:
        raise ValueError("something went wrong")
    torch_utils.set_flat_params_to(self, params)
def _train_SGD(self):
    """Fit ``self.imitationModel`` with mini-batch steps and early stopping.

    The expert data is split randomly 70/30 into a training and a validation
    set. Every epoch reshuffles the training set, takes one optimizer step
    per full mini-batch, then evaluates the negative mean log-probability
    (plus an L2 term weighted by ``self.l2_reg``, added only for logging) on
    the validation set.

    Training stops once the best validation loss is no longer among the last
    25 epochs, or when all 25 stored losses equal the best loss. On stopping,
    the parameters that produced the best validation loss are restored and a
    snapshot is saved under iteration ``self.n_itr``. If the loop runs to the
    end, the model keeps the parameters of the last epoch.
    """
    patience = 25

    # TODO: we need to get here the right observations, actions and
    # next_observations for the model
    # TODO: do we train mean and variance? or only mean
    torch_input_batch, torch_output_batch = self.create_torch_var_from_paths(self.expert_data)

    # split data randomly into training and validation set (70 - 30)
    numTotalSamples = torch_input_batch.size(0)
    trainingSize = int(numTotalSamples * 0.7)
    randomIndices = np.random.permutation(np.arange(numTotalSamples))
    trainingIndices = randomIndices[:trainingSize]
    validationIndices = randomIndices[trainingSize:]
    validation_input_batch = torch_input_batch[validationIndices]
    validation_output_batch = torch_output_batch[validationIndices]
    torch_input_batch = torch_input_batch[trainingIndices]
    torch_output_batch = torch_output_batch[trainingIndices]

    best_loss = np.inf
    best_flat_parameters = None
    # sliding window of the most recent validation losses, newest first
    losses = np.full(patience, np.inf)

    with tqdm(total=self.n_itr, file=sys.stdout) as pbar:
        for epoch in range(self.n_itr + 1):
            with logger.prefix('epoch #%d | ' % epoch):
                total_batchsize = torch_input_batch.size(0)
                logger.record_tabular('Iteration', epoch)
                indices = np.random.permutation(np.arange(total_batchsize))

                if isinstance(self.imitationModel, CartPoleModel):
                    logger.record_tabular("theta", str(self.imitationModel.theta.detach().numpy()))
                    logger.record_tabular("std", str(self.imitationModel.std.detach().numpy()))

                # go through the whole batch; a trailing partial mini-batch
                # is dropped
                for k in range(total_batchsize // self.mini_batchsize):
                    idx = indices[self.mini_batchsize * k:self.mini_batchsize * (k + 1)]
                    # TODO: how about numerical stability?
                    log_prob = self.imitationModel.get_log_prob(
                        torch_input_batch[idx, :], torch_output_batch[idx, :])
                    # note that L2 regularization is in weight decay of optimizer
                    loss = -torch.mean(log_prob)  # negative: we minimize
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                # loss on the validation set
                log_prob = self.imitationModel.get_log_prob(
                    validation_input_batch, validation_output_batch)
                loss = -torch.mean(log_prob)
                # add the L2 term here so the logged loss is the full objective
                for param in self.imitationModel.parameters():
                    loss += param.pow(2).sum() * self.l2_reg
                # plain float: the tensor still tracks gradients and must not
                # be written into the NumPy window directly
                current_loss = loss.item()
                logger.record_tabular("loss", current_loss)

                losses[1:] = losses[:-1]
                losses[0] = current_loss

                if current_loss < best_loss:
                    # Snapshot only on a real improvement. Previously the
                    # snapshot was overwritten every epoch while the best
                    # loss was still inside the window, so early stopping
                    # restored the parameters of the previous epoch rather
                    # than the best ones.
                    best_loss = current_loss
                    best_flat_parameters = torch_utils.get_flat_params_from(
                        self.imitationModel).detach().numpy().copy()
                    logger.record_tabular("current_best_loss", best_loss)
                elif np.min(losses) <= best_loss and not (np.mean(losses) == best_loss):
                    # best loss is still inside the window (and the window is
                    # not entirely flat): keep training
                    logger.record_tabular("current_best_loss", best_loss)
                else:
                    pbar.close()
                    print("best loss did not decrease in last 25 steps")
                    print("saving best result...")
                    logger.log("best loss did not decrease in last 25 steps")
                    # best_flat_parameters stays None only if no finite
                    # validation loss was ever observed
                    if best_flat_parameters is not None:
                        torch_utils.set_flat_params_to(
                            self.imitationModel,
                            torch_utils.torch.from_numpy(best_flat_parameters))
                    logger.log("SGD converged")
                    logger.log("saving best result...")
                    params, torch_params = self.get_itr_snapshot(epoch)
                    # NOTE(review): only the dict mutation is guarded; the
                    # save call is assumed to sit outside the ``if`` (the
                    # collapsed source does not show the nesting) -- confirm.
                    if params is not None:
                        params["algo"] = self
                    logger.save_itr_params(self.n_itr, params, torch_params)
                    logger.log("saved")
                    break

                pbar.set_description('epoch: %d' % (1 + epoch))
                pbar.update(1)

                # save result
                logger.log("saving snapshot...")
                params, torch_params = self.get_itr_snapshot(epoch)
                if params is not None:
                    params["algo"] = self
                logger.save_itr_params(epoch, params, torch_params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)
def _train_BGFS(self):
    """Fit ``self.imitationModel`` with SciPy's bounded L-BFGS optimizer.

    The objective is the negative mean log-probability of the expert
    observation differences given ``[observation, action]``, evaluated on a
    random mini-batch of size ``self.mini_batchsize`` per call, plus an L2
    term weighted by ``self.l2_reg``. All parameters are bounded to
    ``[0, inf)``. After optimization the result is written into the model
    and a snapshot is saved under iteration ``self.n_itr``.

    Raises:
        NotImplementedError: if the model is not a ``CartPoleModel``.
        ValueError: if ``self.mode`` is not ``"imitate_env"`` (this was
            previously *returned* instead of raised, so an invalid mode
            passed silently).
    """
    if not isinstance(self.imitationModel, CartPoleModel):
        raise NotImplementedError("train BGFS can be only called with CartPoleModel")

    expert_observations = torch.from_numpy(self.expert_data["observations"]).float()
    expert_actions = torch.from_numpy(self.expert_data["actions"]).float()
    expert_obs_diff = torch.from_numpy(self.expert_data["env_infos"]["obs_diff"]).float()

    # TODO: do we train mean and variance? or only mean
    if self.mode == "imitate_env":
        model_input = torch.cat([expert_observations, expert_actions], dim=1)
        model_target = expert_obs_diff
    else:
        raise ValueError("invalid mode")

    imitation_model = self.imitationModel
    total_batchsize = model_input.size(0)

    def get_negative_likelihood_loss(flat_params):
        # objective for L-BFGS: (loss, gradient) on a random mini-batch
        torch_utils.set_flat_params_to(imitation_model, torch_utils.torch.from_numpy(flat_params))
        for param in imitation_model.parameters():
            if param.grad is not None:
                param.grad.data.fill_(0)
        indices = np.random.permutation(np.arange(total_batchsize))
        batch = indices[:self.mini_batchsize]
        loss = - torch.mean(imitation_model.get_log_prob(model_input[batch], model_target[batch]))
        # weight decay
        for param in imitation_model.parameters():
            loss += param.pow(2).sum() * self.l2_reg
        loss.backward()
        # the mean already reduces the loss to a single value, so no indexing
        return loss.detach().numpy(), \
            torch_utils.get_flat_grad_from(
                imitation_model.parameters()).detach().numpy().astype(np.float64)

    curr_itr = 0

    def callback_fun(flat_params):
        # called by the optimizer after each iteration: log full-batch loss
        nonlocal curr_itr
        torch_utils.set_flat_params_to(imitation_model, torch_utils.torch.from_numpy(flat_params))
        # calculate the loss of the whole batch
        loss = - torch.mean(imitation_model.get_log_prob(model_input, model_target))
        # weight decay
        for param in imitation_model.parameters():
            loss += param.pow(2).sum() * self.l2_reg
        # NOTE(review): this backward pass only accumulates into .grad, which
        # the objective zeroes before its next evaluation; it looks
        # unnecessary for logging but is kept as in the original.
        loss.backward()
        if isinstance(self.imitationModel, CartPoleModel):
            logger.record_tabular("theta", str(self.imitationModel.theta.detach().numpy()))
            logger.record_tabular("std", str(self.imitationModel.std.detach().numpy()))
        logger.record_tabular('Iteration', curr_itr)
        logger.record_tabular("loss", loss.item())
        logger.dump_tabular(with_prefix=False)
        curr_itr += 1

    x0 = torch_utils.get_flat_params_from(self.imitationModel).detach().numpy()
    # only allow positive variables since we know the masses and variance
    # cannot be negative
    bounds = [(0, np.inf) for _ in x0]
    flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
        get_negative_likelihood_loss, x0, maxiter=self.n_itr, bounds=bounds,
        callback=callback_fun)
    logger.log(str(opt_info))
    torch_utils.set_flat_params_to(self.imitationModel, torch.from_numpy(flat_params))

    # save result
    logger.log("saving snapshot...")
    params, torch_params = self.get_itr_snapshot(0)
    # guarded like in _train_SGD: the snapshot dict may be None
    if params is not None:
        params["algo"] = self
    logger.save_itr_params(self.n_itr, params, torch_params)
    logger.log("saved")
def step(self, policy_net, value_net, states, actions, returns, advantages):
    """Refit the critic, then take one trust-region step on the policy.

    The value network is fitted with L-BFGS-B (at most 25 iterations) on the
    mean squared error against ``returns`` plus L2 weight decay. The policy
    step direction comes from 10 conjugate-gradient iterations against a
    Hessian-vector product (Fisher-matrix based when ``self.use_fim`` is
    set, otherwise the direct KL Hessian). The direction is scaled by
    ``sqrt(2 * max_kl / (s^T H s))`` and handed to ``line_search``.

    Args:
        policy_net: policy network; must provide ``get_log_prob``,
            ``get_entropy``, ``get_kl``, ``get_fim`` and ``is_disc_action``.
        value_net: value network, called as ``value_net(states)``.
        states, actions, returns, advantages: batch data -- presumably torch
            tensors with a shared leading batch dimension (TODO confirm).

    Returns:
        The success flag reported by ``line_search``.
    """
    # ---- update critic ----
    values_target = Variable(returns)

    def get_kl_diff(old_param, new_param):
        """Return the mean KL difference between two parameter settings.

        The policy's current parameters are restored afterwards, even if
        evaluating the policy raises.
        """
        prev_params = torch_utils.get_flat_params_from(policy_net)
        try:
            with torch.no_grad():
                torch_utils.set_flat_params_to(policy_net, old_param)
                log_old_prob = torch.clamp(
                    policy_net.get_log_prob(Variable(states), Variable(actions)),
                    min=np.log(1e-6))
                torch_utils.set_flat_params_to(policy_net, new_param)
                log_new_prob = torch.clamp(
                    policy_net.get_log_prob(Variable(states), Variable(actions)),
                    min=np.log(1e-6))
                mean_kl = torch.mean(
                    torch.exp(log_old_prob) * (log_old_prob - log_new_prob))
        finally:
            torch_utils.set_flat_params_to(policy_net, prev_params)
        return mean_kl.numpy()

    def get_value_loss(flat_params):
        """Return (loss, gradient) of the critic objective at ``flat_params``."""
        torch_utils.set_flat_params_to(value_net, torch.from_numpy(flat_params))
        # Gradients accumulate across backward passes, so clear them first.
        for param in value_net.parameters():
            if param.grad is not None:
                param.grad.data.fill_(0)

        values_pred = value_net(Variable(states))
        value_loss = (values_pred - values_target).pow(2).mean()
        # weight decay
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * self.l2_reg
        value_loss.backward()

        # FIX: removed [0] since mean already reduces the loss to a scalar.
        flat_grad = torch_utils.get_flat_grad_from(value_net.parameters())
        return value_loss.data.cpu().numpy(), \
            flat_grad.data.cpu().numpy().astype(np.float64)

    flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
        get_value_loss,
        torch_utils.get_flat_params_from(value_net).cpu().numpy(),
        maxiter=25)
    torch_utils.set_flat_params_to(value_net, torch.from_numpy(flat_params))

    # ---- update policy ----
    # Log-probabilities under the pre-update policy, used as constants in the
    # importance ratio. `volatile=True` is no longer supported by PyTorch;
    # torch.no_grad() is its replacement.
    with torch.no_grad():
        fixed_log_probs = torch.clamp(
            policy_net.get_log_prob(Variable(states), Variable(actions)),
            min=np.log(1e-6)).data

    def get_loss(volatile=False):
        """Return the TRPO surrogate loss minus the entropy bonus.

        With ``volatile=True`` (as the line search calls it) the loss is
        evaluated without recording an autograd graph.
        """
        def surrogate():
            # more numerically stable: have a minimum value, s.t. we don't
            # get -inf
            log_probs = torch.clamp(
                policy_net.get_log_prob(Variable(states), Variable(actions)),
                min=np.log(1e-6))
            ent = policy_net.get_entropy(Variable(states), Variable(actions)).mean()
            action_loss = -Variable(advantages) * torch.exp(
                log_probs - Variable(fixed_log_probs))
            return action_loss.mean() - self.entropy_coeff * ent

        if volatile:
            with torch.no_grad():
                return surrogate()
        return surrogate()

    def Fvp_fim(v):
        """Hessian-vector product built from the Fisher information matrix."""
        M, mu, info = policy_net.get_fim(Variable(states))
        mu = mu.view(-1)
        filter_input_ids = set() if policy_net.is_disc_action else \
            {info['std_id']}

        # Dummy all-ones variable; differentiating w.r.t. it yields J*v.
        t = M.new(mu.size())
        t[:] = 1
        t = Variable(t, requires_grad=True)
        mu_t = (mu * t).sum()
        Jt = torch_utils.compute_flat_grad(mu_t, policy_net.parameters(),
                                           filter_input_ids=filter_input_ids,
                                           create_graph=True)
        Jtv = (Jt * Variable(v)).sum()
        Jv = torch.autograd.grad(Jtv, t, retain_graph=True)[0]
        MJv = Variable(M * Jv.data)
        mu_MJv = (MJv * mu).sum()
        JTMJv = torch_utils.compute_flat_grad(mu_MJv, policy_net.parameters(),
                                              filter_input_ids=filter_input_ids,
                                              retain_graph=True).data
        JTMJv /= states.shape[0]
        if not policy_net.is_disc_action:
            std_index = info['std_index']
            JTMJv[std_index: std_index + M.shape[0]] += \
                2 * v[std_index: std_index + M.shape[0]]
        return JTMJv + v * self.damping

    def Fvp_direct(v):
        """Hessian-vector product computed directly from the KL divergence."""
        kl = policy_net.get_kl(Variable(states))
        kl = kl.mean()

        grads = torch.autograd.grad(kl, policy_net.parameters(), create_graph=True)
        flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])

        kl_v = (flat_grad_kl * Variable(v)).sum()
        grads = torch.autograd.grad(kl_v, policy_net.parameters())
        flat_grad_grad_kl = torch.cat(
            [grad.contiguous().view(-1) for grad in grads]).data
        return flat_grad_grad_kl + v * self.damping

    Fvp = Fvp_fim if self.use_fim else Fvp_direct

    loss = get_loss()
    grads = torch.autograd.grad(loss, policy_net.parameters())
    loss_grad = torch.cat([grad.view(-1) for grad in grads]).data
    stepdir = conjugate_gradients(Fvp, -loss_grad, 10)

    shs = (stepdir.dot(Fvp(stepdir)))
    lm = np.sqrt(2 * self.max_kl / (shs + 1e-8))
    # A negative curvature estimate makes the square root NaN; fall back to
    # an unscaled step in that case.
    if np.isnan(lm):
        lm = 1.
    fullstep = stepdir * lm
    expected_improve = -loss_grad.dot(fullstep)

    prev_params = torch_utils.get_flat_params_from(policy_net)
    success, new_params = \
        line_search(policy_net, get_loss, prev_params, fullstep,
                    expected_improve, get_kl_diff, self.max_kl)
    logger.record_tabular('TRPO_linesearch_success', int(success))
    logger.record_tabular("KL_diff", get_kl_diff(prev_params, new_params))
    torch_utils.set_flat_params_to(policy_net, new_params)
    logger.log("old_parameters" + str(prev_params.detach().numpy()))
    logger.log("new_parameters" + str(new_params.detach().numpy()))
    return success