def line_search(model, f, x, fullstep, expected_improve_full, get_kl_diff, max_kl, max_backtracks=20,
                accept_ratio=0.01):
    fval = f(True).data[0]
    steps = [.5 ** n for n in range(max_backtracks)]
    tolerance = max_kl * 0.5
    # initialize so the logging after the loop is safe even if every step is skipped
    fval_new, actual_improve, mean_kl = fval, 0.0, float('inf')
    for stepfrac in steps:
        x_new = x + stepfrac * fullstep
        # check x_new for NaN; this happened when optimizing the cartpole model, where theta somehow became NaN
        if torch.sum(torch.isnan(x_new)) > 0:
            logger.log("got NaN in line search: x=%s stepfrac=%s fullstep=%s" % (x, stepfrac, fullstep))
            continue
        torch_utils.set_flat_params_to(model, x_new)
        fval_new = f(True).data[0]
        actual_improve = fval - fval_new
        mean_kl = get_kl_diff(x, x_new)
        if actual_improve > 0 and mean_kl <= max_kl + tolerance:
            return True, x_new
        logger.log("backtrack")
        if actual_improve <= 0:
            logger.log("Violated because the loss is not improving. New loss: %f Old loss: %f" % (fval_new, fval))
        if mean_kl > max_kl + tolerance:
            logger.log("Violated because the KL bound does not hold. MaxKL: %f MeanKL: %f" % (max_kl, mean_kl))
    if actual_improve <= 0:
        logger.log("Violated because the loss is not improving. New loss: %f Old loss: %f" % (fval_new, fval))
    if mean_kl > max_kl + tolerance:
        logger.log("Violated because the KL bound does not hold. MaxKL: %f MeanKL: %f" % (max_kl, mean_kl))
    return False, x
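All of these snippets move parameters in and out of models as a single flat vector through torch_utils. That module is not shown in this collection; the following is only a minimal sketch of the assumed flatten/unflatten convention (the names match the calls used here, the real implementation may differ):

def get_flat_params_from(model):
    # concatenate all parameters of the model into one 1-D tensor
    return torch.cat([param.data.view(-1) for param in model.parameters()])

def set_flat_params_to(model, flat_params):
    # copy consecutive slices of a 1-D tensor back into the parameters
    prev_ind = 0
    for param in model.parameters():
        flat_size = param.numel()
        param.data.copy_(
            flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
        prev_ind += flat_size

def get_flat_grad_from(params):
    # concatenate all gradients into one 1-D tensor
    return torch.cat([param.grad.view(-1) for param in params])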
Example #2
    def set_param_values(self, params):
        # params is a flat tensor: theta has dim 7 plus 2-dimensional noise (9 entries),
        # or 11 entries when the motor is trained as well
        if not self.trainMotor and params.shape[0] != 9:
            raise ValueError("expected a flat parameter tensor of length 9, got %d" % params.shape[0])
        elif self.trainMotor and params.shape[0] != 11:
            raise ValueError("expected a flat parameter tensor of length 11, got %d" % params.shape[0])

        torch_utils.set_flat_params_to(self, params)
Example #7
def summarize_and_plot_results(folderName,
                               iteration,
                               sample_n_traj,
                               policy_path,
                               expert_data_path,
                               plot_trajectories=True):
    experiment_folder_name = str(folderName) + "/"

    eval_labels = [
        'avg_discounted_return', 'avg_undiscounted_return', 'avg_traj_length',
        'avg_displacement', 'avg_min_displacement', 'avg_boundaries_left',
        'avg_success_rate', 'avg_norm_displacement'
    ]

    print(experiment_folder_name)

    num_experiment_trial = 0
    # find all trials of the experiment
    for fn in os.listdir(experiment_folder_name):
        path = os.path.join(experiment_folder_name, fn)
        model_path = os.path.join(path, "itr_0.pkl")
        if not os.path.exists(model_path) and not os.path.exists(
                os.path.join(path, "itr_" + str(iteration) + ".pkl")):
            continue
        if num_experiment_trial == 0:
            # read one variant.json as a sample to obtain the columns for the data frame
            fileName = path + '/variant.json'
            with open(fileName, 'r') as read_file:
                data = json.load(read_file)
                col = ['experiment_itr']
                variant_keys = data.keys()
                col += data.keys()
                col += eval_labels
        num_experiment_trial += 1

    # create an empty data frame for the results
    df = pd.DataFrame(index=np.arange(0, num_experiment_trial), columns=col)

    # create different data frame for plotting
    eval_measures_label = eval_labels
    num_eval_measures = len(eval_measures_label)
    df_plot = pd.DataFrame(
        index=np.arange(0, num_experiment_trial * num_eval_measures),
        columns=('expr_itr', 'variant', 'eval_type', 'value'))

    expr_idx = 0
    df_plot_idx = 0

    # create a data_frame which is used to plot results of the training iterations
    eval_measures_itr_label = ['NumTrajs', 'AverageReturn', 'Entropy']
    dataFramePlotItr = []

    for fn in sorted(os.listdir(experiment_folder_name)):
        path = os.path.join(experiment_folder_name, fn)

        model_path = os.path.join(path, "itr_0.pkl")
        if not os.path.exists(model_path) and not os.path.exists(
                os.path.join(path, "itr_" + str(iteration) + ".pkl")):
            continue
        # check if we have a model for the iteration, if yes load this
        if os.path.exists(os.path.join(path,
                                       "itr_" + str(iteration) + ".pkl")):
            model_path = os.path.join(path, "itr_" + str(iteration) + ".pkl")
        else:
            # take the newest iteration we have
            for itr in range(iteration, 0, -1):
                if os.path.exists(
                        os.path.join(path, "itr_" + str(itr) + ".pkl")):
                    model_path = os.path.join(path, "itr_" + str(itr) + ".pkl")
                    break
        model_param_path = "empty"
        # check if we have a model parameters
        if os.path.exists(
                os.path.join(path, "itr_" + str(iteration) + "_model.pkl")):
            model_param_path = os.path.join(
                path, "itr_" + str(iteration) + "_model.pkl")
        else:
            # take the newest iteration we have
            for itr in range(iteration, 0, -1):
                if os.path.exists(
                        os.path.join(path, "itr_" + str(itr) + "_model.pkl")):
                    model_param_path = os.path.join(
                        path, "itr_" + str(itr) + "_model.pkl")
                    break

        fileName = path + '/variant.json'
        with open(fileName, 'r') as read_file:
            data = json.load(read_file)

        # get the values and convert them to strings to store them more easily
        variant_values = [
            str(data[x]) if x in data else "None" for x in variant_keys
        ]

        if os.stat(path + '/progress.csv').st_size > 0:
            # only process csv if it is not empty
            unprocessed_df = pd.read_csv(path + '/progress.csv')
            temp_frames = []
            isGailExperiment = True
            try:
                for label in eval_measures_itr_label:
                    temp_frame = pd.DataFrame(unprocessed_df[label])
                    temp_frame.columns = ['value']
                    temp_frame['eval_type'] = label
                    temp_frames.append(temp_frame)
                processed_df = pd.concat(temp_frames)
                processed_df['Iteration'] = unprocessed_df['Iteration']
                processed_df['expr_idx'] = expr_idx
                processed_df['variant'] = "v" + str(expr_idx)
                dataFramePlotItr.append(processed_df)
            except KeyError:
                isGailExperiment = False

        # load the policy used for the Gaussian noise experiments (sigma^2: 0.05 and 0.01)
        data = joblib.load(policy_path)
        policy = data['policy']
        # policy.action_log_std = torch.nn.Parameter(
        #     torch.zeros(1, 1))  # set variance to zero to have a deterministic policy
        policy.normalized_input = [False, False, False, False]
        policy.normalized_output = [True]

        shooting_experiment = False
        # load learned model
        model = joblib.load(model_path)
        if 'imitationModel' in model:
            imitation_env = model['imitationModel']
        elif 'imitationEnv' in model:
            imitation_env = model['imitationEnv']
        elif 'parameters' in model:
            learned_params = model['parameters'].squeeze(0)
            shooting_experiment = True
            # create a new environment and set the parameters to the learned ones
            from rllab.dynamic_models.cartpole_model import CartPoleModel
            imitation_env = CartPoleModel()
            # set the variance to 0 to have a deterministic environment
            imitation_env.set_param_values(
                torch.from_numpy(np.concatenate([learned_params,
                                                 np.zeros(2)])).float())
            # use original variance to have a stochastic environment
            # get variance from true env
            # print(env.std)
            # imitation_env.set_param_values(torch.from_numpy(np.concatenate([learned_params, env.std])).float())
        if 'upperLevelPolicy' in model:
            upperLevelPolicy = model['upperLevelPolicy']

            torch_utils.set_flat_params_to(imitation_env,
                                           upperLevelPolicy.mean)
        elif not shooting_experiment:
            imitation_env.load_state_dict(torch.load(model_param_path))

        # load fixed expert trajectories
        expert_paths = joblib.load(expert_data_path)

        # collect a batch of expert_data containing n trajectories
        sampler = TrajSampler(policy,
                              imitation_env,
                              sample_n_traj,
                              500,
                              discount=0.995,
                              useImitationPolicy=False,
                              useImitationEnv=False,
                              terminate_only_max_path=True)
        expert_processed_paths = sampler.process_samples(0, expert_paths)

        ## check if we need to do the rollouts or if we can just load in the processed_paths
        if os.path.exists(os.path.join(path, "itr_" + str(iteration) + "_n_traj_" + str(sample_n_traj) + "_generated_paths.pkl")) and \
                os.path.exists(os.path.join(path, "itr_" + str(iteration) + "_n_traj_" + str(sample_n_traj) + "_generated_processed_paths.pkl")):
            generated_paths = joblib.load(
                os.path.join(
                    path, "itr_" + str(iteration) + "_n_traj_" +
                    str(sample_n_traj) + "_generated_paths.pkl"))
            generated_processed_paths = joblib.load(
                os.path.join(
                    path, "itr_" + str(iteration) + "_n_traj_" +
                    str(sample_n_traj) + "_generated_processed_paths.pkl"))
        else:
            sampler.start_worker()
            generated_paths = sampler.obtain_samples(0)
            generated_processed_paths = sampler.process_samples(
                0, generated_paths)
            # save processed paths s.t. we don't need to do the rollouts every time
            joblib.dump(generated_paths,
                        os.path.join(
                            path, "itr_" + str(iteration) + "_n_traj_" +
                            str(sample_n_traj) + "_generated_paths.pkl"),
                        compress=3)
            joblib.dump(
                generated_processed_paths,
                os.path.join(
                    path, "itr_" + str(iteration) + "_n_traj_" +
                    str(sample_n_traj) + "_generated_processed_paths.pkl"),
                compress=3)

        avg_discounted_return = sampler.calc_avg_discounted_return(
            generated_processed_paths)
        avg_undiscounted_return = sampler.calc_avg_undiscounted_return(
            generated_processed_paths)
        avg_traj_length = sampler.calc_avg_traj_length(
            generated_processed_paths)

        avg_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole")
        avg_displacement = np.mean(avg_displacements)
        avg_norm_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths,
            generated_processed_paths,
            500,
            "CartPole",
            normalize=True)
        avg_norm_displacement = np.mean(avg_norm_displacements)
        avg_min_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole",
            "min")
        avg_min_displacement = np.mean(avg_min_displacements)
        avg_leave_boundaries = calc_leaving_boundaries_rate(
            generated_processed_paths)
        success_rate = calc_success_rate(generated_processed_paths)

        values = [
            avg_discounted_return, avg_undiscounted_return, avg_traj_length,
            avg_displacement, avg_min_displacement, avg_leave_boundaries,
            success_rate, avg_norm_displacement
        ]

        if not os.path.exists(experiment_folder_name + "displacements_" +
                              str(iteration) + "/"):
            os.makedirs(experiment_folder_name + "displacements_" +
                        str(iteration) + "/")
        all_displacements = {
            'avg_displacements': avg_displacements,
            'avg_min_displacements': avg_min_displacements,
            'avg_norm_displacements': avg_norm_displacements
        }
        joblib.dump(
            all_displacements, experiment_folder_name + "displacements_" +
            str(iteration) + "/" + fn + ".pkl")

        df.loc[expr_idx] = [expr_idx] + variant_values + values

        for j, value in zip(range(num_eval_measures), values):
            # build variant string
            variant_string = "v" + str(expr_idx)
            df_plot.loc[df_plot_idx + j] = [
                expr_idx, variant_string, eval_measures_label[j], value
            ]

        expr_idx += 1
        df_plot_idx += num_eval_measures

        # write results to folder for plotting
        df_plot.to_csv(experiment_folder_name + "results_plot_frame_" +
                       str(iteration) + ".csv",
                       sep=',',
                       encoding='utf-8')

        if plot_trajectories:
            plot_n_traj = 20  # plot the same number as in the supplement
            # check if the plots already exist; if they do, skip plotting
            if os.path.exists(experiment_folder_name + "no_cut_"+ str(iteration)+"/" + fn + ".png") and \
                    os.path.exists(experiment_folder_name + "cut_"+ str(iteration)+"/" + fn + ".png"):
                continue

            # create a plot with plot_n_traj trajectories from the real env and the learned env
            sns.set(context="notebook", style="darkgrid")
            title = ['x_pos', 'x_dot', 'sin theta', 'cos theta', 'theta_dot']
            fig, axes = plt.subplots(len(title), 1, tight_layout=True)
            # the size of A4 paper + 4 inch additional height
            fig.set_size_inches(11.7, 12.27)
            # merge all observations of all paths into one big data frame
            dataFrames = []
            print("building data frame")
            for i in range(plot_n_traj):
                trajlen = expert_paths[i]["observations"].shape[0]
                modObservations = np.zeros((trajlen, len(title)))
                modObservations[:, 0:2] = expert_paths[i]["observations"][:, 0:2]
                modObservations[:, 2] = np.sin(expert_paths[i]["observations"][:, 2])
                modObservations[:, 3] = np.cos(expert_paths[i]["observations"][:, 2])
                modObservations[:, 4] = expert_paths[i]["observations"][:, 3]
                trajDataFrame = pd.DataFrame(
                    np.concatenate([
                        i * np.ones(trajlen)[:, np.newaxis],
                        np.arange(trajlen)[:, np.newaxis],
                        modObservations
                    ], axis=1),
                    columns=['traj', 'timestep'] + title)
                dataFrames.append(trajDataFrame)
            all_traj_true_env = pd.concat(dataFrames)
            all_traj_true_env["model"] = "true environment"

            # now add plot_n_traj trajectories from the learned env
            dataFrames = []
            for i in range(plot_n_traj):
                trajlen = generated_paths[i]["observations"].shape[0]
                modObservations = np.zeros((trajlen, len(title)))
                modObservations[:, 0:2] = generated_paths[i]["observations"][:, 0:2]
                modObservations[:, 2] = np.sin(generated_paths[i]["observations"][:, 2])
                modObservations[:, 3] = np.cos(generated_paths[i]["observations"][:, 2])
                modObservations[:, 4] = generated_paths[i]["observations"][:, 3]
                trajDataFrame = pd.DataFrame(
                    np.concatenate([
                        i * np.ones(trajlen)[:, np.newaxis],
                        np.arange(trajlen)[:, np.newaxis],
                        modObservations
                    ], axis=1),
                    columns=['traj', 'timestep'] + title)
                dataFrames.append(trajDataFrame)
            all_traj_learned_env = pd.concat(dataFrames)
            all_traj_learned_env["model"] = "learned environment"
            all_traj = pd.concat([all_traj_true_env, all_traj_learned_env])

            print("finished building data frame")

            print("start plotting")
            for i in range(len(title)):
                ax = sns.lineplot(x='timestep',
                                  y=title[i],
                                  hue="model",
                                  estimator=None,
                                  units='traj',
                                  data=all_traj,
                                  ax=axes[i],
                                  legend=False)
                if i == 0:
                    # add 2 dashed lines for the boundary
                    ax.plot([0, 500], [3, 3], color='k', linestyle='--')
                    ax.plot([0, 500], [-3, -3], color='k', linestyle='--')
                ax.set_xlim(-5, 505)
                ax.set_title(title[i])
            plt.savefig(path + "/cartpole_results.png",
                        bbox_inches='tight',
                        pad_inches=0.1)

            plt.close()
Example #8
def summarize_and_plot_results(folderPath,
                               iteration,
                               sample_n_traj,
                               policy_path,
                               expert_data_path,
                               plot_trajectories=True):
    experiment_folder_name = str(folderPath.parent) + "/"

    print(experiment_folder_name)

    eval_labels = [
        'avg_discounted_return', 'avg_undiscounted_return', 'avg_traj_length',
        'avg_displacement', 'avg_min_displacement', 'avg_boundaries_left',
        'avg_success_rate', 'avg_norm_displacement'
    ]

    num_experiment_trial = 0
    # find all trials of the experiment
    for fn in os.listdir(experiment_folder_name):
        path = os.path.join(experiment_folder_name, fn)
        model_path = os.path.join(path, "itr_0.pkl")
        if not os.path.exists(model_path) and not os.path.exists(
                os.path.join(path, "itr_" + str(iteration) + ".pkl")):
            continue
        if num_experiment_trial == 0:
            # read one variant.json testwise to obtain the columns for the data frame
            fileName = path + '/variant.json'
            with open(fileName, 'r') as read_file:
                data = json.load(read_file)
                col = ['experiment_itr']
                variant_keys = data.keys()
                col += data.keys()
                col += eval_labels
        num_experiment_trial += 1

    # create an empty data frame for the results
    df = pd.DataFrame(index=np.arange(0, num_experiment_trial), columns=col)

    # create different data frame for plotting
    eval_measures_label = eval_labels
    num_eval_measures = len(eval_measures_label)
    df_plot = pd.DataFrame(
        index=np.arange(0, num_experiment_trial * num_eval_measures),
        columns=('expr_itr', 'variant', 'eval_type', 'value'))

    expr_idx = 0
    df_plot_idx = 0

    # create a data_frame which is used to plot results of the training iterations
    eval_measures_itr_label = ['NumTrajs', 'AverageReturn', 'Entropy']
    dataFramePlotItr = []

    for fn in sorted(os.listdir(experiment_folder_name)):
        path = os.path.join(experiment_folder_name, fn)

        model_path = os.path.join(path, "itr_0.pkl")
        if not os.path.exists(model_path) and not os.path.exists(
                os.path.join(path, "itr_" + str(iteration) + ".pkl")):
            continue
        # check if we have a model for the iteration, if yes load this
        if os.path.exists(os.path.join(path,
                                       "itr_" + str(iteration) + ".pkl")):
            model_path = os.path.join(path, "itr_" + str(iteration) + ".pkl")
        else:
            # take the newest iteration we have
            for itr in range(iteration, 0, -1):
                if os.path.exists(
                        os.path.join(path, "itr_" + str(itr) + ".pkl")):
                    model_path = os.path.join(path, "itr_" + str(itr) + ".pkl")
                    break
        model_param_path = "empty"
        # check if we have a model parameters
        if os.path.exists(
                os.path.join(path, "itr_" + str(iteration) + "_model.pkl")):
            model_param_path = os.path.join(
                path, "itr_" + str(iteration) + "_model.pkl")
        else:
            # take the newest iteration we have
            for itr in range(iteration, 0, -1):
                if os.path.exists(
                        os.path.join(path, "itr_" + str(itr) + "_model.pkl")):
                    model_param_path = os.path.join(
                        path, "itr_" + str(itr) + "_model.pkl")
                    break

        print(model_path)
        print(model_param_path)

        fileName = path + '/variant.json'
        with open(fileName, 'r') as read_file:
            data = json.load(read_file)

        # get the values and convert them to strings to store them more easily
        variant_values = [
            str(data[x]) if x in data else "None" for x in variant_keys
        ]

        if os.stat(path + '/progress.csv').st_size > 0:
            # only process csv if it is not empty
            unprocessed_df = pd.read_csv(path + '/progress.csv')
            temp_frames = []
            isGailExperiment = True
            try:
                for label in eval_measures_itr_label:
                    temp_frame = pd.DataFrame(unprocessed_df[label])
                    temp_frame.columns = ['value']
                    temp_frame['eval_type'] = label
                    temp_frames.append(temp_frame)
                processed_df = pd.concat(temp_frames)
                processed_df['Iteration'] = unprocessed_df['Iteration']
                processed_df['expr_idx'] = expr_idx
                processed_df['variant'] = "v" + str(expr_idx)
                dataFramePlotItr.append(processed_df)
            except KeyError:
                isGailExperiment = False

        # load the policy used for the Gaussian noise experiments (sigma^2: 0.05 and 0.01)
        data = joblib.load(policy_path)
        policy = data['policy']
        # policy.action_log_std = torch.nn.Parameter(
        #     torch.zeros(1, 1))  # set variance to zero to have a deterministic policy
        policy.normalized_input = [False, False, False, False]
        policy.normalized_output = [True]

        shooting_experiment = False
        # load learned model
        model = joblib.load(model_path)
        if 'imitationModel' in model:
            imitation_env = model['imitationModel']
        elif 'imitationEnv' in model:
            imitation_env = model['imitationEnv']
        if 'parameters' in model:
            learned_params = model['parameters'].squeeze(0)
            shooting_experiment = True
            # create a new environment and set the parameters to the learned ones
            from rllab.dynamic_models.cartpole_model import CartPoleModel
            imitation_env = CartPoleModel()
            # set the variance to 0 to have a deterministic environment
            imitation_env.set_param_values(
                torch.from_numpy(np.concatenate([learned_params,
                                                 np.zeros(2)])).float())
            # use original variance to have a stochastic environment
            # get variance from true env
            # print(env.std)
            # imitation_env.set_param_values(torch.from_numpy(np.concatenate([learned_params, env.std])).float())
        if 'upperLevelPolicy' in model:
            upperLevelPolicy = model['upperLevelPolicy']
            print("found upper level policy; setting parameters to its mean")
            print("mean", upperLevelPolicy.mean)
            torch_utils.set_flat_params_to(imitation_env,
                                           upperLevelPolicy.mean)
        elif not shooting_experiment:
            imitation_env.load_state_dict(torch.load(model_param_path))

        print("theta", imitation_env.theta)
        print("std", imitation_env.std)

        # load fixed expert trajectories
        expert_paths = joblib.load(expert_data_path)

        # collect a batch of expert_data containing n trajectories
        sampler = TrajSampler(policy,
                              imitation_env,
                              sample_n_traj,
                              500,
                              discount=0.995,
                              useImitationPolicy=False,
                              useImitationEnv=False,
                              terminate_only_max_path=True)
        expert_processed_paths = sampler.process_samples(0, expert_paths)

        ## check if we need to do the rollouts or if we can just load in the processed_paths
        if os.path.exists(os.path.join(path, "itr_" + str(iteration) + "_n_traj_" + str(sample_n_traj) + "_generated_paths.pkl")) and \
                os.path.exists(os.path.join(path, "itr_" + str(iteration) + "_n_traj_" + str(sample_n_traj) + "_generated_processed_paths.pkl")):
            generated_paths = joblib.load(
                os.path.join(
                    path, "itr_" + str(iteration) + "_n_traj_" +
                    str(sample_n_traj) + "_generated_paths.pkl"))
            generated_processed_paths = joblib.load(
                os.path.join(
                    path, "itr_" + str(iteration) + "_n_traj_" +
                    str(sample_n_traj) + "_generated_processed_paths.pkl"))
        else:
            sampler.start_worker()
            generated_paths = sampler.obtain_samples(0)
            generated_processed_paths = sampler.process_samples(
                0, generated_paths)
            # save processed paths s.t. we don't need to do the rollouts every time
            joblib.dump(generated_paths,
                        os.path.join(
                            path, "itr_" + str(iteration) + "_n_traj_" +
                            str(sample_n_traj) + "_generated_paths.pkl"),
                        compress=3)
            joblib.dump(
                generated_processed_paths,
                os.path.join(
                    path, "itr_" + str(iteration) + "_n_traj_" +
                    str(sample_n_traj) + "_generated_processed_paths.pkl"),
                compress=3)

        avg_discounted_return = sampler.calc_avg_discounted_return(
            generated_processed_paths)
        avg_undiscounted_return = sampler.calc_avg_undiscounted_return(
            generated_processed_paths)
        avg_traj_length = sampler.calc_avg_traj_length(
            generated_processed_paths)

        avg_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole")
        avg_displacement = np.mean(avg_displacements)
        avg_norm_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths,
            generated_processed_paths,
            500,
            "CartPole",
            normalize=True)
        avg_norm_displacement = np.mean(avg_norm_displacements)
        avg_min_displacements = calc_avg_displacement_timesteps(
            expert_processed_paths, generated_processed_paths, 500, "CartPole",
            "min")
        avg_min_displacement = np.mean(avg_min_displacements)
        avg_leave_boundaries = calc_leaving_boundaries_rate(
            generated_processed_paths)
        success_rate = calc_success_rate(generated_processed_paths)

        values = [
            avg_discounted_return, avg_undiscounted_return, avg_traj_length,
            avg_displacement, avg_min_displacement, avg_leave_boundaries,
            success_rate, avg_norm_displacement
        ]

        if not os.path.exists(experiment_folder_name + "displacements_" +
                              str(iteration) + "/"):
            os.makedirs(experiment_folder_name + "displacements_" +
                        str(iteration) + "/")
        all_displacements = {
            'avg_displacements': avg_displacements,
            'avg_min_displacements': avg_min_displacements,
            'avg_norm_displacements': avg_norm_displacements
        }
        joblib.dump(
            all_displacements, experiment_folder_name + "displacements_" +
            str(iteration) + "/" + fn + ".pkl")

        df.loc[expr_idx] = [expr_idx] + variant_values + values

        for j, value in zip(range(num_eval_measures), values):
            # build variant string
            variant_string = "v" + str(expr_idx)
            df_plot.loc[df_plot_idx + j] = [
                expr_idx, variant_string, eval_measures_label[j], value
            ]

        expr_idx += 1
        df_plot_idx += num_eval_measures
Example #9
    def set_param_values(self, params):
        # params is a flat tensor of shape 5: theta has dim 3 and we have 2-dimensional noise
        if not params.shape[0] == 5:
            raise ValueError("expected a flat parameter tensor of length 5, got %d" % params.shape[0])

        torch_utils.set_flat_params_to(self, params)
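For illustration, a hypothetical call to this setter (the numbers are made up; the 5-entry layout follows the shape check above and mirrors how Example #7 configures a CartPoleModel):

model = CartPoleModel()
# 3 theta entries followed by 2 noise entries; zeros for the noise
# entries make the model deterministic, as in the shooting-experiment
# branch of Example #7
model.set_param_values(
    torch.from_numpy(np.concatenate([np.array([1.0, 0.1, 0.5]),
                                     np.zeros(2)])).float())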
Example #10
    def _train_SGD(self):

        # TODO: we need to get the right observations, actions and next_observations for the model here
        # expert_observations, expert_actions, expert_next_observations = create_torch_var_from_paths(self.expert_data)
        # now train the imitation policy on the collected batch of expert_data with MLE on the log prob, since we have a Gaussian
        # TODO: do we train mean and variance, or only the mean?

        torch_input_batch, torch_output_batch = self.create_torch_var_from_paths(self.expert_data)

        # split the data randomly into a training and a validation set (70/30 split)
        numTotalSamples = torch_input_batch.size(0)
        trainingSize = int(numTotalSamples * 0.7)
        randomIndices = np.random.permutation(np.arange(numTotalSamples))
        trainingIndices = randomIndices[:trainingSize]
        validationIndices = randomIndices[trainingSize:]
        validation_input_batch = torch_input_batch[validationIndices]
        validation_output_batch = torch_output_batch[validationIndices]
        torch_input_batch = torch_input_batch[trainingIndices]
        torch_output_batch = torch_output_batch[trainingIndices]

        best_loss = np.inf
        losses = np.array([best_loss] * 25)
        with tqdm(total=self.n_itr, file=sys.stdout) as pbar:
            for epoch in range(self.n_itr+1):
                with logger.prefix('epoch #%d | ' % epoch):
                    # split into mini batches for training
                    total_batchsize = torch_input_batch.size(0)

                    logger.record_tabular('Iteration', epoch)
                    indices = np.random.permutation(np.arange(total_batchsize))
                    if isinstance(self.imitationModel, CartPoleModel):
                        logger.record_tabular("theta", str(self.imitationModel.theta.detach().numpy()))
                        logger.record_tabular("std", str(self.imitationModel.std.detach().numpy()))
                    # go through the whole batch
                    for k in range(int(total_batchsize/self.mini_batchsize)):
                        idx = indices[self.mini_batchsize*k:self.mini_batchsize*(k+1)]
                        # TODO: how about numerical stability?

                        log_prob = self.imitationModel.get_log_prob(torch_input_batch[idx, :], torch_output_batch[idx, :])

                        # note that L2 regularization is in weight decay of optimizer
                        loss = -torch.mean(log_prob)  # negative since we want to minimize rather than maximize
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()

                    # calculate the loss on the whole batch
                    log_prob = self.imitationModel.get_log_prob(validation_input_batch, validation_output_batch)
                    loss = -torch.mean(log_prob)
                    # Note: here we add L2 regularization to the loss to log the proper loss
                    # weight decay
                    for param in self.imitationModel.parameters():
                        loss += param.pow(2).sum() * self.l2_reg
                    logger.record_tabular("loss", loss.item())

                    # check if loss has decreased in the last 25 itr on the validation set, if not stop training
                    # and return the best found parameters
                    # shift the window: index 0 holds the newest validation loss
                    losses[1:] = losses[0:-1]
                    losses[0] = loss.item()

                    if epoch == 0:
                        best_loss = np.min(losses)
                        best_flat_parameters = torch_utils.get_flat_params_from(self.imitationModel).detach().numpy()
                        logger.record_tabular("current_best_loss", best_loss)
                    elif np.min(losses) <= best_loss and not np.mean(losses) == best_loss:
                        # set best loss to the new one if smaller; the second condition
                        # forces a stop once the whole window holds the same value
                        best_loss = np.min(losses)
                        best_flat_parameters = torch_utils.get_flat_params_from(self.imitationModel).detach().numpy()
                        logger.record_tabular("current_best_loss", best_loss)
                    else:
                        pbar.close()
                        print("best loss did not decrease in last 25 steps")
                        print("saving best result...")
                        logger.log("best loss did not decrease in last 25 steps")
                        torch_utils.set_flat_params_to(self.imitationModel, torch_utils.torch.from_numpy(best_flat_parameters))
                        logger.log("SGD converged")
                        logger.log("saving best result...")
                        params, torch_params = self.get_itr_snapshot(epoch)
                        if params is not None:
                            params["algo"] = self
                        logger.save_itr_params(self.n_itr, params, torch_params)
                        logger.log("saved")
                        break

                    pbar.set_description('epoch: %d' % (1 + epoch))
                    pbar.update(1)

                # save result
                logger.log("saving snapshot...")
                params, torch_params = self.get_itr_snapshot(epoch)
                if params is not None:
                    params["algo"] = self
                logger.save_itr_params(epoch, params, torch_params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)
Example #11
    def _train_BGFS(self):
        if not isinstance(self.imitationModel, CartPoleModel):
            raise NotImplementedError("_train_BGFS can only be called with a CartPoleModel")
        expert_observations = torch.from_numpy(self.expert_data["observations"]).float()
        expert_actions = torch.from_numpy(self.expert_data["actions"]).float()
        expert_obs_diff = torch.from_numpy(self.expert_data["env_infos"]["obs_diff"]).float()
        # now train the imitation policy on the collected batch of expert_data with MLE on the log prob, since we have a Gaussian
        # TODO: do we train mean and variance, or only the mean?

        if self.mode == "imitate_env":
            input = torch.cat([expert_observations, expert_actions], dim=1)
            output = expert_obs_diff
        else:
            raise ValueError("invalid mode: %s" % self.mode)

        imitation_model = self.imitationModel
        total_batchsize = input.size(0)

        def get_negative_likelihood_loss(flat_params):
            torch_utils.set_flat_params_to(imitation_model, torch_utils.torch.from_numpy(flat_params))
            for param in imitation_model.parameters():
                if param.grad is not None:
                    param.grad.data.fill_(0)

            indices = np.random.permutation(np.arange(total_batchsize))

            loss = - torch.mean(imitation_model.get_log_prob(input[indices[:self.mini_batchsize]], output[indices[:self.mini_batchsize]]))

            # weight decay
            for param in imitation_model.parameters():
                loss += param.pow(2).sum() * self.l2_reg
            loss.backward()

            # FIX: removed [0] since mean already reduces the loss to a scalar (behavior change in newer torch versions)
            return loss.detach().numpy(), \
                   torch_utils.get_flat_grad_from(
                       imitation_model.parameters()).detach().numpy(). \
                       astype(np.float64)

        curr_itr = 0

        def callback_fun(flat_params):
            nonlocal curr_itr
            torch_utils.set_flat_params_to(imitation_model, torch_utils.torch.from_numpy(flat_params))
            # calculate the loss of the whole batch
            loss = - torch.mean(imitation_model.get_log_prob(input, output))
            # weight decay
            for param in imitation_model.parameters():
                loss += param.pow(2).sum() * self.l2_reg
            loss.backward()
            if isinstance(self.imitationModel, CartPoleModel):
                logger.record_tabular("theta", str(self.imitationModel.theta.detach().numpy()))
                logger.record_tabular("std", str(self.imitationModel.std.detach().numpy()))
            logger.record_tabular('Iteration', curr_itr)
            logger.record_tabular("loss", loss.item())
            logger.dump_tabular(with_prefix=False)
            curr_itr += 1

        x0 = torch_utils.get_flat_params_from(self.imitationModel).detach().numpy()
        # only allow positive values since we know the masses and the variance cannot be negative
        bounds = [(0, np.inf) for _ in x0]

        flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
            get_negative_likelihood_loss,
            x0, maxiter=self.n_itr, bounds=bounds, callback=callback_fun)
        logger.log(str(opt_info))
        torch_utils.set_flat_params_to(self.imitationModel, torch.from_numpy(flat_params))

        # save result
        logger.log("saving snapshot...")
        params, torch_params = self.get_itr_snapshot(0)
        params["algo"] = self
        logger.save_itr_params(self.n_itr, params, torch_params)
        logger.log("saved")
    def step(self, policy_net, value_net, states, actions, returns, advantages):

        """update critic"""
        values_target = Variable(returns)

        """calculates the mean kl difference between 2 parameter settings"""
        def get_kl_diff(old_param, new_param):
            prev_params = torch_utils.get_flat_params_from(policy_net)
            with torch.no_grad():
                torch_utils.set_flat_params_to(policy_net, old_param)
                log_old_prob = torch.clamp(policy_net.get_log_prob(
                    Variable(states, volatile=True), Variable(actions)), min=np.log(1e-6))
                torch_utils.set_flat_params_to(policy_net, new_param)
                log_new_prob = torch.clamp(policy_net.get_log_prob(
                    Variable(states, volatile=True), Variable(actions)), min=np.log(1e-6))
            torch_utils.set_flat_params_to(policy_net, prev_params)
            return torch.mean(torch.exp(log_old_prob) * (log_old_prob-log_new_prob)).numpy()

        def get_value_loss(flat_params):
            torch_utils.set_flat_params_to(value_net,
                                           torch_utils.torch.from_numpy(flat_params))
            for param in value_net.parameters():
                if param.grad is not None:
                    param.grad.data.fill_(0)
            values_pred = value_net(Variable(states))
            value_loss = (values_pred - values_target).pow(2).mean()

            # weight decay
            for param in value_net.parameters():
                value_loss += param.pow(2).sum() * self.l2_reg
            value_loss.backward()

            # FIX: removed [0] since mean already reduces the loss to a scalar (behavior change in newer torch versions)
            return value_loss.data.cpu().numpy(), \
                   torch_utils.get_flat_grad_from(
                       value_net.parameters()).data.cpu().numpy(). \
                       astype(np.float64)

        flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
            get_value_loss,
            torch_utils.get_flat_params_from(value_net).cpu().numpy(), maxiter=25)
        torch_utils.set_flat_params_to(value_net, torch.from_numpy(flat_params))

        """update policy"""
        fixed_log_probs = torch.clamp(policy_net.get_log_prob(
            Variable(states, volatile=True), Variable(actions)), min=np.log(1e-6)).data
        """define the loss function for TRPO"""

        def get_loss(volatile=False):
            # more numerically stable: clamp to a minimum value so that we don't get -inf
            log_probs = torch.clamp(policy_net.get_log_prob(
                Variable(states, volatile=volatile), Variable(actions)), min=np.log(1e-6))
            ent = policy_net.get_entropy(Variable(states, volatile=volatile), Variable(actions)).mean()
            action_loss = -Variable(advantages) * torch.exp(
                log_probs - Variable(fixed_log_probs))
            # logger.log("advantage"+str(advantages))
            # logger.log("log_probs"+str(log_probs))
            # logger.log("mean"+str(torch.mean(torch.exp(
            #     log_probs - Variable(fixed_log_probs)))))
            # logger.log("action_loss_no_mean"+str(-action_loss))
            return action_loss.mean() - self.entropy_coeff * ent

        """use fisher information matrix for Hessian*vector"""

        def Fvp_fim(v):
            M, mu, info = policy_net.get_fim(Variable(states))
            mu = mu.view(-1)
            filter_input_ids = set() if policy_net.is_disc_action else \
                {info['std_id']}

            t = M.new(mu.size())
            t[:] = 1
            t = Variable(t, requires_grad=True)
            mu_t = (mu * t).sum()
            Jt = torch_utils.compute_flat_grad(mu_t, policy_net.parameters(),
                                               filter_input_ids=filter_input_ids,
                                               create_graph=True)
            Jtv = (Jt * Variable(v)).sum()
            Jv = torch.autograd.grad(Jtv, t, retain_graph=True)[0]
            MJv = Variable(M * Jv.data)
            mu_MJv = (MJv * mu).sum()
            JTMJv = torch_utils.compute_flat_grad(mu_MJv, policy_net.parameters(),
                                                  filter_input_ids=filter_input_ids,
                                                  retain_graph=True).data
            JTMJv /= states.shape[0]
            if not policy_net.is_disc_action:
                std_index = info['std_index']
                JTMJv[std_index: std_index + M.shape[0]] += \
                    2 * v[std_index: std_index + M.shape[0]]
            return JTMJv + v * self.damping

        """directly compute Hessian*vector from KL"""

        def Fvp_direct(v):
            kl = policy_net.get_kl(Variable(states))
            kl = kl.mean()

            grads = torch.autograd.grad(kl, policy_net.parameters(),
                                        create_graph=True)
            flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])

            kl_v = (flat_grad_kl * Variable(v)).sum()
            grads = torch.autograd.grad(kl_v, policy_net.parameters())
            flat_grad_grad_kl = torch.cat(
                [grad.contiguous().view(-1) for grad in grads]).data

            return flat_grad_grad_kl + v * self.damping

        Fvp = Fvp_fim if self.use_fim else Fvp_direct

        loss = get_loss()
        grads = torch.autograd.grad(loss, policy_net.parameters())
        loss_grad = torch.cat([grad.view(-1) for grad in grads]).data
        stepdir = conjugate_gradients(Fvp, -loss_grad, 10)

        shs = (stepdir.dot(Fvp(stepdir)))
        lm = np.sqrt(2 * self.max_kl / (shs + 1e-8))
        if np.isnan(lm):
            lm = 1.
        fullstep = stepdir * lm
        expected_improve = -loss_grad.dot(fullstep)

        prev_params = torch_utils.get_flat_params_from(policy_net)
        success, new_params = \
            line_search(policy_net, get_loss, prev_params, fullstep, expected_improve, get_kl_diff, self.max_kl)
        logger.record_tabular('TRPO_linesearch_success', int(success))
        logger.record_tabular("KL_diff", get_kl_diff(prev_params,new_params))
        torch_utils.set_flat_params_to(policy_net, new_params)
        logger.log("old_parameters" + str(prev_params.detach().numpy()))
        logger.log("new_parameters" + str(new_params.detach().numpy()))
        return success
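conjugate_gradients is called in step above but is not part of these snippets. The following is only a minimal sketch of the standard conjugate gradient solver used in TRPO implementations (an assumed helper, not the original code); it solves A x = b given nothing but the matrix-vector product A v:

def conjugate_gradients(Avp, b, nsteps, residual_tol=1e-10):
    # iteratively solve A x = b, where A is given implicitly through the
    # matrix-vector product Avp (e.g. Fvp_fim or Fvp_direct above)
    x = torch.zeros(b.size())
    r = b.clone()  # residual b - A x for the initial guess x = 0
    p = b.clone()  # search direction
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        Avp_p = Avp(p)
        alpha = rdotr / torch.dot(p, Avp_p)
        x += alpha * p
        r -= alpha * Avp_p
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x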