Example #1
def policyCrossEval(log_dir,
                    task,
                    episode,
                    model_path,
                    num_timesteps=2000,
                    num_cpu=1,
                    seed=0):
    train_args, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(
        log_dir)
    env_kwargs = EnvsKwargs(task, env_kwargs)

    OK = True
    if not OK:
        # no latest model saved yet
        return None, False
    printGreen(
        "Evaluation from the model saved at: {}, with evaluation time steps: {}"
        .format(model_path, num_timesteps))

    log_dir, environment, algo_args = createEnv(log_dir,
                                                train_args,
                                                algo_name,
                                                algo_class,
                                                env_kwargs,
                                                num_cpu=num_cpu,
                                                seed=seed)

    reward = policyEval(environment, model_path, log_dir, algo_class,
                        algo_args, num_timesteps, num_cpu)

    # Prepend the episode number to the reward array so it is saved alongside the rewards (uses one extra slot)
    reward = np.append(episode, reward)
    return reward, True
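A minimal usage sketch of the function above, assuming loadConfigAndSetup, createEnv and policyEval come from the same module; the log directory, task label and checkpoint path below are hypothetical:

reward, ok = policyCrossEval(log_dir="logs/OmnirobotEnv-v0/ground_truth/ppo2/run_1/",
                             task="-cc",
                             episode=1000,
                             model_path="logs/OmnirobotEnv-v0/ground_truth/ppo2/run_1/ppo2_1000_model.pkl",
                             num_timesteps=2000)
if ok:
    # reward[0] is the episode number, the remaining entries are the evaluated rewards
    print(reward)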
Example #2
    def run(self):
        for step in reversed(range(self.max_steps + 1)):
            max_n_param_sampled = int(
                math.ceil(self.budget / self.max_iter * self.eta**step /
                          (step + 1)))
            max_iters = self.max_iter * self.eta**(-step)

            all_parameters = np.array(
                [self.param_sampler() for _ in range(max_n_param_sampled)])
            for i in range(step + 1):
                printGreen("\npop_itt:{}/{}, itt:{}/{}, pop_size:{}".format(
                    self.max_steps - step, self.max_steps + 1, i, step + 1,
                    len(all_parameters)))
                n_param_sampled = int(
                    math.floor(max_n_param_sampled * self.eta**(-i)))
                num_iters = max_iters * self.eta**i
                losses = [
                    self.train(params, num_iters, train_id)
                    for train_id, params in enumerate(all_parameters)
                ]

                self.history.extend(
                    zip([(params, num_iters) for params in all_parameters],
                        losses))
                all_parameters = all_parameters[np.argsort(
                    losses)[:int(math.floor(n_param_sampled / self.eta))]]

        return self.history[int(np.argmin([val[1] for val in self.history]))]
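For intuition, with illustrative values not taken from the repository (eta = 3, max_iter = 81, budget = 405, max_steps = 4), the outermost bracket (step = 4) samples ceil(405 / 81 * 3**4 / 5) = 81 configurations and gives each max_iters = 81 * 3**-4 = 1 iteration; the inner loop then keeps the best floor(81 / 3) = 27 configurations, then 9, 3 and finally 1, while multiplying the per-configuration budget by 3 at every rung. This is the usual Hyperband successive-halving schedule, driven here by param_sampler and self.train.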
Example #3
File: train.py, Project: s206283/gcrl
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
    global win, win_smooth, win_episodes, n_steps, viz, params_saved, best_mean_reward
    # Create the Visdom object only if needed
    if viz is None:
        viz = Visdom(port=VISDOM_PORT)

    is_es = registered_rl[ALGO_NAME][1] == AlgoType.EVOLUTION_STRATEGIES

    # Save RL agent parameters
    if not params_saved:
        # Filter locals
        params = filterJSONSerializableObjects(_locals)
        with open(LOG_DIR + "rl_locals.json", "w") as f:
            json.dump(params, f)
        params_saved = True

    # Save the RL model if it has improved
    if (n_steps + 1) % SAVE_INTERVAL == 0:
        # Evaluate network performance
        ok, mean_reward = computeMeanReward(LOG_DIR, N_EPISODES_EVAL, is_es=is_es, return_n_episodes=True)
        if ok:
            # Unpack mean reward and number of episodes
            mean_reward, n_episodes = mean_reward
            print(
                "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))
        else:
            # Not enough episodes
            mean_reward = -10000
            n_episodes = 0

        # Save Best model
        if mean_reward > best_mean_reward and n_episodes >= MIN_EPISODES_BEFORE_SAVE:
            # Try saving the running average (only valid for mlp policy)
            try:
                if 'env' in _locals:
                    _locals['env'].save_running_average(LOG_DIR)
                else:
                    _locals['self'].env.save_running_average(LOG_DIR)
            except AttributeError:
                pass

            best_mean_reward = mean_reward
            printGreen("Saving new best model")
            ALGO.save(LOG_DIR + ALGO_NAME + "_model.pkl", _locals)

    # Plots in visdom
    if viz and (n_steps + 1) % LOG_INTERVAL == 0:
        win = timestepsPlot(viz, win, LOG_DIR, ENV_NAME, ALGO_NAME, bin_size=1, smooth=0, title=PLOT_TITLE, is_es=is_es)
        win_smooth = timestepsPlot(viz, win_smooth, LOG_DIR, ENV_NAME, ALGO_NAME, title=PLOT_TITLE + " smoothed",
                                   is_es=is_es)
        win_episodes = episodePlot(viz, win_episodes, LOG_DIR, ENV_NAME, ALGO_NAME, window=EPISODE_WINDOW,
                                   title=PLOT_TITLE + " [Episodes]", is_es=is_es)
    n_steps += 1
    return True
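For context, this callback follows the stable-baselines 2.x convention: it receives the training loop's local and global variables, and returning False would stop training early (here it always returns True). A hedged sketch of how such a callback is usually attached, with an illustrative model rather than the wrapper classes this repository actually uses:

from stable_baselines import PPO2

# env is assumed to be an already-created (vectorized) environment
model = PPO2('MlpPolicy', env)
model.learn(total_timesteps=int(1e6), callback=callback)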
Example #4
def loadRunningAverage(envs, load_path_normalise=None):
    if load_path_normalise is not None:
        try:
            printGreen("Loading saved running average")
            envs.load_running_average(load_path_normalise)
            envs.training = False
        except FileNotFoundError:
            envs.training = True
            printYellow("Running Average files not found for VecNormalize, switching to training mode")
    return envs
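A hedged usage sketch: envs must be wrapped in VecNormalize so that load_running_average and the training flag exist; the CartPole environment and the path are stand-ins, not the environments this repository registers:

import gym
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

envs = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
envs = loadRunningAverage(envs, load_path_normalise="logs/my_run/")  # hypothetical path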
Example #5
    def load_weight(self):
        """
        New function that copies the values and the structure of self.params
        :return:
        """
        # Creation of a new attribute on the PPO2 class

        pretrained_weight = [self.sess.run(var) for var in self.params]
        printGreen("Pretrained weight loaded")
        return pretrained_weight
def loadConfigAndSetup(load_args):
    """
    Get the training config and setup the parameters
    :param load_args: (Arguments)
    :return: (dict, str, str, str, dict)
    """
    algo_name = ""
    for algo in list(registered_rl.keys()):
        if algo in load_args.log_dir:
            algo_name = algo
            break
    algo_class, algo_type, _ = registered_rl[algo_name]
    if algo_type == AlgoType.OTHER:
        raise ValueError(algo_name + " is not supported for replay")
    printGreen("\n" + algo_name + "\n")

    load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name)

    env_globals = json.load(open(load_args.log_dir + "env_globals.json", 'r'))
    train_args = json.load(open(load_args.log_dir + "args.json", 'r'))

    env_kwargs = {
        "renders": load_args.render,
        "shape_reward": load_args.shape_reward,  # Reward sparse or shaped
        "action_joints": train_args["action_joints"],
        "is_discrete": not train_args["continuous_actions"],
        "random_target": train_args.get('random_target', False),
        "srl_model": train_args["srl_model"]
    }

    # load it, if it was defined
    if "action_repeat" in env_globals:
        env_kwargs["action_repeat"] = env_globals['action_repeat']

    # Remove up action
    if train_args["env"] == "Kuka2ButtonGymEnv-v0":
        env_kwargs["force_down"] = env_globals.get('force_down', True)
    else:
        env_kwargs["force_down"] = env_globals.get('force_down', False)

    srl_model_path = None
    if train_args["srl_model"] != "raw_pixels":
        train_args["policy"] = "mlp"
        path = env_globals.get('srl_model_path')

        if path is not None:
            env_kwargs["use_srl"] = True
            # Check that the srl saved model exists on the disk
            assert os.path.isfile(
                env_globals['srl_model_path']), "{} does not exist".format(
                    env_globals['srl_model_path'])
            srl_model_path = env_globals['srl_model_path']
            env_kwargs["srl_model_path"] = srl_model_path

    return train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs
Example #7
def comparePlots(path,
                 algo,
                 y_limits,
                 title="Learning Curve",
                 timesteps=False,
                 truncate_x=-1,
                 no_display=False,
                 normalization=False):
    """
    :param path: (str) path to the folder where the plots are stored
    :param algo: (str) name of the RL algorithm (used as the subfolder name inside each experiment folder)
    :param y_limits: ([float]) y-limits for the plot
    :param title: (str) plot title
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param truncate_x: (int) Truncate the experiments after n ticks on the x-axis
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    """

    folders = []
    other = []
    legends = []
    for folder in os.listdir(path):
        folders_srl = []
        other_srl = []
        tmp_path = "{}/{}/{}/".format(path, folder, algo)
        legends.append(folder)
        for f in os.listdir(tmp_path):
            paths = "{}/{}/{}/{}/".format(path, folder, algo, f)
            env_globals = json.load(open(paths + "env_globals.json", 'r'))
            train_args = json.load(open(paths + "args.json", 'r'))
            if train_args["shape_reward"] == args.shape_reward:
                folders_srl.append(paths)
            else:
                other_srl.append(paths)
        folders.append(folders_srl)
        other.append(other_srl)

    x_list, y_list = [], []
    for folders_srl in folders:
        printGreen("Folder name {}".format(folders_srl))
        x, y = GatherExperiments(folders_srl,
                                 algo,
                                 window=40,
                                 title=title,
                                 min_num_x=-1,
                                 timesteps=timesteps,
                                 output_file="")
        print(len(x))
        x_list.append(x)
        y_list.append(y)
    printGreen(np.array(x_list).shape)
    # printGreen('y_list shape {}'.format(np.array(y_list[1]).shape))

    plotGatheredData(x_list, y_list, y_limits, timesteps, title, legends,
                     no_display, truncate_x, normalization)
Example #8
def plotGatheredData(x_list, y_list, y_limits, timesteps, title, legends,
                     no_display, truncate_x=-1, normalization=False):
    assert len(legends) == len(y_list)
    printGreen("{} Experiments".format(len(y_list)))

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)
    if truncate_x > 0:
        min_x = min(truncate_x, min_x)
    x = np.array(x_list[0][:min_x])
    # Truncate every experiment to the common min_x length
    for i in range(len(y_list)):
        y_list[i] = y_list[i][:, :min_x]
    y_list = np.array(y_list)

    #print("Min, Max rewards:", np.min(y_list), np.max(y_list))


    # Normalize the data between 0 and 1.
    if normalization:
        y_limits = [-0.05, 1.05]
        y_list = (y_list - np.min(y_list)) / (np.max(y_list) - np.min(y_list))

    fig = plt.figure(title)
    for i in range(len(y_list)):
        label = legends[i]
        y = y_list[i][:, :min_x]

        print('{}: {} experiments'.format(label, len(y)))
        # Compute mean for different seeds
        m = np.mean(y, axis=0)
        # Compute standard error
        s = np.squeeze(np.asarray(np.std(y, axis=0)))
        n = y.shape[0]
        plt.fill_between(x, m - s / np.sqrt(n), m + s / np.sqrt(n), color=lightcolors[i % len(lightcolors)], alpha=0.5)
        plt.plot(x, m, color=darkcolors[i % len(darkcolors)], label=label, linewidth=2)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')
    if normalization:
        plt.ylabel('Normalized Rewards')
    else:
        plt.ylabel('Rewards')
    plt.title(title, **fontstyle)
    plt.ylim(y_limits)

    plt.legend(framealpha=0.8, frameon=True, labelspacing=0.01, loc='lower right', fontsize=16)

    if not no_display:
        plt.show()
    def _train(params, num_iters=None, train_id=None):
        # generate a print string
        print_str = "\nID_num={}, "
        format_args = []
        if train_id is None:
            if not hasattr(_train, "current_id"):
                _train.current_id = 0
            train_id = _train.current_id
            _train.current_id += 1
        format_args.append(train_id)
        if num_iters is not None:
            print_str += "Num-timesteps={}, "
            format_args.append(int(max(MIN_ITERATION, num_iters * ITERATION_SCALE)))

        print_str += "Param:"
        printGreen(print_str.format(*format_args))
        pprint.pprint(params)

        # cleanup old files
        if os.path.exists(args.log_dir):
            shutil.rmtree(args.log_dir)

        # add the training args that were parsed for the hyperparam optimizers
        if num_iters is not None:
            loop_args = ['--num-timesteps', str(int(max(MIN_ITERATION, num_iters * ITERATION_SCALE)))]
        else:
            loop_args = ['--num-timesteps', str(int(args.num_timesteps))]

        # redefine the hyperparam args for rl_baselines.train
        if len(params) > 0:
            loop_args.append("--hyperparam")
            for param_name, param_val in params.items():
                loop_args.append("{}:{}".format(param_name, param_val))

        # call the training
        ok = subprocess.call(['python', '-m', 'rl_baselines.train'] + train_args + loop_args, stdout=stdout)
        if ok != 0:
            # throw the error down to the terminal
            raise ChildProcessError("An error occurred, error code: {}".format(ok))

        # load the logging of the training, and extract the reward
        folders = glob.glob("{}/{}/{}/{}/*".format(args.log_dir, args.env, args.srl_model, args.algo))
        assert len(folders) != 0, "Error: Could not find generated directory, halting {} search.".format(args.optimizer)
        rewards = []
        for monitor_path in glob.glob(folders[0] + "/*.monitor.csv"):
            rewards.append(np.mean(pd.read_csv(monitor_path, skiprows=1)["r"][-10:]))
        if np.isnan(rewards).any():
            rewards = -np.inf
        print("reward: ", np.mean(rewards))

        # negative reward, as we are minimizing with hyperparameter search
        return -np.mean(rewards)
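The *.monitor.csv files read above are the episode logs written by the Monitor wrapper from baselines/stable-baselines: a JSON info line followed by CSV columns r (episode reward), l (episode length) and t (time), which is why skiprows=1 is used. A small hedged illustration with a hypothetical path:

monitor_path = "logs/KukaButtonGymEnv-v0/raw_pixels/ppo2/exp_0/0.monitor.csv"  # hypothetical
last_rewards = pd.read_csv(monitor_path, skiprows=1)["r"][-10:]
print(np.mean(last_rewards))  # mean reward over the last 10 episodes, as in the loop above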
Example #10
def policyCrossEval(log_dir,
                    task,
                    episode,
                    model_path,
                    num_timesteps=2000,
                    num_cpu=1):
    """
    To do a cross evaluation for a certain policy for different tasks
    A version of real-time evaluation, but with some remaining bugs to fix
    :param log_dir:
    :param task:
    :param episode:
    :param model_path:
    :param num_timesteps: How many timesteps to evaluate the policy
    :param num_cpu:
    :return:
    """
    train_args, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(
        log_dir)
    env_kwargs = EnvsKwargs(task, env_kwargs)

    OK = True
    if not OK:
        # no latest model saved yet
        return None, False
    printGreen(
        "Evaluation from the model saved at: {}, with evaluation time steps: {}"
        .format(model_path, num_timesteps))

    log_dir, environment, algo_args = createEnv(log_dir,
                                                train_args,
                                                algo_name,
                                                algo_class,
                                                env_kwargs,
                                                num_cpu=num_cpu)

    reward = policyEval(environment, model_path, log_dir, algo_class,
                        algo_args, num_timesteps, num_cpu)

    # Prepend the episode number to the reward array so it is saved alongside the rewards (uses one extra slot)
    reward = np.append(episode, reward)
    return reward, True
def makeTable(input_dir, 
            rl_algo_name="ppo2", 
            checkpoints=[1e6, 2*1e6, 3*1e6, 4*1e6, 5*1e6], 
            episode_len=100, 
            caption="my-caption",
            filepath=None):
    """

    ---------- Latex table example -----------
    \begin{table}[h!]
    \centering
    \begin{tabular}{c|ccc} % c: center, l: left
    \hline
    0 & 0 & 0 & 1 \\ \hline
    1 & 1 & 1 & 2 \\
    2 & 2 & 2 & 3 \\ \hline
    \end{tabular}
    \caption{}
    \label{tab:my-table}
    \end{table}
    -----------------------------------------
    """
    ext = filepath.split(".")[-1]
    assert ext in ["tex", "md"], "Only support Latex (tex) or Markdown (md) extension"
    if ext == "md":
        raise NotImplementedError
    srl_algo_dirs = glob.glob(os.path.join(input_dir, "*"))  # list of subfolders in input_dir
    table = defaultdict(lambda: [])
    for folder in srl_algo_dirs:
        srl_name = folder.split("/")[-1]
        srl_algo_exps = glob.glob(os.path.join(os.path.join(folder, rl_algo_name, "*")))
        printGreen("Found srl model: {} with {} experiments.".format(srl_name.ljust(20), len(srl_algo_exps)))
        for exp_dir in srl_algo_exps:
            _, rewards_history, total_timesteps = loadEpisodesData(exp_dir)
            mean_rwd = meanEpisodesReward(rewards_history, total_timesteps, checkpoints=checkpoints, episode_len=episode_len)
            table[srl_name].append(mean_rwd)

    with open(filepath, "w") as file:
        file.writelines("\\begin{table}[h!]\n")
        file.writelines("\\centering\n")
        file.writelines("\\begin{tabular}{c|"+len(checkpoints)*"c"+"}\n")
        file.writelines("\\hline\n")
        x_axis = timesteps2str(checkpoints)
        file.writelines("   & {} \\\\ \\hline \n".format(" & ".join(x_axis)))
        srl_names_list = sorted(list(table.keys()))
        if "ground_truth" in srl_names_list:
            # put ground_truth at the top of the table
            srl_names_list.remove("ground_truth")
            srl_names_list.insert(0, "ground_truth") 
        for ind, srl_name in enumerate(srl_names_list):
            res = results2str_latex(table[srl_name])
            if ind == len(table) - 1 or srl_name == "ground_truth":
                file.writelines("{} & {} \\\\ \\hline \n".format(processStrLatex(srl_name), " & ".join(res)))
            else:
                file.writelines("{} & {} \\\\ \n".format(processStrLatex(srl_name), " & ".join(res)))

        file.writelines("\\end{tabular}\n")
        file.writelines("\\caption{{{}}}\n".format(caption))
        file.writelines("\\end{table}\n")

        

    return table
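A hedged usage sketch of makeTable; the input directory is hypothetical and is expected to contain one subfolder per SRL model, each with a ppo2/ subfolder holding the experiment runs:

table = makeTable("logs/OmnirobotEnv-v0",
                  rl_algo_name="ppo2",
                  checkpoints=[1e6, 2 * 1e6, 3 * 1e6],
                  episode_len=100,
                  caption="Mean reward per checkpoint",
                  filepath="reward_table.tex")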
Example #12
def comparePlots(path,
                 plots,
                 y_limits,
                 title="Learning Curve",
                 timesteps=False,
                 truncate_x=-1,
                 no_display=False):
    """
    :param path: (str) path to the folder where the plots are stored
    :param plots: ([str]) List of saved plots as npz file
    :param y_limits: ([float]) y-limits for the plot
    :param title: (str) plot title
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param truncate_x: (int) Truncate the experiments after n ticks on the x-axis
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    """
    y_list = []
    x_list = []
    for plot in plots:
        saved_plot = np.load('{}/{}'.format(path, plot))
        x_list.append(saved_plot['x'])
        y_list.append(saved_plot['y'])

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)

    print("Min x: {}".format(min_x))
    print("Max x: {}".format(max_x))

    if truncate_x > 0:
        min_x = min(truncate_x, min_x)
    print("Truncating the x-axis at {}".format(min_x))

    x = np.array(x_list[0][:min_x])

    printGreen("{} Experiments".format(len(y_list)))
    # print("Min, Max rewards:", np.min(y), np.max(y))

    fig = plt.figure(title)
    for i in range(len(y_list)):
        label = plots[i].split('.npz')[0]
        y = y_list[i][:, :min_x]
        print('{}: {} experiments'.format(label, len(y)))
        # Compute mean for different seeds
        m = np.mean(y, axis=0)
        # Compute standard error
        s = np.squeeze(np.asarray(np.std(y, axis=0)))
        n = y.shape[0]
        plt.fill_between(x,
                         m - s / np.sqrt(n),
                         m + s / np.sqrt(n),
                         color=lightcolors[i % len(lightcolors)],
                         alpha=0.5)
        plt.plot(x,
                 m,
                 color=darkcolors[i % len(darkcolors)],
                 label=label,
                 linewidth=2)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps', fontsize=20, fontweight='bold')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')
    plt.ylabel('Rewards', fontsize=20, fontweight='bold')

    plt.title(title, **fontstyle)
    plt.ylim(y_limits)

    plt.legend(framealpha=0.8,
               frameon=True,
               labelspacing=0.01,
               loc='lower right',
               fontsize=18)

    if not no_display:
        plt.show()
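A hedged usage sketch of this variant of comparePlots, which reads .npz files containing 'x' and 'y' arrays as shown above; the file names are hypothetical:

comparePlots(path="plots",
             plots=["ground_truth.npz", "raw_pixels.npz"],
             y_limits=[-1, 250],
             title="Learning Curve",
             timesteps=True)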
Example #13
        index_to_begin = episodes.astype(int).tolist().index(max_eps) + 1

    else:
        task_labels = ['cc', 'sc']
        rewards = {}
        rewards['episode'] = []
        rewards['policy'] = []
        for t in ['cc', 'sc']:
            rewards[t] = []

    for policy_path in policy_paths[index_to_begin:]:
        copyfile(log_dir + '/args.json', policy_path + '/args.json')
        copyfile(log_dir + '/env_globals.json',
                 policy_path + '/env_globals.json')

    printGreen("The evaluation will begin from {}".format(
        episodes[index_to_begin]))

    last_mean = [250., 1900.]
    run_mean = [0, 0]

    for k in range(index_to_begin, len(episodes), interval_len):
        # if(interval_len > 1 and int(episodes[k])>=episode_schedule):
        #     k += interval_len-1
        printGreen("Evaluation for episode: {}".format(episodes[k]))
        increase_interval = True

        model_path = policy_paths[k]

        for t, task_label in enumerate(["-sc", "-cc"]):

            local_reward = [int(episodes[k])]
Example #14
def loadConfigAndSetup(load_args):
    """
    Get the training config and setup the parameters
    :param load_args: (Arguments)
    :return: (dict, str, str, str, dict)
    """
    algo_name = ""
    for algo in list(registered_rl.keys()):
        if algo in load_args.log_dir:
            algo_name = algo
            break
    algo_class, algo_type, _ = registered_rl[algo_name]
    if algo_type == AlgoType.OTHER:
        raise ValueError(algo_name + " is not supported for replay")
    printGreen("\n" + algo_name + "\n")

    try:  # If args contains episode information, this is for student_evaluation (distillation)
        if load_args.episode != -1:
            load_path = "{}/{}_{}_model.pkl".format(load_args.log_dir, algo_name, load_args.episode,)
        else:
            load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name)
    except AttributeError:
        printYellow(
            "No episode or checkpoint specified, falling back to the default policy model: {}_model.pkl".format(algo_name))
        if load_args.log_dir[-3:] != 'pkl':
            load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name)
        else:
            load_path = load_args.log_dir
            load_args.log_dir = os.path.dirname(load_path)+'/'

    env_globals = json.load(open(load_args.log_dir + "env_globals.json", 'r'))
    train_args = json.load(open(load_args.log_dir + "args.json", 'r'))

    env_kwargs = {
        "renders": load_args.render,
        "shape_reward": load_args.shape_reward,  # Reward sparse or shaped
        "action_joints": train_args["action_joints"],
        "is_discrete": not train_args["continuous_actions"],
        "random_target": train_args.get('random_target', False),
        "srl_model": train_args["srl_model"]
    }

    # load it, if it was defined
    if "action_repeat" in env_globals:
        env_kwargs["action_repeat"] = env_globals['action_repeat']

    # Remove up action
    if train_args["env"] == "Kuka2ButtonGymEnv-v0":
        env_kwargs["force_down"] = env_globals.get('force_down', True)
    else:
        env_kwargs["force_down"] = env_globals.get('force_down', False)

    if train_args["env"] == "OmnirobotEnv-v0":
        env_kwargs["simple_continual_target"] = env_globals.get("simple_continual_target", False)
        env_kwargs["circular_continual_move"] = env_globals.get("circular_continual_move", False)
        env_kwargs["square_continual_move"] = env_globals.get("square_continual_move", False)
        env_kwargs["eight_continual_move"] = env_globals.get("eight_continual_move", False)

        # If overriding the environment for specific Continual Learning tasks
        if sum([load_args.simple_continual, load_args.circular_continual, load_args.square_continual]) >= 1:
            env_kwargs["simple_continual_target"] = load_args.simple_continual
            env_kwargs["circular_continual_move"] = load_args.circular_continual
            env_kwargs["square_continual_move"] = load_args.square_continual
            env_kwargs["random_target"] = not (load_args.circular_continual or load_args.square_continual)

    srl_model_path = None
    if train_args["srl_model"] != "raw_pixels":
        train_args["policy"] = "mlp"
        path = env_globals.get('srl_model_path')

        if path is not None:
            env_kwargs["use_srl"] = True
            # Check that the srl saved model exists on the disk
            assert os.path.isfile(env_globals['srl_model_path']), \
                "{} does not exist".format(env_globals['srl_model_path'])
            srl_model_path = env_globals['srl_model_path']
            env_kwargs["srl_model_path"] = srl_model_path

    return train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs
Example #15
def main():
    # Global variables for callback
    global ENV_NAME, ALGO, ALGO_NAME, LOG_INTERVAL, VISDOM_PORT, viz
    global SAVE_INTERVAL, EPISODE_WINDOW, MIN_EPISODES_BEFORE_SAVE
    parser = argparse.ArgumentParser(
        description="Train script for RL algorithms")
    parser.add_argument('--algo',
                        default='ppo2',
                        choices=list(registered_rl.keys()),
                        help='RL algo to use',
                        type=str)
    parser.add_argument('--env',
                        type=str,
                        help='environment ID',
                        default='KukaButtonGymEnv-v0',
                        choices=list(registered_env.keys()))
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='random seed (default: 0)')
    parser.add_argument(
        '--episode-window',
        type=int,
        default=40,
        help='Episode window for moving average plot (default: 40)')
    parser.add_argument(
        '--log-dir',
        default='/tmp/gym/',
        type=str,
        help='directory to save agent logs and model (default: /tmp/gym)')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--srl-model',
                        type=str,
                        default='raw_pixels',
                        choices=list(registered_srl.keys()),
                        help='SRL model to use')
    parser.add_argument('--num-stack',
                        type=int,
                        default=1,
                        help='number of frames to stack (default: 1)')
    parser.add_argument(
        '--action-repeat',
        type=int,
        default=1,
        help='number of times an action will be repeated (default: 1)')
    parser.add_argument('--port',
                        type=int,
                        default=8097,
                        help='visdom server port (default: 8097)')
    parser.add_argument('--no-vis',
                        action='store_true',
                        default=False,
                        help='disables visdom visualization')
    parser.add_argument(
        '--shape-reward',
        action='store_true',
        default=False,
        help='Shape the reward (reward = - distance) instead of a sparse reward'
    )
    parser.add_argument('-c',
                        '--continuous-actions',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-joints',
        '--action-joints',
        action='store_true',
        default=False,
        help=
        'set actions to the joints of the arm directly, instead of inverse kinematics'
    )
    parser.add_argument('-r',
                        '--random-target',
                        action='store_true',
                        default=False,
                        help='Set the button to a random position')
    parser.add_argument(
        '--srl-config-file',
        type=str,
        default="config/srl_models.yaml",
        help='Set the location of the SRL model path configuration.')
    parser.add_argument('--hyperparam', type=str, nargs='+', default=[])
    parser.add_argument('--min-episodes-save',
                        type=int,
                        default=100,
                        help="Min number of episodes before saving best model")
    parser.add_argument(
        '--latest',
        action='store_true',
        default=False,
        help=
        'load the latest learned model (location:srl_zoo/logs/DatasetName/)')
    parser.add_argument(
        '--load-rl-model-path',
        type=str,
        default=None,
        help="load the trained RL model, should be with the same algorithm type"
    )
    parser.add_argument(
        '-sc',
        '--simple-continual',
        action='store_true',
        default=False,
        help=
        'Simple red square target for task 1 of continual learning scenario. '
        + 'The task is: robot should reach the target.')
    parser.add_argument(
        '-cc',
        '--circular-continual',
        action='store_true',
        default=False,
        help='Blue square target for task 2 of continual learning scenario. ' +
        'The task is: robot should turn in circle around the target.')
    parser.add_argument(
        '-sqc',
        '--square-continual',
        action='store_true',
        default=False,
        help='Green square target for task 3 of continual learning scenario. '
        + 'The task is: robot should turn in square around the target.')
    parser.add_argument(
        '-ec',
        '--eight-continual',
        action='store_true',
        default=False,
        help='Green square target for task 4 of continual learning scenario. '
        +
        'The task is: robot should trace a figure eight around the target.'
    )
    parser.add_argument('--teacher-data-folder',
                        type=str,
                        default="",
                        help='Dataset folder of the teacher policy (or policies)',
                        required=False)
    parser.add_argument(
        '--epochs-distillation',
        type=int,
        default=30,
        metavar='N',
        help='number of epochs to train for distillation (default: 30)')
    parser.add_argument(
        '--distillation-training-set-size',
        type=int,
        default=-1,
        help='Limit size (number of samples) of the training set (default: -1)'
    )
    parser.add_argument(
        '--perform-cross-evaluation-cc',
        action='store_true',
        default=False,
        help='A cross evaluation from the latest stored model to all tasks')
    parser.add_argument(
        '--eval-episode-window',
        type=int,
        default=400,
        metavar='N',
        help=
        'Episode window for saving each policy checkpoint for future distillation (default: 400)'
    )
    parser.add_argument(
        '--new-lr',
        type=float,
        default=1.e-4,
        help="New learning rate ratio to train a pretrained agent")
    parser.add_argument('--img-shape',
                        type=str,
                        default="(3,64,64)",
                        help="Image shape of environment.")
    parser.add_argument(
        "--gpu-num",
        help="Choose the number of GPU (CUDA_VISIBLE_DEVICES).",
        type=str,
        default="1",
        choices=["0", "1", "2", "3", "5", "6", "7", "8"])
    parser.add_argument("--srl-model-path",
                        help="SRL model weights path",
                        type=str,
                        default=None)
    parser.add_argument(
        "--relative-pos",
        action='store_true',
        default=False,
        help="For 'ground_truth': use relative position or not.")
    # Ignore unknown args for now
    args, unknown = parser.parse_known_args()
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num
    env_kwargs = {}
    if args.img_shape is None:
        img_shape = None  #(3,224,224)
    else:
        img_shape = tuple(map(int, args.img_shape[1:-1].split(",")))
    env_kwargs['img_shape'] = img_shape
    # LOAD SRL models list
    assert os.path.exists(args.srl_config_file), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file)
    with open(args.srl_config_file, 'rb') as f:
        all_models = yaml.load(f)
    # Sanity check
    assert args.episode_window >= 1, "Error: --episode_window cannot be less than 1"
    assert args.num_timesteps >= 1, "Error: --num-timesteps cannot be less than 1"
    assert args.num_stack >= 1, "Error: --num-stack cannot be less than 1"
    assert args.action_repeat >= 1, "Error: --action-repeat cannot be less than 1"
    assert 0 <= args.port < 65535, "Error: invalid visdom port number {}, ".format(args.port) + \
                                   "port number must be an unsigned 16bit number [0,65535]."
    assert registered_srl[args.srl_model][0] == SRLType.ENVIRONMENT or args.env in all_models, \
        "Error: the environment {} has no srl_model defined in 'srl_models.yaml'. Cannot continue.".format(args.env)
    # check that all the SRL_model can be run on the environment
    if registered_srl[args.srl_model][1] is not None:
        found = False
        for compatible_class in registered_srl[args.srl_model][1]:
            if issubclass(compatible_class, registered_env[args.env][0]):
                found = True
                break
        assert found, "Error: srl_model {}, is not compatible with the {} environment.".format(
            args.srl_model, args.env)

    assert not(sum([args.simple_continual, args.circular_continual, args.square_continual, args.eight_continual]) \
           > 1 and args.env == "OmnirobotEnv-v0"), \
        "For continual SRL and RL, please provide only one scenario at the time and use OmnirobotEnv-v0 environment !"

    assert not(args.algo == "distillation" and (args.teacher_data_folder == '' or args.continuous_actions is True)), \
        "For performing policy distillation, make sure use specify a valid teacher dataset and discrete actions !"

    ENV_NAME = args.env
    ALGO_NAME = args.algo
    VISDOM_PORT = args.port
    EPISODE_WINDOW = args.episode_window
    MIN_EPISODES_BEFORE_SAVE = args.min_episodes_save
    CROSS_EVAL = args.perform_cross_evaluation_cc
    EPISODE_WINDOW_DISTILLATION_WIN = args.eval_episode_window
    NEW_LR = args.new_lr
    print("EPISODE_WINDOW_DISTILLATION_WIN: ", EPISODE_WINDOW_DISTILLATION_WIN)

    if args.no_vis:
        viz = False

    algo_class, algo_type, action_type = registered_rl[args.algo]
    algo = algo_class()
    ALGO = algo

    # if callback frequency needs to be changed
    LOG_INTERVAL = algo.LOG_INTERVAL
    SAVE_INTERVAL = algo.SAVE_INTERVAL

    if not args.continuous_actions and ActionType.DISCRETE not in action_type:
        raise ValueError(
            args.algo +
            " does not support discrete actions, please use the '--continuous-actions' "
            + "(or '-c') flag.")
    if args.continuous_actions and ActionType.CONTINUOUS not in action_type:
        raise ValueError(
            args.algo +
            " does not support continuous actions, please remove the '--continuous-actions' "
            + "(or '-c') flag.")

    env_kwargs["is_discrete"] = not args.continuous_actions

    printGreen("\nAgent = {} \n".format(args.algo))

    env_kwargs["action_repeat"] = args.action_repeat
    # Random init position for button
    env_kwargs["random_target"] = args.random_target

    # If in simple continual scenario, then the target should be initialized randomly.
    if args.simple_continual is True:
        env_kwargs["random_target"] = True

    # Allow up action
    # env_kwargs["force_down"] = False

    # allow multi-view
    env_kwargs['multi_view'] = args.srl_model == "multi_view_srl"
    parser = algo.customArguments(parser)
    args = parser.parse_args()

    args, env_kwargs = configureEnvAndLogFolder(args, env_kwargs, all_models)
    args_dict = filterJSONSerializableObjects(vars(args))
    # Save args
    with open(LOG_DIR + "args.json", "w") as f:
        json.dump(args_dict, f)

    env_class = registered_env[args.env][0]
    # env default kwargs
    default_env_kwargs = {
        k: v.default
        for k, v in inspect.signature(env_class.__init__).parameters.items()
        if v is not None
    }

    globals_env_param = sys.modules[env_class.__module__].getGlobals()
    ### HACK way to reset image shape !!
    globals_env_param['RENDER_HEIGHT'] = img_shape[1]
    globals_env_param['RENDER_WIDTH'] = img_shape[2]
    globals_env_param['RELATIVE_POS'] = args.relative_pos

    super_class = registered_env[args.env][1]
    # recursive search through all the super classes of the requested environment, in order to get all the arguments.
    rec_super_class_lookup = {
        dict_class: dict_super_class
        for _, (dict_class, dict_super_class, _, _) in registered_env.items()
    }
    while super_class != SRLGymEnv:
        assert super_class in rec_super_class_lookup, "Error: could not find super class of {}".format(super_class) + \
                                                      ", are you sure \"registered_env\" is correctly defined?"
        super_env_kwargs = {
            k: v.default
            for k, v in inspect.signature(
                super_class.__init__).parameters.items() if v is not None
        }
        default_env_kwargs = {**super_env_kwargs, **default_env_kwargs}

        globals_env_param = {
            **sys.modules[super_class.__module__].getGlobals(),
            **globals_env_param
        }

        super_class = rec_super_class_lookup[super_class]

    # Print Variables
    printYellow("Arguments:")
    pprint(args_dict)
    printYellow("Env Globals:")
    pprint(
        filterJSONSerializableObjects({
            **globals_env_param,
            **default_env_kwargs,
            **env_kwargs
        }))
    # Save env params
    saveEnvParams(globals_env_param, {**default_env_kwargs, **env_kwargs})
    # Seed tensorflow, python and numpy random generator
    set_global_seeds(args.seed)
    # Augment the number of timesteps (when using multiprocessing this number is not reached)
    args.num_timesteps = int(1.1 * args.num_timesteps)
    # Get the hyperparameter, if given (Hyperband)
    hyperparams = {
        param.split(":")[0]: param.split(":")[1]
        for param in args.hyperparam
    }
    hyperparams = algo.parserHyperParam(hyperparams)

    if args.load_rl_model_path is not None:
        #use a small learning rate
        print("use a small learning rate: {:f}".format(1.0e-4))
        hyperparams["learning_rate"] = lambda f: f * 1.0e-4

    # Train the agent
    if args.load_rl_model_path is not None:
        algo.setLoadPath(args.load_rl_model_path)
    algo.train(args, callback, env_kwargs=env_kwargs, train_kwargs=hyperparams)
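For reference, a hedged example invocation of this training script using only flags defined by the parser above; the environment, SRL model and log directory are illustrative:

python -m rl_baselines.train --algo ppo2 --env OmnirobotEnv-v0 --srl-model ground_truth --num-timesteps 1000000 --log-dir logs/ --no-vis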
Example #16
def main():
    parser = argparse.ArgumentParser(
        description="OpenAI RL Baselines Benchmark",
        epilog=
        'After the arguments are parsed, the rest are assumed to be arguments for'
        + ' rl_baselines.train')
    parser.add_argument('--algo',
                        type=str,
                        default='ppo2',
                        help='OpenAI baseline to use',
                        choices=list(registered_rl.keys()))
    parser.add_argument('--env',
                        type=str,
                        nargs='+',
                        default=["KukaButtonGymEnv-v0"],
                        help='environment ID(s)',
                        choices=list(registered_env.keys()))
    parser.add_argument('--srl-model',
                        type=str,
                        nargs='+',
                        default=["raw_pixels"],
                        help='SRL model(s) to use',
                        choices=list(registered_srl.keys()))
    parser.add_argument('--num-timesteps',
                        type=int,
                        default=1e6,
                        help='number of timesteps the baseline should run')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        default=False,
                        help='Display baseline STDOUT')
    parser.add_argument(
        '--num-iteration',
        type=int,
        default=15,
        help=
        'number of time each algorithm should be run for each unique combination of environment '
        + ' and srl-model.')
    parser.add_argument(
        '--seed',
        type=int,
        default=0,
        help=
        'initial seed for each unique combination of environment and srl-model.'
    )
    parser.add_argument(
        '--srl-config-file',
        type=str,
        default="config/srl_models.yaml",
        help='Set the location of the SRL model path configuration.')

    # returns the parsed arguments, and the rest are assumed to be arguments for rl_baselines.train
    args, train_args = parser.parse_known_args()

    # Sanity check
    assert args.num_timesteps >= 1, "Error: --num-timesteps cannot be less than 1"
    assert args.num_iteration >= 1, "Error: --num-iteration cannot be less than 1"

    # Removing duplicates and sort
    srl_models = list(set(args.srl_model))
    envs = list(set(args.env))
    srl_models.sort()
    envs.sort()

    # LOAD SRL models list
    assert os.path.exists(args.srl_config_file), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file)
    with open(args.srl_config_file, 'rb') as f:
        all_models = yaml.load(f)

    # Checking definition and presence of all requested srl_models
    valid = True
    for env in envs:
        # validated the env definition
        if env not in all_models:
            printRed(
                "Error: 'srl_models.yaml' missing definition for environment {}"
                .format(env))
            valid = False
            continue  # skip to the next env, this one is not valid

        # checking log_folder for current env
        missing_log = "log_folder" not in all_models[env]
        if missing_log:
            printRed(
                "Error: 'srl_models.yaml' missing definition for log_folder in environment {}"
                .format(env))
            valid = False

        # validate each model for the current env definition
        for model in srl_models:
            if registered_srl[model][0] == SRLType.ENVIRONMENT:
                continue  # not an srl model, skip to the next model
            elif model not in all_models[env]:
                printRed(
                    "Error: 'srl_models.yaml' missing srl_model {} for environment {}"
                    .format(model, env))
                valid = False
            elif (not missing_log) and (
                    not os.path.exists(all_models[env]["log_folder"] +
                                       all_models[env][model])):
                # checking presence of srl_model path, if and only if log_folder exists
                printRed(
                    "Error: srl_model {} for environment {} was defined in ".
                    format(model, env) +
                    "'srl_models.yaml', however the file {} it was tagetting does not exist."
                    .format(all_models[env]["log_folder"] +
                            all_models[env][model]))
                valid = False

    assert valid, "Errors occured due to malformed 'srl_models.yaml', cannot continue."

    # check that all the SRL_models can be run on all the environments
    valid = True
    for env in envs:
        for model in srl_models:
            if registered_srl[model][1] is not None:
                found = False
                for compatible_class in registered_srl[model][1]:
                    if issubclass(compatible_class, registered_env[env][0]):
                        found = True
                        break
                if not found:
                    valid = False
                    printRed(
                        "Error: srl_model {}, is not compatible with the {} environment."
                        .format(model, env))
    assert valid, "Errors occured due to an incompatible combination of srl_model and environment, cannot continue."

    # the seeds used in training the baseline.
    seeds = list(np.arange(args.num_iteration) + args.seed)

    if args.verbose:
        # None here means stdout of terminal for subprocess.call
        stdout = None
    else:
        stdout = open(os.devnull, 'w')

    printGreen("\nRunning {} benchmarks {} times...".format(
        args.algo, args.num_iteration))
    print("\nSRL-Models:\t{}".format(srl_models))
    print("environments:\t{}".format(envs))
    print("verbose:\t{}".format(args.verbose))
    print("timesteps:\t{}".format(args.num_timesteps))
    for model in srl_models:
        for env in envs:
            for i in range(args.num_iteration):

                printGreen(
                    "\nIteration_num={} (seed: {}), Environment='{}', SRL-Model='{}'"
                    .format(i, seeds[i], env, model))

                # redefine the parsed args for rl_baselines.train
                loop_args = [
                    '--srl-model', model, '--seed',
                    str(seeds[i]), '--algo', args.algo, '--env', env,
                    '--num-timesteps',
                    str(int(args.num_timesteps)), '--srl-config-file',
                    args.srl_config_file
                ]

                ok = subprocess.call(['python', '-m', 'rl_baselines.train'] +
                                     train_args + loop_args,
                                     stdout=stdout)

                if ok != 0:
                    # throw the error down to the terminal
                    raise ChildProcessError(
                        "An error occured, error code: {}".format(ok))
Example #17
    def compute_fisher(self, num_timesteps, runner):
        """
        To get the diagonal of accumulated fisher information matrix
        :param num_timesteps: timesteps for the sampling
        :param runner:
        :return:
        """

        num_samples = num_timesteps // self.n_batch

        # Creation of a new attribute on the PPO2 class
        self.Fisher_accum = [
            np.zeros_like(var) for var in self.pretrained_weight
        ]

        F_prev = deepcopy(self.Fisher_accum)
        mean_diffs = np.zeros(0)
        for iter in range(1, num_samples + 1):
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run(
            )
            # randomly sample from the action, value and q-value outputs
            step_ind = np.random.randint(self.n_steps)
            action_ind = tf.to_int32(
                tf.random.categorical(tf.log(self.train_model.policy_proba),
                                      1))[:, 0]
            n_action = self.train_model.policy_proba.shape[1]
            action_mask = tf.one_hot(action_ind,
                                     depth=n_action,
                                     dtype=tf.bool,
                                     on_value=True,
                                     off_value=False)
            action_prob = tf.boolean_mask(self.train_model.policy_proba,
                                          action_mask)
            q_value = tf.boolean_mask(self.train_model.q_value, action_mask)
            # compute the accumulated Fisher information
            for v in range(len(self.params)):
                # first-order derivative of the action probability w.r.t. the parameters (weight matrix)
                obs_sample = obs[step_ind:step_ind + 1]
                grad_action, grad_value, grad_q = self.sess.run(
                    [
                        tf.gradients(action_prob,
                                     self.params[v],
                                     unconnected_gradients='zero')[0],
                        tf.gradients(self.train_model._value,
                                     self.params[v],
                                     unconnected_gradients='zero')[0],
                        tf.gradients(q_value,
                                     self.params[v],
                                     unconnected_gradients='zero')[0]
                    ],
                    feed_dict={
                        self.train_model.obs_ph: obs_sample,
                        self.params[v]: self.pretrained_weight[v]
                    })
                """
                Add penalization only on the action space, or do the regularization on all outputs
                """
                #if (len(np.unique(grad_action)) >1):
                self.Fisher_accum[v] += np.square(
                    (grad_action + grad_value + grad_q))
            # Code to monitor convergence
            if (iter % (num_samples // 10) == 0):
                F_diff = 0
                Fisher_total = 0
                for v in range(len(self.Fisher_accum)):
                    F_diff += np.sum(
                        np.absolute(self.Fisher_accum[v] / (iter + 1) -
                                    F_prev[v]))
                    Fisher_total += np.sum(
                        np.absolute(self.Fisher_accum[v] / (iter + 1)))
                mean_diff = np.mean(F_diff)
                mean_diffs = np.append(mean_diffs, mean_diff)
                for v in range(len(self.Fisher_accum)):
                    F_prev[v] = self.Fisher_accum[v] / (iter + 1)
                printGreen(
                    "At iteration: {}, the new added information difference {}, total Fisher value {}"
                    .format(iter, F_diff, Fisher_total))

        printGreen("Fisher information computation complete")
        for v in range(len(self.Fisher_accum)):
            self.Fisher_accum[v] /= (num_samples)
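For context, a diagonal Fisher estimate like the one accumulated above is what Elastic Weight Consolidation (EWC) uses to penalize drift from pretrained weights. Whether this repository applies exactly this penalty is not shown here; the sketch below is the canonical form, with current_weights and lambda_ewc as hypothetical names:

# hedged sketch of the standard EWC penalty, reusing the quantities computed above
def ewcPenalty(fisher_accum, pretrained_weight, current_weights, lambda_ewc=1.0):
    penalty = 0.0
    for fisher, w_old, w_new in zip(fisher_accum, pretrained_weight, current_weights):
        penalty += np.sum(fisher * np.square(w_new - w_old))
    return (lambda_ewc / 2.0) * penalty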
Example #18
def comparePlots(path,
                 algo,
                 y_limits,
                 title="Learning Curve",
                 timesteps=False,
                 truncate_x=-1,
                 no_display=False,
                 normalization=False,
                 figpath=None,
                 exclude_list=None):
    """
    :param path: (str) path to the folder where the plots are stored
    :param algo: (str) name of the RL algorithm (used as the subfolder name inside each experiment folder)
    :param y_limits: ([float]) y-limits for the plot
    :param title: (str) plot title
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param truncate_x: (int) Truncate the experiments after n ticks on the x-axis
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    """
    if exclude_list is None:
        exclude_list = []
    folders = []
    legends = []
    for folder in os.listdir(path):
        folders_srl = []
        tmp_path = "{}/{}/{}/".format(path, folder, algo)
        if os.path.exists(tmp_path) and (
                folder not in exclude_list
        ):  # folder contains algo (e.g. ppo2) subfolder and not in excluded list
            printRed(folder)
            legends.append(folder)
            for f in os.listdir(tmp_path):
                paths = "{}/{}/{}/{}/".format(path, folder, algo, f)
                folders_srl.append(paths)
            folders.append(folders_srl)
        else:
            continue

    x_list, y_list = [], []
    exp_name_dict = {}
    for ind, folders_srl in enumerate(folders):
        printGreen("Folder name {}".format(folders_srl))
        x, y = GatherExperiments(folders_srl,
                                 algo,
                                 window=40,
                                 title=title,
                                 min_num_x=-1,
                                 timesteps=timesteps,
                                 output_file="")
        print(len(x))
        x_list.append(x)
        y_list.append(y)
        ## HACK: the line below is ugly and not robust code !! TODO
        exp_name_dict[ind] = folders_srl[0].split("/")[-4]
    printGreen(np.array(x_list).shape)
    # printGreen('y_list shape {}'.format(np.array(y_list[1]).shape))

    plotGatheredData(x_list,
                     y_list,
                     y_limits,
                     timesteps,
                     title,
                     legends,
                     no_display,
                     truncate_x,
                     normalization,
                     figpath=figpath,
                     exp_name_dict=exp_name_dict)
Example #19
def plotGatheredData(x_list,
                     y_list,
                     y_limits,
                     timesteps,
                     title,
                     legends,
                     no_display,
                     truncate_x=-1,
                     normalization=False,
                     figpath=None,
                     exp_name_dict=None):
    assert len(legends) == len(y_list)
    printGreen("{} Experiments".format(len(y_list)))

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)
    if truncate_x > 0:
        min_x = min(truncate_x, min_x)
    x = np.array(x_list[0][:min_x])
    # Truncate every experiment to the common min_x length
    for i in range(len(y_list)):
        y_list[i] = y_list[i][:, :min_x]
    y_list = np.array(y_list)

    #print("Min, Max rewards:", np.min(y_list), np.max(y_list))

    # Normalize the data between 0 and 1.
    if normalization:
        y_limits = [-0.05, 1.05]
        y_list = (y_list - np.min(y_list)) / (np.max(y_list) - np.min(y_list))

    colormap = plt.cm.tab20.colors
    registered_indexes = [0, 4, 6]
    registered_color = {
        'ground_truth': colormap[4],  # green
        'raw_pixels': colormap[0],  # blue
        'AE_ifr2_spcls_split': colormap[6],  # red
        'supervised': (0.0, 0.0, 0.0)  # black
    }
    # import ipdb; ipdb.set_trace()
    new_colormap = tuple([
        colormap[k] for k in range(len(colormap))
        if k not in registered_indexes
    ])
    fig = plt.figure(title, figsize=(20, 10))
    for i in range(len(y_list)):
        label = legends[i]
        y = y_list[i][:, :min_x]

        print('{}: {} experiments'.format(label, len(y)))
        # Compute mean for different seeds
        m = np.mean(y, axis=0)
        # Compute standard error
        s = np.squeeze(np.asarray(np.std(y, axis=0)))
        n = y.shape[0]
        exp_name = exp_name_dict[i]
        color = registered_color.get(
            exp_name, new_colormap[i]
        )  # get color if exp_name is registered, otherwise, new color
        plt.fill_between(x,
                         m - s / np.sqrt(n),
                         m + s / np.sqrt(n),
                         color=color,
                         alpha=0.3)
        plt.plot(x, m, color=color, label=label, linewidth=2)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')
    if normalization:
        plt.ylabel('Normalized Rewards')
    else:
        plt.ylabel('Rewards')
    plt.title(title, **fontstyle)
    plt.ylim(y_limits)

    plt.legend(framealpha=0.8,
               frameon=True,
               labelspacing=0.01,
               loc='lower right',
               fontsize=16)
    if figpath is not None:
        plt.savefig(figpath)
    if not no_display:
        plt.show()
Example #20
def loadSRLModel(path=None,
                 cuda=False,
                 state_dim=None,
                 env_object=None,
                 img_shape=None):
    """
    Load a trained SRL model, it will try to guess the model type from the path
    :param path: (str) Path to a srl model
    :param cuda: (bool)
    :param state_dim: (int)
    :param env_object: (gym env object)
    :return: (srl model)
    """

    model_type, losses, n_actions, model = None, None, None, None

    if path is not None:
        # Get path to the log folder
        log_folder = '/'.join(path.split('/')[:-1]) + '/'
        with open(log_folder + 'exp_config.json', 'r') as f:
            # IMPORTANT: keep the order for the losses
            # so the json is loaded as an OrderedDict
            exp_config = json.load(f, object_pairs_hook=OrderedDict)

        state_dim = exp_config.get('state-dim', None)
        losses = exp_config.get(
            'losses', None)  # None in the case of baseline models (pca)
        n_actions = exp_config.get(
            'n_actions', None)  # None in the case of baseline models (pca)
        model_type = exp_config.get('model-type', None)
        use_multi_view = exp_config.get('multi-view', False)
        inverse_model_type = exp_config.get('inverse-model-type', 'linear')
        num_dataset_episodes = exp_config.get('num_dataset_episodes', 100)
        assert state_dim is not None, \
            "Please make sure you are loading an up to date model with a conform exp_config file."

        split_dimensions = exp_config.get('split-dimensions')
        if isinstance(split_dimensions, OrderedDict):
            n_dims = sum(split_dimensions.values())
            # Combine losses instead of splitting
            if n_dims == 0:
                split_dimensions = None
    else:
        assert env_object is not None or state_dim > 0, \
            "When learning states, state_dim must be > 0. Otherwise, set SRL_MODEL_PATH \
            to a srl_model.pth file with learned states."

    if path is not None:
        if 'baselines' in path:
            if 'pca' in path:
                model_type = 'pca'
                model = SRLPCA(state_dim)

    assert model_type is not None or model is not None, \
        "Model type not supported. In order to use loadSRLModel, a path to an SRL model must be given."
    assert losses is not None or model_type == 'pca', \
        "Please make sure you are loading an up-to-date model with a valid exp_config file."
    assert n_actions is not None or model_type == 'pca', \
        "Please make sure you are loading an up-to-date model with a valid exp_config file."
    if model is None:
        if use_multi_view:
            new_img_shape = (6, ) + img_shape[1:]
        else:
            new_img_shape = img_shape
        model = SRLNeuralNetwork(state_dim,
                                 cuda,
                                 img_shape=new_img_shape,
                                 model_type=model_type,
                                 n_actions=n_actions,
                                 losses=losses,
                                 split_dimensions=split_dimensions,
                                 spcls_num_classes=num_dataset_episodes,
                                 inverse_model_type=inverse_model_type)

    model_name = model_type
    if 'baselines' not in path:
        model_name += " with " + ", ".join(losses)
    printGreen("\nSRL: Using {} \n".format(model_name))

    if path is not None:
        printYellow("Loading trained model...{}".format(path))
        model.load(path)
    return model
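
A hedged usage sketch (the path is hypothetical): the folder containing the .pth file must also hold the exp_config.json read above (state-dim, losses, n_actions, model-type, ...), since state_dim and the loss list are taken from that config rather than from the arguments.

# Hypothetical srl_zoo-style log path with exp_config.json next to the checkpoint
srl_model = loadSRLModel(path="srl_zoo/logs/MyDataset/19-01-01_12h00_00_custom_cnn/srl_model.pth",
                         cuda=False,
                         img_shape=(3, 224, 224))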
Example #21
def prog_mlp_extractor(flat_observations,
                       net_arch,
                       act_fun,
                       dict_res_tensor_ph,
                       n_col=0):
    latent = flat_observations
    policy_only_layers = []  # Layer sizes of the network that only belong to the policy network
    value_only_layers = []  # Layer sizes of the network that only belong to the value network

    for idx, layer in enumerate(net_arch):
        if isinstance(layer, int):  # Check that this is a shared layer
            layer_size = layer
            latent = act_fun(
                linear(latent,
                       "shared_fc{}".format(idx),
                       layer_size,
                       init_scale=np.sqrt(2)))
        else:
            if 'pi' in layer:
                assert isinstance(
                    layer['pi'], list
                ), "Error: net_arch[-1]['pi'] must contain a list of integers."
                policy_only_layers = layer['pi']

            if 'vf' in layer:
                assert isinstance(
                    layer['vf'], list
                ), "Error: net_arch[-1]['vf'] must contain a list of integers."
                value_only_layers = layer['vf']
            break  # From here on, the network splits into separate policy and value branches

    # Build the non-shared part of the network
    latent_policy = latent
    latent_value = latent
    for idx, (pi_layer_size, vf_layer_size) in enumerate(
            zip_longest(policy_only_layers, value_only_layers)):
        if pi_layer_size is not None:
            assert isinstance(
                pi_layer_size,
                int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = (linear(latent_policy,
                                    "pi_fc{}".format(idx),
                                    pi_layer_size,
                                    init_scale=np.sqrt(2)))
            if (n_col > 0):
                with tf.variable_scope("pi_res_{}".format(idx),
                                       reuse=tf.AUTO_REUSE):
                    print(latent_policy.name)
                    # and"train_model" in latent_policy.name):
                    res_pi_ph = dict_res_tensor_ph[latent_policy.name.split(
                        ":")[0]]
                    printGreen(res_pi_ph)
                    res_len = res_pi_ph.shape[1]
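                    # Lateral connection ("prog"/progressive-network style): project the
                    # previous column's activations (res_pi_ph) through a learned matrix U
                    # and add them to this layer's pre-activation output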
                    U = tf.get_variable(
                        name="U{}".format(idx),
                        shape=[res_len, pi_layer_size],
                        initializer=tf.constant_initializer(1.))
                    latent_policy += tf.matmul(res_pi_ph, U)

            latent_policy = act_fun(latent_policy)

        if vf_layer_size is not None:
            assert isinstance(
                vf_layer_size,
                int), "Error: net_arch[-1]['vf'] must only contain integers."

            latent_value = (linear(latent_value,
                                   "vf_fc{}".format(idx),
                                   vf_layer_size,
                                   init_scale=np.sqrt(2)))

            if (n_col > 0):
                with tf.variable_scope("vf_res_{}".format(idx),
                                       reuse=tf.AUTO_REUSE):
                    res_vf_ph = dict_res_tensor_ph[latent_value.name.split(":")
                                                   [0]]
                    res_len = res_vf_ph.shape[1]
                    U = tf.get_variable(
                        name="U{}".format(idx),
                        shape=[res_len, vf_layer_size],
                        initializer=tf.constant_initializer(1.))
                    latent_value += tf.matmul(res_vf_ph, U)
            latent_value = act_fun(latent_value)

    return latent_policy, latent_value
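
A minimal invocation sketch (illustrative, TF1-style): it assumes the module's own imports are present (tensorflow as tf, numpy as np, zip_longest from itertools, and the same `linear` helper used above, e.g. from stable_baselines.a2c.utils); the observation shape and net_arch are made up. With n_col=0 no lateral connections are built, so the residual-placeholder dict can stay empty.

import numpy as np
import tensorflow as tf

obs_ph = tf.placeholder(tf.float32, shape=(None, 16), name="flat_obs")
# stable-baselines net_arch convention: shared layers first, then a dict with 'pi'/'vf' branches
net_arch = [64, dict(pi=[64], vf=[64])]

latent_pi, latent_vf = prog_mlp_extractor(obs_ph,
                                          net_arch,
                                          tf.nn.relu,
                                          dict_res_tensor_ph={},
                                          n_col=0)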
Example #22
def loadConfigAndSetup(load_args):
    """
    Get the training config and setup the parameters
    :param load_args: (Arguments)
    :return: (dict, str, str, class, str, dict)
    """
    algo_name = ""
    for algo in list(registered_rl.keys()):
        if algo in load_args.log_dir:
            algo_name = algo
            break
    algo_class, algo_type, _ = registered_rl[algo_name]
    if algo_type == AlgoType.OTHER:
        raise ValueError(algo_name + " is not supported for replay")
    printGreen("\n" + algo_name + "\n")

    load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name)

    env_globals = json.load(open(load_args.log_dir + "env_globals.json", 'r'))
    train_args = json.load(open(load_args.log_dir + "args.json", 'r'))
    if train_args.get("img_shape", None) is None:
        img_shape = None  #(3,224,224)
    else:
        img_shape = tuple(
            map(int,
                train_args.get("img_shape", None)[1:-1].split(",")))

    env_kwargs = {
        "renders": load_args.render,
        "shape_reward": load_args.shape_reward,  # Reward sparse or shaped
        "action_joints": train_args["action_joints"],
        "is_discrete": not train_args["continuous_actions"],
        "random_target": train_args.get('random_target', False),
        "srl_model": train_args["srl_model"],
        "img_shape": img_shape
        # "img_shape" : train_args.get("img_shape", None)
    }

    # load it, if it was defined
    if "action_repeat" in env_globals:
        env_kwargs["action_repeat"] = env_globals['action_repeat']

    # Remove up action
    if train_args["env"] == "Kuka2ButtonGymEnv-v0":
        env_kwargs["force_down"] = env_globals.get('force_down', True)
    else:
        env_kwargs["force_down"] = env_globals.get('force_down', False)

    if train_args["env"] == "OmnirobotEnv-v0":
        env_kwargs["simple_continual_target"] = env_globals.get(
            "simple_continual_target", False)
        env_kwargs["circular_continual_move"] = env_globals.get(
            "circular_continual_move", False)
        env_kwargs["square_continual_move"] = env_globals.get(
            "square_continual_move", False)
        env_kwargs["eight_continual_move"] = env_globals.get(
            "eight_continual_move", False)

        # If overriding the environment for specific Continual Learning tasks
        if sum([
                load_args.simple_continual, load_args.circular_continual,
                load_args.square_continual
        ]) >= 1:
            env_kwargs["simple_continual_target"] = load_args.simple_continual
            env_kwargs[
                "circular_continual_move"] = load_args.circular_continual
            env_kwargs["square_continual_move"] = load_args.square_continual
            env_kwargs["random_target"] = not (load_args.circular_continual
                                               or load_args.square_continual)

    srl_model_path = None
    if train_args["srl_model"] != "raw_pixels":
        train_args["policy"] = "mlp"
        path = env_globals.get('srl_model_path')

        if path is not None:
            env_kwargs["use_srl"] = True
            # Check that the srl saved model exists on the disk
            assert os.path.isfile(
                env_globals['srl_model_path']), "{} does not exist".format(
                    env_globals['srl_model_path'])
            srl_model_path = env_globals['srl_model_path']
            env_kwargs["srl_model_path"] = srl_model_path

    return train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs
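
A hedged sketch of the arguments this function expects (all values are made up): load_args is typically an argparse Namespace from a replay/evaluation script, log_dir must contain args.json, env_globals.json and the saved <algo>_model.pkl, and the algorithm name ("ppo2" here) has to appear somewhere in the log_dir path for the registered_rl lookup to succeed.

from argparse import Namespace

load_args = Namespace(
    log_dir="logs/OmnirobotEnv-v0/srl_combined/ppo2/19-01-01_12h00_00/",  # hypothetical path
    render=False,
    shape_reward=False,
    simple_continual=False,
    circular_continual=False,
    square_continual=False)

train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs = \
    loadConfigAndSetup(load_args)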
Example #23
def plotGatheredExperiments(folders,
                            algo,
                            y_limits,
                            window=40,
                            title="",
                            min_num_x=-1,
                            timesteps=False,
                            output_file="",
                            no_display=False):
    """
    Compute mean and standard error for several experiments and plot the learning curve
    :param folders: ([str]) Log folders, where the monitor.csv are stored
    :param window: (int) Smoothing window
    :param algo: (str) name of the RL algo
    :param title: (str) plot title
    :param min_num_x: (int) Minimum number of episode/timesteps to keep an experiment (default: -1, no minimum)
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param y_limits: ([float]) y-limits for the plot
    :param output_file: (str) Path to a file where the plot data will be saved
    :param no_display: (bool) If True, the plot is not displayed (useful when only saving the figure)
    """
    y_list = []
    x_list = []
    ok = False
    for folder in folders:
        if timesteps:
            x, y = loadData(folder, smooth=1, bin_size=100)
            if x is not None:
                x, y = np.array(x), np.array(y)
        else:
            x, y = loadEpisodesData(folder)

        if x is None or (min_num_x > 0 and y.shape[0] < min_num_x):
            printYellow("Skipping {}".format(folder))
            continue

        if y.shape[0] <= window:
            printYellow("Folder {}".format(folder))
            printYellow(
                "Not enough episodes for current window size = {}".format(
                    window))
            continue
        ok = True
        y = movingAverage(y, window)
        y_list.append(y)

        # Truncate x
        x = x[len(x) - len(y):]
        x_list.append(x)

    if not ok:
        printRed("Not enough data to plot anything with current config." +
                 " Consider decreasing --min-x")
        return

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)

    print("Min x: {}".format(min_x))
    print("Max x: {}".format(max_x))

    for i in range(len(x_list)):
        x_list[i] = x_list[i][:min_x]
        y_list[i] = y_list[i][:min_x]

    x = np.array(x_list)[0]
    y = np.array(y_list)

    printGreen("{} Experiments".format(y.shape[0]))
    print("Min, Max rewards:", np.min(y), np.max(y))

    fig = plt.figure(title)
    # Compute mean for different seeds
    m = np.mean(y, axis=0)
    # Compute standard deviation (the shaded band below uses the standard error s / sqrt(n))
    s = np.squeeze(np.asarray(np.std(y, axis=0)))
    n = y.shape[0]
    plt.fill_between(x,
                     m - s / np.sqrt(n),
                     m + s / np.sqrt(n),
                     color=lightcolors[0])
    plt.plot(x, m, color=darkcolors[0], label=algo, linewidth=1)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')
    plt.ylabel('Rewards')

    plt.title(title, **fontstyle)
    plt.ylim(y_limits)

    plt.legend(framealpha=0.5,
               labelspacing=0.01,
               loc='lower right',
               fontsize=16)

    if output_file != "":
        printGreen("Saving aggregated data to {}.npz".format(output_file))
        np.savez(output_file, x=x, y=y)

    if not no_display:
        plt.show()
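
A minimal call sketch (the folder names are hypothetical): each folder should hold the monitor files written during training, and the helpers used above (loadData, loadEpisodesData, movingAverage, the color lists and fontstyle) are assumed to come from the surrounding module.

folders = ["logs/KukaButtonGymEnv-v0/ppo2/seed_{}".format(i) for i in range(3)]  # hypothetical
plotGatheredExperiments(folders,
                        algo="ppo2",
                        y_limits=[0, 5],
                        window=40,
                        title="PPO2 on KukaButtonGymEnv",
                        timesteps=True,
                        output_file="",   # leave empty to skip saving the aggregated .npz
                        no_display=True)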
Example #24
File: train.py Project: s206283/gcrl
def main():
    # Global variables for callback
    global ENV_NAME, ALGO, ALGO_NAME, LOG_INTERVAL, VISDOM_PORT, viz
    global SAVE_INTERVAL, EPISODE_WINDOW, MIN_EPISODES_BEFORE_SAVE
    parser = argparse.ArgumentParser(description="Train script for RL algorithms")
    parser.add_argument('--algo', default='ppo2', choices=list(registered_rl.keys()), help='RL algo to use',
                        type=str)
    parser.add_argument('--env', type=str, help='environment ID', default='KukaButtonGymEnv-v0',
                        choices=list(registered_env.keys()))
    parser.add_argument('--seed', type=int, default=0, help='random seed (default: 0)')
    parser.add_argument('--episode_window', type=int, default=40,
                        help='Episode window for moving average plot (default: 40)')
    parser.add_argument('--log-dir', default='/tmp/gym/', type=str,
                        help='directory to save agent logs and model (default: /tmp/gym)')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--srl-model', type=str, default='raw_pixels', choices=list(registered_srl.keys()),
                        help='SRL model to use')
    parser.add_argument('--num-stack', type=int, default=1, help='number of frames to stack (default: 1)')
    parser.add_argument('--action-repeat', type=int, default=1,
                        help='number of times an action will be repeated (default: 1)')
    parser.add_argument('--port', type=int, default=8097, help='visdom server port (default: 8097)')
    parser.add_argument('--no-vis', action='store_true', default=False, help='disables visdom visualization')
    parser.add_argument('--shape-reward', action='store_true', default=False,
                        help='Shape the reward (reward = - distance) instead of a sparse reward')
    parser.add_argument('-c', '--continuous-actions', action='store_true', default=False)
    parser.add_argument('-joints', '--action-joints', action='store_true', default=False,
                        help='set actions to the joints of the arm directly, instead of inverse kinematics')
    parser.add_argument('-r', '--random-target', action='store_true', default=False,
                        help='Set the button to a random position')
    parser.add_argument('--srl-config-file', type=str, default="config/srl_models.yaml",
                        help='Set the location of the SRL model path configuration.')
    parser.add_argument('--hyperparam', type=str, nargs='+', default=[])
    parser.add_argument('--min-episodes-save', type=int, default=100,
                        help="Min number of episodes before saving best model")
    parser.add_argument('--latest', action='store_true', default=False,
                        help='load the latest learned model (location:srl_zoo/logs/DatasetName/)')
    parser.add_argument('--load-rl-model-path', type=str, default=None,
                        help="load the trained RL model, should be with the same algorithm type")
    
    # Ignore unknown args for now
    args, unknown = parser.parse_known_args()
    env_kwargs = {}

    # LOAD SRL models list
    assert os.path.exists(args.srl_config_file), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file)
    with open(args.srl_config_file, 'rb') as f:
        all_models = yaml.load(f, Loader=yaml.SafeLoader)

    # Sanity check
    assert args.episode_window >= 1, "Error: --episode_window cannot be less than 1"
    assert args.num_timesteps >= 1, "Error: --num-timesteps cannot be less than 1"
    assert args.num_stack >= 1, "Error: --num-stack cannot be less than 1"
    assert args.action_repeat >= 1, "Error: --action-repeat cannot be less than 1"
    assert 0 <= args.port < 65536, "Error: invalid visdom port number {}, ".format(args.port) + \
                                   "port number must be an unsigned 16bit number [0,65535]."
    assert registered_srl[args.srl_model][0] == SRLType.ENVIRONMENT or args.env in all_models, \
        "Error: the environment {} has no srl_model defined in 'srl_models.yaml'. Cannot continue.".format(args.env)
    # check that all the SRL_model can be run on the environment
    if registered_srl[args.srl_model][1] is not None:
        found = False
        for compatible_class in registered_srl[args.srl_model][1]:
            if issubclass(compatible_class, registered_env[args.env][0]):
                found = True
                break
        assert found, "Error: srl_model {}, is not compatible with the {} environment.".format(args.srl_model, args.env)

    ENV_NAME = args.env
    ALGO_NAME = args.algo
    VISDOM_PORT = args.port
    EPISODE_WINDOW = args.episode_window
    MIN_EPISODES_BEFORE_SAVE = args.min_episodes_save

    if args.no_vis:
        viz = False

    algo_class, algo_type, action_type = registered_rl[args.algo]
    algo = algo_class()
    ALGO = algo
    

    # if callback frequency needs to be changed
    LOG_INTERVAL = algo.LOG_INTERVAL
    SAVE_INTERVAL = algo.SAVE_INTERVAL

    if not args.continuous_actions and ActionType.DISCRETE not in action_type:
        raise ValueError(args.algo + " does not support discrete actions, please use the '--continuous-actions' " +
                         "(or '-c') flag.")
    if args.continuous_actions and ActionType.CONTINUOUS not in action_type:
        raise ValueError(args.algo + " does not support continuous actions, please remove the '--continuous-actions' " +
                         "(or '-c') flag.")

    env_kwargs["is_discrete"] = not args.continuous_actions

    printGreen("\nAgent = {} \n".format(args.algo))

    env_kwargs["action_repeat"] = args.action_repeat
    # Random init position for button
    env_kwargs["random_target"] = args.random_target
    # Allow up action
    # env_kwargs["force_down"] = False

    # allow multi-view
    env_kwargs['multi_view'] = args.srl_model == "multi_view_srl"
    parser = algo.customArguments(parser)
    args = parser.parse_args()

    args, env_kwargs = configureEnvAndLogFolder(args, env_kwargs, all_models)
    args_dict = filterJSONSerializableObjects(vars(args))
    # Save args
    with open(LOG_DIR + "args.json", "w") as f:
        json.dump(args_dict, f)

    env_class = registered_env[args.env][0]
    # env default kwargs
    default_env_kwargs = {k: v.default
                          for k, v in inspect.signature(env_class.__init__).parameters.items()
                          if v is not None}

    globals_env_param = sys.modules[env_class.__module__].getGlobals()

    super_class = registered_env[args.env][1]
    # Recursive search through all the super classes of the requested environment, in order to get all the arguments.
    rec_super_class_lookup = {dict_class: dict_super_class for _, (dict_class, dict_super_class, _, _) in
                              registered_env.items()}
    while super_class != SRLGymEnv:
        assert super_class in rec_super_class_lookup, "Error: could not find super class of {}".format(super_class) + \
                                                      ", are you sure \"registered_env\" is correctly defined?"
        super_env_kwargs = {k: v.default
                            for k, v in inspect.signature(super_class.__init__).parameters.items()
                            if v is not None}
        default_env_kwargs = {**super_env_kwargs, **default_env_kwargs}

        globals_env_param = {**sys.modules[super_class.__module__].getGlobals(), **globals_env_param}

        super_class = rec_super_class_lookup[super_class]

    # Print Variables
    printYellow("Arguments:")
    pprint(args_dict)
    printYellow("Env Globals:")
    pprint(filterJSONSerializableObjects({**globals_env_param, **default_env_kwargs, **env_kwargs}))
    # Save env params
    saveEnvParams(globals_env_param, {**default_env_kwargs, **env_kwargs})
    # Seed tensorflow, python and numpy random generator
    set_global_seeds(args.seed)
    # Augment the number of timesteps (when using multiprocessing this number is not reached)
    args.num_timesteps = int(1.1 * args.num_timesteps)
    # Get the hyperparameter, if given (Hyperband)
    hyperparams = {param.split(":")[0]: param.split(":")[1] for param in args.hyperparam}
    hyperparams = algo.parserHyperParam(hyperparams)
    
    if args.load_rl_model_path is not None:
        # use a small learning rate
        print("use a small learning rate: {:f}".format(1.0e-4))
        hyperparams["learning_rate"] = lambda f: f * 1.0e-4
        
    # Train the agent

    if args.load_rl_model_path is not None:
        algo.setLoadPath(args.load_rl_model_path)
    algo.train(args, callback, env_kwargs=env_kwargs, train_kwargs=hyperparams)
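
The "--hyperparam" entries accepted above are plain "name:value" strings; a small, self-contained illustration of how the dict comprehension in main() turns them into a string-valued dict before algo.parserHyperParam casts them (the names and values here are only examples):

hyperparam_args = ["learning_rate:0.00025", "gamma:0.99"]  # e.g. from --hyperparam on the CLI
hyperparams = {p.split(":")[0]: p.split(":")[1] for p in hyperparam_args}
print(hyperparams)  # {'learning_rate': '0.00025', 'gamma': '0.99'} -- values are still strings here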