Example #1
def configure(folder: str,
              format_strs: Optional[Sequence[str]] = None) -> None:
    """Configure Stable Baselines logger to be `accumulate_means()`-compatible.

    After this function is called, `stable_baselines.logger.{configure,reset}()`
    are replaced with stubs that raise RuntimeError.

    Args:
        folder: Argument from `stable_baselines.logger.configure`.
        format_strs: A list of output format strings. For details on available
          output formats see `stable_baselines.logger.make_output_format`.
    """
    # Replace `stable_baselines.logger` methods with erroring stubs to
    # prevent unexpected logging state from mixed logging configuration.
    sb_logger.configure = _sb_logger_configure_replacement
    sb_logger.reset = _sb_logger_reset_replacement

    if format_strs is None:
        format_strs = ["stdout", "log", "csv"]
    output_formats = _build_output_formats(folder, format_strs)
    default_logger = sb_logger.Logger(folder, output_formats)
    hier_logger = _HierarchicalLogger(default_logger, format_strs)
    sb_logger.Logger.CURRENT = hier_logger
    sb_logger.log("Logging to %s" % folder)
    assert is_configured()
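A minimal usage sketch (not part of the original example), assuming the configure function above is in scope and that output/run_1 is a writable folder:

import stable_baselines.logger as sb_logger

configure("output/run_1", format_strs=["stdout", "csv"])
sb_logger.logkv("episode_reward", 1.0)  # routed through the hierarchical logger
sb_logger.dumpkvs()
sb_logger.log("plain text messages still work")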
Example #2
def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir):
    """
    Continue PPO training from start_theta and save the resulting model.
    """

    logger.log(f"#######CMA and then PPO TRAIN: {args}")

    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)

    import shutil

    if os.path.exists(full_param_traj_dir_path):
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(conti_ppo_save_dir):
        shutil.rmtree(conti_ppo_save_dir)
    os.makedirs(conti_ppo_save_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out
    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{full_space_save_dir}/ppo2")
    model.set_from_flat(start_theta)

    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)


    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99,
    #              noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)

    model.tell_run_info(run_info)
    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)

    model.save(f"{conti_ppo_save_dir}/ppo2")

    env.save_running_average(conti_ppo_save_dir)
    return episode_returns, full_param_traj_dir_path
Example #3
def train(args):
    total_timesteps = int(args.num_timesteps)
    seed = args.seed

    # get params
    alg_kwargs, policy_kwargs = get_params(args)

    env = build_env(args)
    policy = get_policy(args)

    # if args.use_typeVector:
    #     model = policy(CnnVectorPolicy, env, verbose=1, policy_kwargs=policy_kwargs, **alg_kwargs)
    # else:
    #     model = policy(CnnPolicy, env, verbose=1, policy_kwargs=policy_kwargs, **alg_kwargs)
    model = policy(CnnPolicy,
                   env,
                   verbose=1,
                   policy_kwargs=policy_kwargs,
                   **alg_kwargs)

    model.learn(
        total_timesteps=total_timesteps,
        log_interval=args.log_interval,
        # save_interval=args.save_interval
    )

    logger.log('Training over.')
    return model, env
Example #4
def main():

    # requires  n_comp_to_use, pc1_chunk_size
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()


    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)


    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)


    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''
    result = do_pca(cma_args.n_components, cma_args.n_comp_to_use, traj_params_dir_name, intermediate_data_dir,
                    proj=False,
                    origin="mean_param", use_IPCA=cma_args.use_IPCA, chunk_size=cma_args.chunk_size, reuse=True)
    logger.debug("after pca")

    final_pcs = result["first_n_pcs"]

    all_param_iterator = get_allinone_concat_df(dir_name=traj_params_dir_name, use_IPCA=True, chunk_size=cma_args.pc1_chunk_size)
    plane_angles_vs_final_plane_along_the_way = []
    ipca = IncrementalPCA(n_components=cma_args.n_comp_to_use)  # incremental PCA: fit in chunks to bound memory
    for chunk in all_param_iterator:

        logger.log(f"currently at {all_param_iterator._currow}")
        ipca.partial_fit(chunk.values)

        first_n_pcs = ipca.components_[:cma_args.n_comp_to_use]
        assert final_pcs.shape[0] == first_n_pcs.shape[0]


        plane_angle = cal_angle_between_nd_planes(first_n_pcs, final_pcs)
        plane_angles_vs_final_plane_along_the_way.append(plane_angle)


    plot_dir = get_plot_dir(cma_args)
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)
    plane_angles_vs_final_plane_plot_dir = get_plane_angles_vs_final_plane_along_the_way_plot_dir(plot_dir, cma_args.n_comp_to_use)
    if not os.path.exists(plane_angles_vs_final_plane_plot_dir):
        os.makedirs(plane_angles_vs_final_plane_plot_dir)




    angles_plot_name = f"plane_angles_vs_final_plane_plot_dir "
    plot_2d(plane_angles_vs_final_plane_plot_dir, angles_plot_name, np.arange(len(plane_angles_vs_final_plane_along_the_way)), plane_angles_vs_final_plane_along_the_way, "num of chunks", "angle with diff in degrees", False)
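cal_angle_between_nd_planes is not defined in this listing; a plausible stand-in based on principal angles, assuming each plane is passed as an (n_components, n_params) array of basis vectors:

import numpy as np
from scipy.linalg import subspace_angles

def cal_angle_between_nd_planes(plane_a, plane_b):
    # subspace_angles expects basis vectors as columns, hence the transposes;
    # report the largest principal angle in degrees (hypothetical helper)
    return np.degrees(np.max(subspace_angles(plane_a.T, plane_b.T)))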
Example #5
def do_cma(cma_args, first_n_pcs, origin_param, save_dir, starting_coord, var):

    tic = time.time()

    #TODO better starting locations, record how many samples,

    logger.log(f"CMAES STARTING :{starting_coord}")
    es = cma.CMAEvolutionStrategy(starting_coord, var)
    total_num_of_evals = 0
    total_num_timesteps = 0

    mean_rets = []
    min_rets = []
    max_rets = []
    eval_returns = None

    optimization_path = []
    while total_num_timesteps < cma_args.cma_num_timesteps and not es.stop():
        solutions = es.ask()
        optimization_path.extend(solutions)
        thetas = [
            np.matmul(coord, first_n_pcs) + origin_param for coord in solutions
        ]
        logger.log(
            f"current time steps num: {total_num_timesteps} total time steps: {cma_args.cma_num_timesteps}"
        )
        eval_returns = Parallel(n_jobs=cma_args.cores_to_use) \
            (delayed(eval_return)(cma_args, save_dir, theta, cma_args.eval_num_timesteps, i) for
             (i, theta) in enumerate(thetas))

        mean_rets.append(np.mean(eval_returns))
        min_rets.append(np.min(eval_returns))
        max_rets.append(np.max(eval_returns))

        total_num_of_evals += len(eval_returns)
        total_num_timesteps += cma_args.eval_num_timesteps * len(eval_returns)

        logger.log(f"current eval returns: {str(eval_returns)}")
        logger.log(f"total timesteps so far: {total_num_timesteps}")
        negative_eval_returns = [-r for r in eval_returns]

        es.tell(solutions, negative_eval_returns)
        es.logger.add()  # write data to disk to be plotted
        es.disp()

    toc = time.time()
    logger.log(
        f"####################################CMA took {toc-tic} seconds")

    es_logger = es.logger

    if not hasattr(es_logger, 'xmean'):
        es_logger.load()

    n_comp_used = first_n_pcs.shape[0]
    optimization_path_mean = np.vstack(
        (starting_coord, es_logger.xmean[:, 5:5 + n_comp_used]))

    return mean_rets, min_rets, max_rets, np.array(
        optimization_path), np.array(optimization_path_mean)
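The loop above follows pycma's standard ask/tell pattern; a self-contained toy version minimizing a quadratic looks like this (note that do_cma negates the evaluation returns because CMA-ES minimizes):

import cma
import numpy as np

es = cma.CMAEvolutionStrategy(np.zeros(5), 0.5)  # start point, initial sigma
while not es.stop():
    solutions = es.ask()                                    # sample candidates
    fitnesses = [float(np.sum(np.square(x))) for x in solutions]
    es.tell(solutions, fitnesses)                           # update the search distribution
    es.disp()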
Example #6
def main():

    # Parse command line args
    parser = arg_parser()
    parser.add_argument("-ns", "--num-timesteps", type=str, default="1e6")
    parser.add_argument("-hw", "--use-hardware", action="store_true")
    parser.add_argument("-ld", "--logdir", type=str, default="logs")
    parser.add_argument("-l", "--load", type=str, default=None)
    parser.add_argument("-s", "--save", action="store_true")
    parser.add_argument("-si", "--save-interval", type=float, default=5e4)
    parser.add_argument("-p", "--play", action="store_true")
    parser.add_argument("-sd", "--seed", type=int, default=-1)
    parser.add_argument(
        "-o", "--output-formats", nargs="*", default=["stdout", "log", "csv"]
    )
    args = parser.parse_args()

    # Set default seed
    if args.seed == -1:
        seed = np.random.randint(1, 1000)
        print("Seed is", seed)
    else:
        seed = args.seed

    device_type = "hardware" if args.use_hardware else "simulator"
    logdir = "{}/{}/{}/{}/seed-{}".format(
        args.logdir, device_type, "QubeSwingupEnv", args.num_timesteps, str(seed)
    )

    logger.configure(logdir, args.output_formats)

    # Convert the save interval from timesteps to a number of 2048-step updates
    save_interval = int(np.ceil(args.save_interval / 2048)) if args.save else 0

    # Run training script (+ loading/saving)
    model, env = train(
        QubeSwingupEnv,
        num_timesteps=int(float(args.num_timesteps)),
        hardware=args.use_hardware,
        logdir=logdir,
        save=args.save,
        save_interval=save_interval,
        load=args.load,
        seed=seed,
    )

    if args.play:
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs,) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            actions = model.step(obs)[0]
            obs[:] = env.step(actions)[0]
            if not args.use_hardware:
                env.render()

    env.close()
Example #7
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(args)
    plot_dir_alg = get_plot_dir(args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir,
                                                      params_scope="pi")
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    model.set_pi_from_flat(final_params)

    if args.normalize:
        env.load_running_average(save_dir)

    # Activation maximization: repeatedly nudge a synthetic observation so
    # that one chosen policy neuron's output grows (Adam minimizes the
    # neuron's negation, which maximizes the neuron itself).
    obz_tensor = model.act_model.fake_input_tensor

    some_neuron = model.act_model.policy_neurons[2][-1]

    grads = tf.gradients(tf.math.negative(some_neuron), obz_tensor)

    grads = list(zip(grads, obz_tensor))

    trainer = tf.train.AdamOptimizer(learning_rate=0.01, epsilon=1e-5)

    train_op = trainer.apply_gradients(grads)
    for _ in range(10000):
        obz, _ = model.sess.run([obz_tensor, train_op])
Example #8
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()
    run_nums = cma_args.run_nums_to_check
    run_nums = [int(run_num) for run_num in run_nums.split(":")]

    final_params_list = []
    start_params_list = []

    for run_num in run_nums:
        cma_args.run_num = run_num
        if os.path.exists(get_dir_path_for_this_run(cma_args)):

            this_run_dir = get_dir_path_for_this_run(cma_args)
            plot_dir_alg = get_plot_dir(cma_args)

            traj_params_dir_name = get_full_params_dir(this_run_dir)
            intermediate_data_dir = get_intermediate_data_dir(
                this_run_dir, params_scope="pi")
            save_dir = get_save_dir(this_run_dir)

            if not os.path.exists(intermediate_data_dir):
                os.makedirs(intermediate_data_dir)
            if not os.path.exists(plot_dir_alg):
                os.makedirs(plot_dir_alg)

            start_file = get_full_param_traj_file_path(traj_params_dir_name,
                                                       "pi_start")
            start_params = pd.read_csv(start_file, header=None).values[0]

            final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                                       "pi_final")
            final_params = pd.read_csv(final_file, header=None).values[0]

            final_params_list.append(final_params)
            start_params_list.append(start_params)

            cma_args.run_num += 1

    final_params_distances = []
    for i in range(len(final_params_list)):
        for j in range(i + 1, len(final_params_list)):
            final_params_distances.append(
                LA.norm(final_params_list[i] - final_params_list[j], ord=2))

    plot_dir = get_plot_dir(cma_args)
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)
    np.savetxt(f"{plot_dir}/final_params_distances.txt",
               final_params_distances,
               delimiter=",")
Example #9
def do_eval_returns(plot_args,
                    intermediate_data_dir,
                    two_pcs_eval,
                    origin_param,
                    xcoordinates_to_eval,
                    ycoordinates_to_eval,
                    save_dir,
                    pca_center="final_param",
                    reuse=True):

    eval_string = f"xnum_{np.min(xcoordinates_to_eval)}:{np.max(xcoordinates_to_eval)}:{plot_args.xnum}_" \
                    f"ynum_{np.min(ycoordinates_to_eval)}:{np.max(ycoordinates_to_eval)}:{plot_args.ynum}"

    if not reuse or not os.path.exists(
            get_eval_returns_filename(intermediate_dir=intermediate_data_dir,
                                      eval_string=eval_string,
                                      n_comp=2,
                                      pca_center=pca_center)):

        from stable_baselines.ppo2.run_mujoco import eval_return
        thetas_to_eval = [
            origin_param + x * two_pcs_eval[0] + y * two_pcs_eval[1]
            for y in ycoordinates_to_eval for x in xcoordinates_to_eval
        ]

        tic = time.time()

        eval_returns = Parallel(n_jobs=plot_args.cores_to_use, max_nbytes='100M')\
            (delayed(eval_return)(plot_args, save_dir, theta, plot_args.eval_num_timesteps, i) for (i, theta) in enumerate(thetas_to_eval))
        toc = time.time()
        logger.log(
            f"####################################1st version took {toc-tic} seconds"
        )

        np.savetxt(get_eval_returns_filename(
            intermediate_dir=intermediate_data_dir,
            eval_string=eval_string,
            n_comp=2,
            pca_center=pca_center),
                   eval_returns,
                   delimiter=',')
    else:
        eval_returns = np.loadtxt(get_eval_returns_filename(
            intermediate_dir=intermediate_data_dir,
            eval_string=eval_string,
            n_comp=2,
            pca_center=pca_center),
                                  delimiter=',')

    return eval_returns
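The flat eval_returns list is ordered x-fastest (the inner loop of the comprehension above runs over xcoordinates_to_eval), so turning it into a 2-D grid for plotting means one row per y value; a tiny self-contained check of that ordering:

import numpy as np

xnum, ynum = 3, 2                 # assumed grid resolution
flat = np.arange(xnum * ynum)     # stands in for eval_returns
grid = flat.reshape(ynum, xnum)   # x varies fastest -> one row per y
print(grid)                       # [[0 1 2]
                                  #  [3 4 5]]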
Example #10
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''
    from stable_baselines.low_dim_analysis.common import do_pca, plot_2d

    origin = "mean_param"
    result = do_pca(cma_args.n_components,
                    cma_args.n_comp_to_use,
                    traj_params_dir_name,
                    intermediate_data_dir,
                    proj=False,
                    origin=origin,
                    use_IPCA=cma_args.use_IPCA,
                    chunk_size=cma_args.chunk_size)

    final_params = result["final_concat_params"]
    all_pcs = result["pcs_components"]

    logger.log("grab start params")
    start_file = get_full_param_traj_file_path(traj_params_dir_name, "start")
    start_params = pd.read_csv(start_file, header=None).values[0]

    angles = []
    for pc in all_pcs:
        angles.append(cal_angle(pc, final_params - start_params))

    plot_dir = get_plot_dir(cma_args)
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    angles_plot_name = f"angles with final - start start n_comp:{all_pcs.shape[0]} dim space of mean pca plane, "
    plot_2d(plot_dir, angles_plot_name, np.arange(all_pcs.shape[0]), angles,
            "num of pcs", "angle with diff", False)
Example #11
    def log_info(self):
        """
        log the information of the dataset
        """
        logger.log("Total trajectories: %d" % self.num_traj)
        logger.log("Total transitions: %d" % self.num_transition)
        logger.log("Average returns: %f" % self.avg_ret)
        logger.log("Std for returns: %f" % self.std_ret)
Example #12
    def log_info(self):
        """
        Log the information of the dataset.
        """
        logger.log("Total trajectories: {}".format(self.num_traj))
        logger.log("Total transitions: {}".format(self.num_transition))
        logger.log("Average returns: {}".format(self.avg_ret))
        logger.log("Std for returns: {}".format(self.std_ret))
Example #13
def plot_cma_returns(plot_dir_alg, name, mean_rets, min_rets, max_rets, show):

    X = np.arange(len(mean_rets))
    fig, ax = plt.subplots()
    plt.xlabel('num of eval')
    plt.ylabel('mean returns with min and max filled')

    ax.plot(X, mean_rets)
    ax.fill_between(X, min_rets, max_rets, alpha=0.5)
    file_path = f"{plot_dir_alg}/{name}.pdf"
    if os.path.isfile(file_path):
        os.remove(file_path)

    logger.log(f"saving cma plot to {file_path}")
    fig.savefig(file_path, dpi=300, bbox_inches='tight', format='pdf')
    if show: plt.show()
Example #14
def main():
    """
    Runs the test
    """
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    if args.play:
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs,) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            actions = model.step(obs)[0]
            obs[:] = env.step(actions)[0]
            env.render('human')
Example #15
def plot_2d(plot_dir_alg, name, X, Y, xlabel, ylabel, show):

    fig, ax = plt.subplots()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    ax.plot(X, Y)
    file_path = f"{plot_dir_alg}/{name}.pdf"
    if os.path.isfile(file_path):
        os.remove(file_path)

    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)
    logger.log(f"####saving to {file_path}")
    fig.savefig(file_path, dpi=300, bbox_inches='tight', format='pdf')
    if show: plt.show()
Example #16
def train(args):
    total_timesteps = int(args.num_timesteps)
    seed = args.seed

    # get params
    alg_kwargs = get_params(args)

    env = build_env(args)

    model = PPO2(CnnPolicy, env, verbose=1, **alg_kwargs)
    model.learn(total_timesteps=total_timesteps,
                log_interval=args.log_interval,
                save_interval=args.save_interval)

    logger.log('Training over.')
    return model, env
Example #17
def plot_2d_2(plot_dir_alg, name, X, grad_vs_v, pc1_vs_V, xlabel, ylabel, show):
    fig, ax = plt.subplots()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    ax.plot(X, grad_vs_v)
    ax.plot(X, pc1_vs_V)

    plt.legend(['in so far grad_vs_v', 'in so far pc1_vs_V'], loc='upper left')

    file_path = f"{plot_dir_alg}/{name}.pdf"
    if os.path.isfile(file_path):
        os.remove(file_path)
    logger.log(f"####saving to {file_path}")
    fig.savefig(file_path, dpi=300,
                bbox_inches='tight', format='pdf')
    if show: plt.show()
Example #18
def main():
    """
    Runs the test
    """
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    env = make_mujoco_env(args.env, args.seed)
    model = PPO1(MlpPolicy,
                 env,
                 timesteps_per_actorbatch=2048,
                 clip_param=0.2,
                 entcoeff=0.0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64,
                 gamma=0.99,
                 lam=0.95,
                 schedule='linear')
    model.learn(total_timesteps=args.num_timesteps)

    model.save("ppo1")
    # env.close()

    del model  # remove to demonstrate saving and loading
    # env = make_mujoco_env(args.env, args.seed)

    model = PPO1.load("ppo1")
    logger.log("~!!!!!!!!")
    episode_rew = 0
    obs = env.reset()

    while True:
        action, _states = model.predict(obs)
        ob, reward, done, info = env.step(action)
        episode_rew += reward
        env.render()
        if done:
            print(f'episode_rew={episode_rew}')
            episode_rew = 0
            obs = env.reset()
Example #19
    def __init__(self, env_fns, spaces=None):
        """
        If you don't specify observation_space, we'll have to create a dummy
        environment to get it.
        """
        if spaces:
            observation_space, action_space = spaces
        else:
            logger.log('Creating dummy env object to get spaces')
            with logger.scoped_configure(format_strs=[]):
                dummy = env_fns[0]()
                observation_space, action_space = dummy.observation_space, dummy.action_space
                dummy.close()
                del dummy
        VecEnv.__init__(self, len(env_fns), observation_space, action_space)

        obs_spaces = observation_space.spaces if isinstance(
            self.observation_space,
            gym.spaces.Tuple) else (self.observation_space, )
        self.obs_bufs = [
            tuple(
                Array(_NP_TO_CT[s.dtype.type], int(np.prod(s.shape)))
                for s in obs_spaces) for _ in env_fns
        ]
        self.obs_shapes = [s.shape for s in obs_spaces]
        self.obs_dtypes = [s.dtype for s in obs_spaces]

        self.parent_pipes = []
        self.procs = []
        for i, (env_fn, obs_buf) in enumerate(zip(env_fns, self.obs_bufs)):
            wrapped_fn = CloudpickleWrapper(env_fn)
            parent_pipe, child_pipe = Pipe()
            proc = Process(target=_subproc_worker,
                           args=(child_pipe, parent_pipe, wrapped_fn, obs_buf,
                                 self.obs_shapes, i))
            proc.daemon = True
            self.procs.append(proc)
            self.parent_pipes.append(parent_pipe)
            proc.start()
            child_pipe.close()
        self.waiting_step = False
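_NP_TO_CT is not defined in this snippet; in OpenAI Baselines' ShmemVecEnv, which this __init__ closely mirrors, it maps NumPy dtypes to ctypes types roughly as follows (a sketch, not necessarily the exact table used here):

import ctypes
import numpy as np

_NP_TO_CT = {
    np.float32: ctypes.c_float,
    np.int32: ctypes.c_int32,
    np.int8: ctypes.c_int8,
    np.uint8: ctypes.c_char,
    np.bool_: ctypes.c_bool,
}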
Example #20
def main(origin="final_param"):

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    # Override the parsed args for this run; updating the Namespace in place
    # keeps later attribute accesses (cma_args.n_components etc.) working,
    # whereas rebinding cma_args to a plain dict would break them.
    vars(cma_args).update({
        "alg": 'ppo2',
        "env": "DartHopper-v1",
        "num_timesteps": 5000,
        "normalize": True,
        "n_steps": 2048,
        "nminibatches": 32,
        "run_num": 0
    })
    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''
    from stable_baselines.low_dim_analysis.common import do_pca
    result = do_pca(cma_args.n_components,
                    cma_args.n_comp_to_use,
                    traj_params_dir_name,
                    intermediate_data_dir,
                    proj=True,
                    origin=origin,
                    use_IPCA=cma_args.use_IPCA,
                    chunk_size=cma_args.chunk_size)
Example #21
def test(model_path, env, args):
    policy = get_policy(args)
    model = policy.load(model_path)

    test_episode = args.test_episode
    num_env = args.num_env
    take_nums = args.take_nums
    avg_reward = 0

    logger.log('Begin testing, total ' + str(test_episode * num_env) +
               ' episodes...')
    for i_episode in range(test_episode):
        obs = env.reset()

        for _ in range(take_nums):

            action, _states = model.predict(obs)

            obs, rewards, dones, info = env.step(action)
            avg_reward += np.sum(rewards)

    avg_reward /= (test_episode * num_env)
    logger.log('Average reward: ' + str(avg_reward))
Example #22
def main():

    # requires  n_comp_to_use, pc1_chunk_size
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()


    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)


    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)


    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''

    logger.log("grab final params")
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    logger.log("grab start params")
    start_file = get_full_param_traj_file_path(traj_params_dir_name, "start")
    start_params = pd.read_csv(start_file, header=None).values[0]

    V = final_params - start_params

    pcs_components = np.loadtxt(
        get_pcs_filename(intermediate_dir=intermediate_data_dir, n_comp=cma_args.num_comp_to_load), delimiter=',')

    angle = cal_angle(V, pcs_components[0])
    logger.log(f"@@@@@@@@@@@@ {angle}")
Example #23
def test(model, env, args):
    logger.log("Test...")

    n_episode = args.test_episode
    num_env = args.num_env
    state = model.initial_state if hasattr(model, 'initial_state') else None
    dones = np.zeros((1, ))
    total_rewards = 0

    for i_episode in range(n_episode):
        obs = env.reset()

        for i in range(100):

            if state is not None:
                actions, _, state, _ = model.step(obs, S=state, M=dones)
            else:
                actions, _, _, _ = model.step(obs)

            obs, rew, done, info = env.step(actions)

            for r in rew:
                total_rewards += np.sum(r)

            done = done[0]
            if done:
                break

    avg_reward = total_rewards / (n_episode * num_env)

    if args.log:
        logger.log("Path: ", args.save_dir)
        logger.log("Test ", n_episode, " episodes, average reward is: ",
                   avg_reward)
        logger.log("Test over.")
    else:
        print("Test ", n_episode, " episodes, average reward is: ", avg_reward)
        print("Test over.")
Example #24
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="PPO1",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            with self.sess.as_default():
                self.adam.sync()
                callback.on_training_start(locals(), globals())

                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi,
                                                 self.env,
                                                 self.timesteps_per_actorbatch,
                                                 callback=callback)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()

                # rolling buffer for episode lengths
                len_buffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                reward_buffer = deque(maxlen=100)

                while True:
                    if timesteps_so_far >= total_timesteps:
                        break

                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        cur_lrmult = max(
                            1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    seg = seg_gen.__next__()

                    # Stop training early (triggered by the callback)
                    if not seg.get('continue_training', True):  # pytype: disable=attribute-error
                        break

                    add_vtarg_and_adv(seg, self.gamma, self.lam)

                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    observations, actions = seg["observations"], seg["actions"]
                    atarg, tdlamret = seg["adv"], seg["tdlamret"]

                    # true_rew is the reward without discount
                    if writer is not None:
                        total_episode_reward_logger(
                            self.episode_reward, seg["true_rewards"].reshape(
                                (self.n_envs, -1)), seg["dones"].reshape(
                                    (self.n_envs, -1)), writer,
                            self.num_timesteps)

                    # predicted value function before update
                    vpredbefore = seg["vpred"]

                    # standardized advantage function estimate
                    atarg = (atarg - atarg.mean()) / atarg.std()
                    dataset = Dataset(dict(ob=observations,
                                           ac=actions,
                                           atarg=atarg,
                                           vtarg=tdlamret),
                                      shuffle=not self.policy.recurrent)
                    optim_batchsize = self.optim_batchsize or observations.shape[0]

                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)
                    logger.log("Optimizing...")
                    logger.log(fmt_row(13, self.loss_names))

                    # Here we do a bunch of optimization epochs over the data
                    for k in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for i, batch in enumerate(
                                dataset.iterate_once(optim_batchsize)):
                            steps = (
                                self.num_timesteps + k * optim_batchsize +
                                int(i *
                                    (optim_batchsize / len(dataset.data_map))))
                            if writer is not None:
                                # run loss backprop with summary, but once every 10 runs save the metadata
                                # (memory, compute time, ...)
                                if self.full_tensorboard_log and (1 +
                                                                  k) % 10 == 0:
                                    run_options = tf.compat.v1.RunOptions(
                                        trace_level=tf.compat.v1.RunOptions.
                                        FULL_TRACE)
                                    run_metadata = tf.compat.v1.RunMetadata()
                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess,
                                        options=run_options,
                                        run_metadata=run_metadata)
                                    writer.add_run_metadata(
                                        run_metadata, 'step%d' % steps)
                                else:
                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *newlosses = self.lossandgrad(
                                    batch["ob"],
                                    batch["ob"],
                                    batch["ac"],
                                    batch["atarg"],
                                    batch["vtarg"],
                                    cur_lrmult,
                                    sess=self.sess)

                            self.adam.update(grad,
                                             self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(losses, axis=0)))

                    logger.log("Evaluating losses...")
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"],
                                                        batch["ob"],
                                                        batch["ac"],
                                                        batch["atarg"],
                                                        batch["vtarg"],
                                                        cur_lrmult,
                                                        sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)
                    logger.log(fmt_row(13, mean_losses))
                    for (loss_val, name) in zipsame(mean_losses,
                                                    self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular(
                        "ev_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])

                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    len_buffer.extend(lens)
                    reward_buffer.extend(rews)
                    if len(len_buffer) > 0:
                        logger.record_tabular("EpLenMean", np.mean(len_buffer))
                        logger.record_tabular("EpRewMean",
                                              np.mean(reward_buffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps
                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                        logger.dump_tabular()
        callback.on_training_end()
        return self
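add_vtarg_and_adv, called once per iteration above, fills in the GAE(lambda) advantages ("adv") and TD(lambda) value targets ("tdlamret") that the update consumes; a sketch of the standard computation, assuming the seg dict also carries a "nextvpred" bootstrap value:

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # run backwards over the segment, bootstrapping from the value
    # prediction for the state after the last collected step
    dones = np.append(seg["dones"], 0)
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    rewards = seg["rewards"]
    adv = np.empty(len(rewards), dtype="float32")
    lastgaelam = 0.0
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - dones[t + 1]
        delta = rewards[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + seg["vpred"]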
Example #25
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG", \
             reset_num_timesteps=True, replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # a list for tensorboard logging, to prevent logging with the same step number, if it already occurred
            self.tb_seen_steps = []

            rank = MPI.COMM_WORLD.Get_rank()
            # we assume symmetric actions.
            assert np.all(
                np.abs(self.env.action_space.low) ==
                self.env.action_space.high)
            if self.verbose >= 2:
                logger.log('Using agent with the following configuration:')
                logger.log(str(self.__dict__.items()))

            eval_episode_rewards_history = deque(maxlen=100)
            episode_rewards_history = deque(maxlen=100)
            self.episode_reward = np.zeros((1, ))
            episode_successes = []
            with self.sess.as_default(), self.graph.as_default():
                # Prepare everything.
                self._reset()
                obs = self.env.reset()
                eval_obs = None
                if self.eval_env is not None:
                    eval_obs = self.eval_env.reset()
                episode_reward = 0.
                episode_step = 0
                episodes = 0
                step = 0
                total_steps = 0

                start_time = time.time()

                epoch_episode_rewards = []
                epoch_episode_steps = []
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                eval_episode_rewards = []
                eval_qs = []
                epoch_actions = []
                epoch_qs = []
                epoch_episodes = 0
                epoch = 0
                while True:
                    for _ in range(log_interval):
                        # Perform rollouts.
                        for _ in range(self.nb_rollout_steps):
                            if total_steps >= total_timesteps:
                                return self

                            # Predict next action.
                            action, q_value = self._policy(obs,
                                                           apply_noise=True,
                                                           compute_q=True)
                            assert action.shape == self.env.action_space.shape

                            # Execute next action.
                            if rank == 0 and self.render:
                                self.env.render()

                            # Randomly sample actions from a uniform distribution
                            # with a probability self.random_exploration (used in HER + DDPG)
                            if np.random.rand() < self.random_exploration:
                                rescaled_action = action = self.action_space.sample(
                                )
                            else:
                                rescaled_action = action * np.abs(
                                    self.action_space.low)

                            rescaled_action = np.where(action)[0][0]
                            new_obs, reward, done, info = self.env.step(
                                rescaled_action)

                            if writer is not None:
                                ep_rew = np.array([reward]).reshape((1, -1))
                                ep_done = np.array([done]).reshape((1, -1))
                                self.episode_reward = total_episode_reward_logger(
                                    self.episode_reward, ep_rew, ep_done,
                                    writer, self.num_timesteps)
                            step += 1
                            total_steps += 1
                            self.num_timesteps += 1
                            if rank == 0 and self.render:
                                self.env.render()
                            episode_reward += reward
                            episode_step += 1

                            # Book-keeping.
                            epoch_actions.append(action)
                            epoch_qs.append(q_value)
                            self._store_transition(obs, action, reward,
                                                   new_obs, done)
                            obs = new_obs
                            if callback is not None:
                                # Only stop training if return value is False, not when it is None.
                                # This is for backwards compatibility with callbacks that have no return statement.
                                if callback(locals(), globals()) is False:
                                    return self

                            if done:
                                # Episode done.
                                epoch_episode_rewards.append(episode_reward)
                                episode_rewards_history.append(episode_reward)
                                epoch_episode_steps.append(episode_step)
                                episode_reward = 0.
                                episode_step = 0
                                epoch_episodes += 1
                                episodes += 1

                                maybe_is_success = info.get('is_success')
                                if maybe_is_success is not None:
                                    episode_successes.append(
                                        float(maybe_is_success))

                                self._reset()
                                if not isinstance(self.env, VecEnv):
                                    obs = self.env.reset()

                        # Train.
                        epoch_actor_losses = []
                        epoch_critic_losses = []
                        epoch_adaptive_distances = []
                        for t_train in range(self.nb_train_steps):
                            # Not enough samples in the replay buffer
                            if not self.replay_buffer.can_sample(
                                    self.batch_size):
                                break

                            # Adapt param noise, if necessary.
                            if len(self.replay_buffer) >= self.batch_size and \
                                    t_train % self.param_noise_adaption_interval == 0:
                                distance = self._adapt_param_noise()
                                epoch_adaptive_distances.append(distance)

                            # weird equation to deal with the fact the nb_train_steps will be different
                            # to nb_rollout_steps
                            step = (int(t_train * (self.nb_rollout_steps /
                                                   self.nb_train_steps)) +
                                    self.num_timesteps - self.nb_rollout_steps)

                            critic_loss, actor_loss = self._train_step(
                                step, writer, log=t_train == 0)
                            epoch_critic_losses.append(critic_loss)
                            epoch_actor_losses.append(actor_loss)
                            self._update_target_net()

                        # Evaluate.
                        eval_episode_rewards = []
                        eval_qs = []
                        if self.eval_env is not None:
                            eval_episode_reward = 0.
                            for _ in range(self.nb_eval_steps):
                                if total_steps >= total_timesteps:
                                    return self

                                eval_action, eval_q = self._policy(
                                    eval_obs,
                                    apply_noise=False,
                                    compute_q=True)
                                eval_obs, eval_r, eval_done, _ = self.eval_env.step(
                                    eval_action *
                                    np.abs(self.action_space.low))
                                if self.render_eval:
                                    self.eval_env.render()
                                eval_episode_reward += eval_r

                                eval_qs.append(eval_q)
                                if eval_done:
                                    if not isinstance(self.env, VecEnv):
                                        eval_obs = self.eval_env.reset()
                                    eval_episode_rewards.append(
                                        eval_episode_reward)
                                    eval_episode_rewards_history.append(
                                        eval_episode_reward)
                                    eval_episode_reward = 0.

                    mpi_size = MPI.COMM_WORLD.Get_size()
                    # Log stats.
                    # XXX shouldn't call np.mean on variable length lists
                    duration = time.time() - start_time
                    stats = self._get_stats()
                    combined_stats = stats.copy()
                    combined_stats['rollout/return'] = np.mean(
                        epoch_episode_rewards)
                    combined_stats['rollout/return_history'] = np.mean(
                        episode_rewards_history)
                    combined_stats['rollout/episode_steps'] = np.mean(
                        epoch_episode_steps)
                    combined_stats['rollout/actions_mean'] = np.mean(
                        epoch_actions)
                    combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
                    combined_stats['train/loss_actor'] = np.mean(
                        epoch_actor_losses)
                    combined_stats['train/loss_critic'] = np.mean(
                        epoch_critic_losses)
                    if len(epoch_adaptive_distances) != 0:
                        combined_stats['train/param_noise_distance'] = np.mean(
                            epoch_adaptive_distances)
                    combined_stats['total/duration'] = duration
                    combined_stats['total/steps_per_second'] = float(
                        step) / float(duration)
                    combined_stats['total/episodes'] = episodes
                    combined_stats['rollout/episodes'] = epoch_episodes
                    combined_stats['rollout/actions_std'] = np.std(
                        epoch_actions)
                    # Evaluation statistics.
                    if self.eval_env is not None:
                        combined_stats['eval/return'] = np.mean(
                            eval_episode_rewards)
                        combined_stats['eval/return_history'] = np.mean(
                            eval_episode_rewards_history)
                        combined_stats['eval/Q'] = np.mean(eval_qs)
                        combined_stats['eval/episodes'] = len(
                            eval_episode_rewards)

                    def as_scalar(scalar):
                        """
                        check and return the input if it is a scalar, otherwise raise ValueError

                        :param scalar: (Any) the object to check
                        :return: (Number) the scalar if x is a scalar
                        """
                        if isinstance(scalar, np.ndarray):
                            assert scalar.size == 1
                            return scalar[0]
                        elif np.isscalar(scalar):
                            return scalar
                        else:
                            raise ValueError('expected scalar, got %s' %
                                             scalar)

                    combined_stats_sums = MPI.COMM_WORLD.allreduce(
                        np.array(
                            [as_scalar(x) for x in combined_stats.values()]))
                    combined_stats = {
                        k: v / mpi_size
                        for (k, v) in zip(combined_stats.keys(),
                                          combined_stats_sums)
                    }

                    # Total statistics.
                    combined_stats['total/epochs'] = epoch + 1
                    combined_stats['total/steps'] = step

                    for key in sorted(combined_stats.keys()):
                        logger.record_tabular(key, combined_stats[key])
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.dump_tabular()
                    logger.info('')
                    logdir = logger.get_dir()
                    if rank == 0 and logdir:
                        if hasattr(self.env, 'get_state'):
                            with open(os.path.join(logdir, 'env_state.pkl'),
                                      'wb') as file_handler:
                                pickle.dump(self.env.get_state(), file_handler)
                        if self.eval_env and hasattr(self.eval_env,
                                                     'get_state'):
                            with open(
                                    os.path.join(logdir, 'eval_env_state.pkl'),
                                    'wb') as file_handler:
                                pickle.dump(self.eval_env.get_state(),
                                            file_handler)
Example #26
def main():

    # requires  n_comp_to_use, pc1_chunk_size
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''

    logger.log("grab final params")
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    all_param_iterator = get_allinone_concat_df(
        dir_name=traj_params_dir_name,
        use_IPCA=True,
        chunk_size=cma_args.pc1_chunk_size)
    all_grads_iterator = get_allinone_concat_df(
        dir_name=traj_params_dir_name,
        use_IPCA=True,
        chunk_size=cma_args.pc1_chunk_size,
        index="grads")

    angles_with_pc1_along_the_way = []
    grad_vs_final_min_current_param = []
    ipca = IncrementalPCA(1)  # incremental PCA: fit in chunks to bound memory
    for chunk in all_param_iterator:

        logger.log(f"currently at {all_param_iterator._currow}")

        target_direction = final_params - chunk.values[-1]

        ipca.partial_fit(chunk.values)
        angle_with_pc1 = cal_angle(target_direction, ipca.components_[0])

        angles_with_pc1_along_the_way.append(angle_with_pc1)

        grads = all_grads_iterator.__next__().values
        for i, grad in enumerate(grads):

            grad_angle = cal_angle(grad, final_params - chunk.values[i])
            grad_vs_final_min_current_param.append(grad_angle)

    plot_dir = get_plot_dir(cma_args)
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    angles_plot_name = f"final - current VS so far pc1" \
                       f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
    plot_2d(plot_dir, angles_plot_name,
            np.arange(len(angles_with_pc1_along_the_way)),
            angles_with_pc1_along_the_way, "num of chunks",
            "angle with diff in degrees", False)
    grad_vs_current_plot_name = f"##final - current param VS current grad" \
                       f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
    plot_2d(plot_dir, grad_vs_current_plot_name,
            np.arange(len(grad_vs_final_min_current_param)),
            grad_vs_final_min_current_param, "num of chunks",
            "angle with diff in degrees", False)
Example #27
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="TRPO",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            with self.sess.as_default():
                seg_gen = traj_segment_generator(
                    self.policy_pi,
                    self.env,
                    self.timesteps_per_batch,
                    reward_giver=self.reward_giver,
                    gail=self.using_gail)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()
                len_buffer = deque(
                    maxlen=40)  # rolling buffer for episode lengths
                reward_buffer = deque(
                    maxlen=40)  # rolling buffer for episode rewards
                self.episode_reward = np.zeros((self.n_envs, ))

                true_reward_buffer = None
                if self.using_gail:
                    true_reward_buffer = deque(maxlen=40)

                    # Initialize dataloader
                    batchsize = self.timesteps_per_batch // self.d_step
                    self.expert_dataset.init_dataloader(batchsize)

                    #  Stats not used for now
                    # TODO: replace with normal tb logging
                    #  g_loss_stats = Stats(loss_names)
                    #  d_loss_stats = Stats(reward_giver.loss_name)
                    #  ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"])

                while True:
                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    def fisher_vector_product(vec):
                        return self.allmean(
                            self.compute_fvp(
                                vec, *fvpargs,
                                sess=self.sess)) + self.cg_damping * vec

                    # ------------------ Update G ------------------
                    logger.log("Optimizing Policy...")
                    # g_step = 1 when not using GAIL
                    mean_losses = None
                    vpredbefore = None
                    tdlamret = None
                    observation = None
                    action = None
                    seg = None
                    for k in range(self.g_step):
                        with self.timed("sampling"):
                            seg = seg_gen.__next__()
                        add_vtarg_and_adv(seg, self.gamma, self.lam)
                        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                        observation, action = seg["ob"], seg["ac"]
                        atarg, tdlamret = seg["adv"], seg["tdlamret"]
                        vpredbefore = seg["vpred"]  # predicted value function before update
                        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage estimate

                        # true_rew is the reward without discount
                        if writer is not None:
                            self.episode_reward = total_episode_reward_logger(
                                self.episode_reward,
                                seg["true_rew"].reshape((self.n_envs, -1)),
                                seg["dones"].reshape((self.n_envs, -1)),
                                writer, self.num_timesteps)

                        args = seg["ob"], seg["ob"], seg["ac"], atarg
                        fvpargs = [arr[::5] for arr in args]

                        self.assign_old_eq_new(sess=self.sess)

                        with self.timed("computegrad"):
                            steps = self.num_timesteps + (k + 1) * (
                                seg["total_timestep"] / self.g_step)
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata() if self.full_tensorboard_log else None
                            # run loss backprop with summary, and save the metadata (memory, compute time, ...)
                            if writer is not None:
                                summary, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)
                                if self.full_tensorboard_log:
                                    writer.add_run_metadata(
                                        run_metadata, 'step%d' % steps)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)

                        lossbefore = self.allmean(np.array(lossbefore))
                        grad = self.allmean(grad)
                        if np.allclose(grad, 0):
                            logger.log("Got zero gradient. not updating")
                        else:
                            with self.timed("conjugate_gradient"):
                                stepdir = conjugate_gradient(
                                    fisher_vector_product,
                                    grad,
                                    cg_iters=self.cg_iters,
                                    verbose=self.rank == 0
                                    and self.verbose >= 1)
                            assert np.isfinite(stepdir).all()
                            shs = .5 * stepdir.dot(
                                fisher_vector_product(stepdir))
                            # abs(shs) to avoid taking square root of negative values
                            lagrange_multiplier = np.sqrt(
                                abs(shs) / self.max_kl)
                            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                            fullstep = stepdir / lagrange_multiplier
                            expectedimprove = grad.dot(fullstep)
                            surrbefore = lossbefore[0]
                            stepsize = 1.0
                            thbefore = self.get_flat()
                            thnew = None
                            for _ in range(10):
                                thnew = thbefore + fullstep * stepsize
                                self.set_from_flat(thnew)
                                mean_losses = surr, kl_loss, *_ = self.allmean(
                                    np.array(self.compute_losses(*args, sess=self.sess)))
                                improve = surr - surrbefore
                                logger.log("Expected: %.3f Actual: %.3f" %
                                           (expectedimprove, improve))
                                if not np.isfinite(mean_losses).all():
                                    logger.log(
                                        "Got non-finite value of losses -- bad!"
                                    )
                                elif kl_loss > self.max_kl * 1.5:
                                    logger.log(
                                        "violated KL constraint. shrinking step."
                                    )
                                elif improve < 0:
                                    logger.log(
                                        "surrogate didn't improve. shrinking step."
                                    )
                                else:
                                    logger.log("Stepsize OK!")
                                    break
                                stepsize *= .5
                            else:
                                logger.log("couldn't compute a good step")
                                self.set_from_flat(thbefore)
                            if self.nworkers > 1 and iters_so_far % 20 == 0:
                                # list of tuples
                                paramsums = MPI.COMM_WORLD.allgather(
                                    (thnew.sum(), self.vfadam.getflat().sum()))
                                assert all(
                                    np.allclose(ps, paramsums[0])
                                    for ps in paramsums[1:])

                        with self.timed("vf"):
                            for _ in range(self.vf_iters):
                                # NOTE: for recurrent policies, use shuffle=False?
                                for (mbob, mbret) in dataset.iterbatches(
                                    (seg["ob"], seg["tdlamret"]),
                                        include_final_partial_batch=False,
                                        batch_size=128,
                                        shuffle=True):
                                    grad = self.allmean(
                                        self.compute_vflossandgrad(
                                            mbob, mbob, mbret, sess=self.sess))
                                    self.vfadam.update(grad, self.vf_stepsize)

                    for (loss_name, loss_val) in zip(self.loss_names,
                                                     mean_losses):
                        logger.record_tabular(loss_name, loss_val)

                    logger.record_tabular(
                        "explained_variance_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    if self.using_gail:
                        # ------------------ Update D ------------------
                        logger.log("Optimizing Discriminator...")
                        logger.log(fmt_row(13, self.reward_giver.loss_name))
                        assert len(observation) == self.timesteps_per_batch
                        batch_size = self.timesteps_per_batch // self.d_step

                        # NOTE: uses only the last g step for observation
                        d_losses = []  # list of tuples, each giving the loss for a minibatch
                        # NOTE: for recurrent policies, use shuffle=False?
                        for ob_batch, ac_batch in dataset.iterbatches(
                            (observation, action),
                                include_final_partial_batch=False,
                                batch_size=batch_size,
                                shuffle=True):
                            ob_expert, ac_expert = self.expert_dataset.get_next_batch()
                            # update running mean/std for reward_giver
                            if self.reward_giver.normalize:
                                self.reward_giver.obs_rms.update(
                                    np.concatenate((ob_batch, ob_expert), 0))

                            # Reshape actions if needed when using discrete actions
                            if isinstance(self.action_space,
                                          gym.spaces.Discrete):
                                if len(ac_batch.shape) == 2:
                                    ac_batch = ac_batch[:, 0]
                                if len(ac_expert.shape) == 2:
                                    ac_expert = ac_expert[:, 0]
                            *newlosses, grad = self.reward_giver.lossandgrad(
                                ob_batch, ac_batch, ob_expert, ac_expert)
                            self.d_adam.update(self.allmean(grad),
                                               self.d_stepsize)
                            d_losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                        # lr: lengths and rewards
                        lr_local = (seg["ep_lens"], seg["ep_rets"],
                                    seg["ep_true_rets"])  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(
                            lr_local)  # list of tuples
                        lens, rews, true_rets = map(flatten_lists,
                                                    zip(*list_lr_pairs))
                        true_reward_buffer.extend(true_rets)
                    else:
                        # lr: lengths and rewards
                        lr_local = (seg["ep_lens"], seg["ep_rets"]
                                    )  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(
                            lr_local)  # list of tuples
                        lens, rews = map(flatten_lists, zip(*list_lr_pairs))
                    len_buffer.extend(lens)
                    reward_buffer.extend(rews)

                    if len(len_buffer) > 0:
                        logger.record_tabular("EpLenMean", np.mean(len_buffer))
                        logger.record_tabular("EpRewMean",
                                              np.mean(reward_buffer))
                    if self.using_gail:
                        logger.record_tabular("EpTrueRewMean",
                                              np.mean(true_reward_buffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps
                    iters_so_far += 1

                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)

                    if self.verbose >= 1 and self.rank == 0:
                        logger.dump_tabular()

        return self
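The policy step above solves the linearized constraint problem F x = g through `conjugate_gradient`, using the `fisher_vector_product` closure so the Fisher matrix never has to be formed explicitly. A minimal sketch of a solver of that shape (the stable-baselines version adds verbosity and logging; this is an illustration, not the library code):

import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # Solve A x = b using only matrix-vector products f_Ax(p) = A p.
    x = np.zeros_like(b)
    r = b.copy()   # residual b - A x (x starts at zero)
    p = r.copy()   # current search direction
    r_dot_r = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = r_dot_r / p.dot(Ap)   # step size along p
        x += alpha * p
        r -= alpha * Ap
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < residual_tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p   # next A-conjugate direction
        r_dot_r = new_r_dot_r
    return x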
Example #28
0
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    origin_name = "final_param"

    this_run_dir = get_dir_path_for_this_run(cma_args)
    plot_dir_alg = get_plot_dir(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir,
                                                      params_scope="pi")
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    start_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_start")
    start_params = pd.read_csv(start_file, header=None).values[0]
    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''
    pca_indexes = cma_args.other_pca_index
    pca_indexes = [int(pca_index) for pca_index in pca_indexes.split(":")]

    n_comp_to_project_on = pca_indexes
    result = do_pca(n_components=cma_args.n_components,
                    traj_params_dir_name=traj_params_dir_name,
                    intermediate_data_dir=intermediate_data_dir,
                    use_IPCA=cma_args.use_IPCA,
                    chunk_size=cma_args.chunk_size,
                    reuse=True)
    logger.debug("after pca")

    if origin_name == "final_param":
        origin_param = result["final_params"]
    elif origin_name == "start_param":
        origin_param = start_params
    else:
        origin_param = result["mean_param"]

    proj_coords = project(result["pcs_components"],
                          pcs_slice=n_comp_to_project_on,
                          origin_name=origin_name,
                          origin_param=origin_param,
                          IPCA_chunk_size=cma_args.chunk_size,
                          traj_params_dir_name=traj_params_dir_name,
                          intermediate_data_dir=intermediate_data_dir,
                          n_components=cma_args.n_components,
                          reuse=True)
    '''
    ==========================================================================================
    eval all xy coords
    ==========================================================================================
    '''
    other_pcs_plot_dir = get_other_pcs_plane_plot_dir(plot_dir_alg,
                                                      pca_indexes)

    if not os.path.exists(other_pcs_plot_dir):
        os.makedirs(other_pcs_plot_dir)

    plot_3d_trajectory_path_only(
        other_pcs_plot_dir,
        f"{pca_indexes}_final_origin_3d_path_plot",
        proj_coords,
        explained_ratio=result["explained_variance_ratio"][pca_indexes])
Example #29
0
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    # origin = "final_param"
    origin = cma_args.origin

    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)

    cma_run_num, cma_intermediate_data_dir = generate_run_dir(
        get_cma_returns_dirname,
        intermediate_dir=intermediate_data_dir,
        n_comp=cma_args.n_comp_to_use)
    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''
    proj_or_not = (cma_args.n_comp_to_use == 2)
    result = do_pca(cma_args.n_components,
                    cma_args.n_comp_to_use,
                    traj_params_dir_name,
                    intermediate_data_dir,
                    proj=proj_or_not,
                    origin=origin,
                    use_IPCA=cma_args.use_IPCA,
                    chunk_size=cma_args.chunk_size,
                    reuse=False)
    '''
    ==========================================================================================
    eval all xy coords
    ==========================================================================================
    '''


    from stable_baselines.low_dim_analysis.common import (
        plot_contour_trajectory, gen_subspace_coords, do_eval_returns,
        do_proj_on_first_n)

    if origin == "final_param":
        origin_param = result["final_concat_params"]
    else:
        origin_param = result["mean_param"]

    final_param = result["final_concat_params"]
    last_proj_coord = do_proj_on_first_n(final_param, result["first_n_pcs"],
                                         origin_param)
    starting_coord = last_proj_coord
    logger.log(f"CMA STASRTING CORRD: {starting_coord}")

    # starting_coord = (1/2*np.max(xcoordinates_to_eval), 1/2*np.max(ycoordinates_to_eval)) # use mean
    assert result["first_n_pcs"].shape[0] == cma_args.n_comp_to_use
    mean_rets, min_rets, max_rets, opt_path, opt_path_mean = do_cma(
        cma_args, result["first_n_pcs"], origin_param, save_dir,
        starting_coord, cma_args.cma_var)
    dump_rows_write_csv(cma_intermediate_data_dir, opt_path_mean,
                        "opt_mean_path")

    plot_dir = get_plot_dir(cma_args)
    cma_plot_dir = get_cma_plot_dir(plot_dir, cma_args.n_comp_to_use,
                                    cma_run_num, origin)
    if not os.path.exists(cma_plot_dir):
        os.makedirs(cma_plot_dir)

    ret_plot_name = f"cma return on {cma_args.n_comp_to_use}-dim pca subspace, " \
                    f"explained variance {np.sum(result['explained_variance_ratio'][:cma_args.n_comp_to_use])}"
    plot_cma_returns(cma_plot_dir,
                     ret_plot_name,
                     mean_rets,
                     min_rets,
                     max_rets,
                     show=False)

    if cma_args.n_comp_to_use == 2:
        proj_coords = result["proj_coords"]
        assert proj_coords.shape[1] == 2

        xcoordinates_to_eval, ycoordinates_to_eval = gen_subspace_coords(
            cma_args,
            np.vstack((proj_coords, opt_path_mean)).T)

        eval_returns = do_eval_returns(cma_args,
                                       intermediate_data_dir,
                                       result["first_n_pcs"],
                                       origin_param,
                                       xcoordinates_to_eval,
                                       ycoordinates_to_eval,
                                       save_dir,
                                       pca_center=origin,
                                       reuse=False)

        plot_contour_trajectory(cma_plot_dir,
                                f"{origin}_origin_eval_return_contour_plot",
                                xcoordinates_to_eval,
                                ycoordinates_to_eval,
                                eval_returns,
                                proj_coords[:, 0],
                                proj_coords[:, 1],
                                result["explained_variance_ratio"][:2],
                                num_levels=25,
                                show=False,
                                sub_alg_path=opt_path_mean)

    opt_mean_path_in_old_basis = [
        mean_projected_param.dot(result["first_n_pcs"]) + result["mean_param"]
        for mean_projected_param in opt_path_mean
    ]
    distance_to_final = [
        LA.norm(opt_mean - final_param, ord=2)
        for opt_mean in opt_mean_path_in_old_basis
    ]
    distance_to_final_plot_name = "distance_to_final over generations"
    plot_2d(cma_plot_dir, distance_to_final_plot_name,
            np.arange(len(distance_to_final)), distance_to_final,
            "num generation", "distance_to_final", False)
Example #30
0
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    origin = "mean_param"

    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)

    cma_run_num, cma_intermediate_data_dir = generate_run_dir(
        get_cma_returns_dirname,
        intermediate_dir=intermediate_data_dir,
        n_comp=cma_args.n_comp_to_use)
    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''

    logger.log("grab final params")
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "final")
    final_param = pd.read_csv(final_file, header=None).values[0]

    final_pca = IncrementalPCA(n_components=2)  # incremental PCA: fit the trajectory chunk by chunk

    theta_file = get_full_param_traj_file_path(traj_params_dir_name, 0)
    concat_df = pd.read_csv(theta_file, header=None, chunksize=10000)

    tic = time.time()
    for chunk in concat_df:
        logger.log(f"currnet at : {concat_df._currow}")

        if chunk.shape[0] < 2:
            logger.log(f"last column too few: {chunk.shape[0]}")
            continue
        final_pca.partial_fit(chunk.values)

    toc = time.time()
    logger.log('\nElapsed time computing the chunked PCA: {:.2f} s\n'.format(toc - tic))

    logger.log(final_pca.explained_variance_ratio_)

    pcs_components = final_pca.components_

    first_2_pcs = pcs_components[:2]
    mean_param = final_pca.mean_

    origin_param = mean_param

    theta_file = get_full_param_traj_file_path(traj_params_dir_name, 0)
    concat_df = pd.read_csv(theta_file, header=None, chunksize=10000)

    proj_coords = do_proj_on_first_n_IPCA(concat_df, first_2_pcs, origin_param)
    '''
    ==========================================================================================
    eval all xy coords
    ==========================================================================================
    '''


    from stable_baselines.low_dim_analysis.common import (
        plot_contour_trajectory, gen_subspace_coords, do_eval_returns,
        get_allinone_concat_df, do_proj_on_first_n)

    from stable_baselines.ppo2.run_mujoco import eval_return

    last_proj_coord = do_proj_on_first_n(final_param, first_2_pcs,
                                         origin_param)
    starting_coord = last_proj_coord

    tic = time.time()

    # TODO: better starting locations; record how many samples are used

    logger.log(f"CMAES STARTING :{starting_coord}")
    es = cma.CMAEvolutionStrategy(starting_coord, 5)
    total_num_of_evals = 0
    total_num_timesteps = 0

    mean_rets = []
    min_rets = []
    max_rets = []
    eval_returns = None

    optimization_path = []
    while total_num_timesteps < cma_args.cma_num_timesteps and not es.stop():
        solutions = es.ask()
        optimization_path.extend(solutions)
        thetas = [
            np.matmul(coord, first_2_pcs) + origin_param for coord in solutions
        ]
        logger.log(
            f"timesteps so far: {total_num_timesteps}, budget: {cma_args.cma_num_timesteps}"
        )
        eval_returns = Parallel(n_jobs=cma_args.cores_to_use)(
            delayed(eval_return)(cma_args, save_dir, theta,
                                 cma_args.eval_num_timesteps, i)
            for (i, theta) in enumerate(thetas))

        mean_rets.append(np.mean(eval_returns))
        min_rets.append(np.min(eval_returns))
        max_rets.append(np.max(eval_returns))

        total_num_of_evals += len(eval_returns)
        total_num_timesteps += cma_args.eval_num_timesteps * len(eval_returns)

        logger.log(f"current eval returns: {str(eval_returns)}")
        logger.log(f"total timesteps so far: {total_num_timesteps}")
        negative_eval_returns = [-r for r in eval_returns]

        es.tell(solutions, negative_eval_returns)
        es.logger.add()  # write data to disc to be plotted
        es.disp()

    toc = time.time()
    logger.log(
        f"#################################### CMA took {toc - tic} seconds")

    es_logger = es.logger

    if not hasattr(es_logger, 'xmean'):
        es_logger.load()

    n_comp_used = first_2_pcs.shape[0]
    optimization_path_mean = np.vstack(
        (starting_coord, es_logger.xmean[:, 5:5 + n_comp_used]))

    dump_rows_write_csv(cma_intermediate_data_dir, optimization_path_mean,
                        "opt_mean_path")

    plot_dir = get_plot_dir(cma_args)
    cma_plot_dir = get_cma_plot_dir(plot_dir,
                                    cma_args.n_comp_to_use,
                                    cma_run_num,
                                    origin=origin)
    if not os.path.exists(cma_plot_dir):
        os.makedirs(cma_plot_dir)

    ret_plot_name = f"cma return on {cma_args.n_comp_to_use}-dim pca subspace, " \
                    f"explained variance {np.sum(final_pca.explained_variance_ratio_[:2])}"
    plot_cma_returns(cma_plot_dir,
                     ret_plot_name,
                     mean_rets,
                     min_rets,
                     max_rets,
                     show=False)

    assert proj_coords.shape[1] == 2

    xcoordinates_to_eval, ycoordinates_to_eval = gen_subspace_coords(
        cma_args,
        np.vstack((proj_coords, optimization_path_mean)).T)

    thetas_to_eval = [
        origin_param + x * first_2_pcs[0] + y * first_2_pcs[1]
        for y in ycoordinates_to_eval for x in xcoordinates_to_eval
    ]

    tic = time.time()

    eval_returns = Parallel(n_jobs=-1, max_nbytes='100M')(
        delayed(eval_return)(cma_args, save_dir, theta,
                             cma_args.eval_num_timesteps, i)
        for (i, theta) in enumerate(thetas_to_eval))
    toc = time.time()
    logger.log(
        f"#################################### 1st version took {toc - tic} seconds"
    )

    plot_contour_trajectory(
        cma_plot_dir,
        f"cma redo___{origin}_origin_eval_return_contour_plot",
        xcoordinates_to_eval,
        ycoordinates_to_eval,
        eval_returns,
        proj_coords[:, 0],
        proj_coords[:, 1],
        final_pca.explained_variance_ratio_,
        num_levels=25,
        show=False,
        sub_alg_path=optimization_path_mean.T)

    opt_mean_path_in_old_basis = [
        mean_projected_param.dot(first_2_pcs) + mean_param
        for mean_projected_param in optimization_path_mean
    ]
    distance_to_final = [
        LA.norm(opt_mean - final_param, ord=2)
        for opt_mean in opt_mean_path_in_old_basis
    ]
    distance_to_final_plot_name = "cma redo distance_to_final over generations"
    plot_2d(cma_plot_dir, distance_to_final_plot_name,
            np.arange(len(distance_to_final)), distance_to_final,
            "num generation", "distance_to_final", False)