def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir):
    """
    Continues PPO2 training from the flat parameter vector found by the
    preceding CMA stage.
    """

    logger.log(f"#######CMA and then PPO TRAIN: {args}")

    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)

    import shutil
    if os.path.exists(full_param_traj_dir_path):
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(conti_ppo_save_dir):
        shutil.rmtree(conti_ppo_save_dir)
    os.makedirs(conti_ppo_save_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out
    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

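    # Restore the PPO2 model trained in the full parameter space, then
    # overwrite its weights with the flat parameter vector handed over by
    # the preceding CMA stage.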
    model = PPO2.load(f"{full_space_save_dir}/ppo2")
    model.set_from_flat(start_theta)

    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)

    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    model.tell_run_info(run_info)
    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)

    model.save(f"{conti_ppo_save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(conti_ppo_save_dir)
    return episode_returns, full_param_traj_dir_path
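
# A minimal usage sketch for do_ppo (not part of the original source): the
# `args` attributes mirror the ones the function reads, while
# `cma_best_theta` and the directory variables are hypothetical stand-ins
# for the outputs of a preceding CMA stage.
#
#   args.env = "DartWalker2d-v1"
#   args.normalize = True
#   args.run_num = 0
#   args.ppo_num_timesteps = 1000000
#   episode_returns, traj_dir = do_ppo(
#       args,
#       start_theta=cma_best_theta,          # flat params found by CMA
#       parent_this_run_dir=cma_run_dir,     # run dir of the CMA stage
#       full_space_save_dir=full_space_dir)  # dir holding the saved ppo2 model
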
def train(args):
    """
    Runs the test
    """
    args, argv = mujoco_arg_parser().parse_known_args(args)
    logger.log(f"#######TRAIN: {args}")
    args.alg = "ppo2"

    this_run_dir = get_dir_path_for_this_run(args)
    import shutil
    if os.path.exists(this_run_dir):
        shutil.rmtree(this_run_dir)
    os.makedirs(this_run_dir)

    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    logger.configure(log_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env.envs[0].env.env.disableViewer = True
    set_global_seeds(args.seed)
    env.envs[0].env.env.seed(args.seed)

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # extra run info I added for my purposes

    full_param_traj_dir_path = get_full_params_dir(this_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path,
        "state_samples_to_collect": args.state_samples_to_collect
    }

    model = PPO2(policy=policy,
                 env=env,
                 n_steps=args.n_steps,
                 nminibatches=args.nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2,
                 optimizer=args.optimizer,
                 seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
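
# A minimal usage sketch for train() (not part of the original source). The
# argv-style list is forwarded to mujoco_arg_parser via parse_known_args, so
# the exact flag spellings below are assumptions about that parser.
#
#   train(["--env", "DartWalker2d-v1",
#          "--seed", "0",
#          "--num_timesteps", "1000000"])
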
def visualize_augment_experiment(augment_num_timesteps,
                                 top_num_to_include_slice,
                                 augment_seed,
                                 augment_run_num,
                                 network_size,
                                 policy_env,
                                 policy_num_timesteps,
                                 policy_run_num,
                                 policy_seed,
                                 eval_seed,
                                 eval_run_num,
                                 learning_rate,
                                 additional_note,
                                 result_dir,
                                 lagrangian_inds_to_include=None):

    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")
    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError(f"unsupported policy_env: {policy_env}")

    this_run_dir = get_experiment_path_for_this_run(
        entry_point,
        args.num_timesteps,
        args.run_num,
        args.seed,
        learning_rate=learning_rate,
        top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir,
        network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    # note this is only linear
    if lagrangian_inds_to_include is None:
        linear_top_vars_list = read_linear_top_var(policy_env,
                                                   policy_num_timesteps,
                                                   policy_run_num, policy_seed,
                                                   eval_seed, eval_run_num,
                                                   additional_note)

        # keys_to_include = ["COM", "M", "Coriolis", "total_contact_forces_contact_bodynode",
        #                    "com_jacobian", "contact_bodynode_jacobian"]
        keys_to_include = ["COM", "M", "Coriolis", "com_jacobian"]
        # lagrangian_inds_to_include = linear_top_vars_list[top_num_to_include_slice]
        lagrangian_inds_to_include = get_wanted_lagrangians(
            keys_to_include, linear_top_vars_list, top_num_to_include_slice)

    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)

    args.env = f'{experiment_label}_{entry_point}-v1'
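    # Each run registers its own env id (the label embeds a timestamp), so
    # gym.make below builds this run's augmented-input variant of the env.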
    register(id=args.env,
             entry_point=entry_point,
             max_episode_steps=1000,
             kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include})

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = True

    if args.normalize:
        env = VecNormalize(env)
    policy = MlpPolicy

    # extra run info I added for my purposes
    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path
    }

    layers = [network_size, network_size]
    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=4096,
                 nminibatches=64,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=learning_rate,
                 cliprange=0.2,
                 optimizer='adam',
                 policy_kwargs=policy_kwargs,
                 seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)

    return log_dir
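
# A minimal usage sketch for visualize_augment_experiment (not part of the
# original source); every literal below is illustrative, and the policy/eval
# runs it reads from disk are assumed to exist already.
#
#   log_dir = visualize_augment_experiment(
#       augment_num_timesteps=1000000,
#       top_num_to_include_slice=slice(0, 10),
#       augment_seed=0, augment_run_num=0,
#       network_size=64,
#       policy_env="DartWalker2d-v1",
#       policy_num_timesteps=2000000, policy_run_num=0, policy_seed=0,
#       eval_seed=4, eval_run_num=4,
#       learning_rate=3e-4,
#       additional_note="",
#       result_dir="/tmp/results")
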
def run_experiment_with_trained(augment_num_timesteps, linear_co_threshold, augment_seed, augment_run_num, network_size,
                                policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate,
                                additional_note, result_dir, keys_to_include, metric_param, linear_top_vars_list=None,
                                linear_correlation_neuron_list=None, visualize=False, lagrangian_inds_to_include=None,
                                neurons_inds_to_include=None, use_lagrangian=True):
    trained_model = None
    if not use_lagrangian:
        with tf.variable_scope("trained_model"):
            common_arg_parser = get_common_parser()
            trained_args, cma_unknown_args = common_arg_parser.parse_known_args()
            trained_args.env = policy_env
            trained_args.seed = policy_seed
            trained_args.num_timesteps = policy_num_timesteps
            trained_args.run_num = policy_run_num
            trained_this_run_dir = get_dir_path_for_this_run(trained_args)
            trained_traj_params_dir_name = get_full_params_dir(trained_this_run_dir)
            trained_save_dir = get_save_dir(trained_this_run_dir)

            trained_final_file = get_full_param_traj_file_path(trained_traj_params_dir_name, "pi_final")
            trained_final_params = pd.read_csv(trained_final_file, header=None).values[0]

            trained_model = PPO2.load(f"{trained_save_dir}/ppo2", seed=augment_seed)
            trained_model.set_pi_from_flat(trained_final_params)

    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")
    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{linear_co_threshold.start}_{linear_co_threshold.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError(f"unsupported policy_env: {policy_env}")

    this_run_dir = get_experiment_path_for_this_run(entry_point, args.num_timesteps, args.run_num,
                                                    args.seed, learning_rate=learning_rate, top_num_to_include=linear_co_threshold,
                                                    result_dir=result_dir, network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    linear_top_vars_list_wanted_to_print = []
    if (use_lagrangian and lagrangian_inds_to_include is None) \
            or (not use_lagrangian and neurons_inds_to_include is None):
        # note this is only linear
        if linear_top_vars_list is None or linear_correlation_neuron_list is None:
            linear_top_vars_list, linear_correlation_neuron_list = read_linear_top_var(
                policy_env, policy_num_timesteps, policy_run_num, policy_seed,
                eval_seed, eval_run_num, additional_note, metric_param=metric_param)

        lagrangian_inds_to_include, neurons_inds_to_include, linear_top_vars_list_wanted_to_print = \
            get_wanted_lagrangians_and_neurons(keys_to_include, linear_top_vars_list,
                                               linear_correlation_neuron_list, linear_co_threshold)

    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)
    with open(f"{log_dir}/linear_top_vars_list_wanted_to_print.json", 'w') as fp:
        json.dump(linear_top_vars_list_wanted_to_print, fp)
    with open(f"{log_dir}/neurons_inds_to_include.json", 'w') as fp:
        json.dump(neurons_inds_to_include, fp)

    args.env = f'{experiment_label}_{entry_point}-v1'

    if not use_lagrangian:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": None, "trained_model": trained_model,
                    "neurons_inds_to_include": neurons_inds_to_include}
        )
    else:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include, "trained_model": None,
                    "neurons_inds_to_include": None}
        )

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = visualize
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = not visualize

    if args.normalize:
        env = VecNormalize(env)
    policy = MlpPolicy

    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    num_dof = walker_env.robot_skeleton.ndofs
    show_M_matrix(num_dof, lagrangian_inds_to_include, linear_co_threshold, log_dir)

    # extra run info I added for my purposes
    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    layers = [network_size, network_size]
    policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64,
                 lam=0.95, gamma=0.99, noptepochs=10, ent_coef=0.0,
                 learning_rate=learning_rate, cliprange=0.2, optimizer='adam',
                 policy_kwargs=policy_kwargs, seed=args.seed)
    model.tell_run_info(run_info)
    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)

    return log_dir
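
# A minimal usage sketch for run_experiment_with_trained (not part of the
# original source), shown for the lagrangian path (use_lagrangian=True);
# all literals are illustrative, and metric_param is passed through to
# read_linear_top_var.
#
#   log_dir = run_experiment_with_trained(
#       augment_num_timesteps=1000000,
#       linear_co_threshold=slice(0, 10),
#       augment_seed=0, augment_run_num=0,
#       network_size=64,
#       policy_env="DartWalker2d-v1",
#       policy_num_timesteps=2000000, policy_run_num=0, policy_seed=0,
#       eval_seed=4, eval_run_num=4,
#       learning_rate=3e-4,
#       additional_note="",
#       result_dir="/tmp/results",
#       keys_to_include=["COM", "M", "Coriolis", "com_jacobian"],
#       metric_param=None,
#       use_lagrangian=True)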