Example #1
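    # Apply demo-derived normalization transforms to the BC agent and use the
    # action scale to initialize the policy variance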
    bc_agent.set_transformations(in_shift, in_scale, out_shift, out_scale)
    bc_agent.set_variance_with_data(out_scale)

    ts = timer.time()
    print("========================================")
    print("Running BC with expert demonstrations")
    print("========================================")
    bc_agent.train()
    print("========================================")
    print("BC training complete !!!")
    print("time taken = %f" % (timer.time() - ts))
    print("========================================")

    if job_data['eval_rollouts'] >= 1:
        score = e.evaluate_policy(policy,
                                  num_episodes=job_data['eval_rollouts'],
                                  mean_action=True)
        print("Score with behavior cloning = %f" % score[0][0])

if job_data['algorithm'] != 'DAPG':
    # We throw away the demo data when training from scratch or fine-tuning with RL without explicit augmentation
    demo_paths = None

# ===============================================================================
# RL Loop
# ===============================================================================

rl_agent = DAPG(e,
                policy,
                baseline,
                demo_paths,
Example #2
# Train BC
e = GymEnv('relocate-v0')
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
bc_agent = BC(demo_paths, policy=policy, epochs=5, batch_size=32, lr=1e-3)

ts = timer.time()
print("========================================")
print("Running BC with expert demonstrations")
print("========================================")
bc_agent.train()
print("========================================")
print("BC training complete !!!")
print("time taken = %f" % (timer.time() - ts))
print("========================================")

score = e.evaluate_policy(policy, num_episodes=10, mean_action=True)
print("Score with behavior cloning = %f" % score[0][0])

# ------------------------------
# Finetune with DAPG
print("========================================")
print("Finetuning with DAPG")
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=64,
                       epochs=2,
                       learn_rate=1e-3)
agent = DAPG(e,
             policy,
             baseline,
             demo_paths=demo_paths,
Example #3
def train(cfg, run_no, multiple_runs, seed):
    # ===============================================================================
    # Train Loop
    # ===============================================================================

    gpus_available = setup_gpus()
    env_name, job_name = parse_task(cfg)
    env = GymEnv(env_name, **cfg['env_kwargs'])
    policy = MLP(env.spec, hidden_sizes=tuple(cfg['policy_size']), seed=seed)
    baseline = MLPBaseline(env.spec,
                           reg_coef=1e-3,
                           batch_size=cfg['value_function']['batch_size'],
                           epochs=cfg['value_function']['epochs'],
                           learn_rate=cfg['value_function']['lr'],
                           use_gpu=False)

    # Get demonstration data if necessary and behavior clone
    print("========================================")
    print("Collecting expert demonstrations")
    print("========================================")
    demo_filename = cfg['demo_file']
    if demo_filename is not None:
        demo_paths = pickle.load(open(demo_filename, 'rb'))
    else:
        demo_paths = None

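    # Optionally load a separate demo file just for BC; otherwise reuse the
    # RL demos, and truncate to num_demo trajectories if configured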
    if 'demo_file' in cfg['BC'] and cfg['BC']['demo_file'] != 'default':
        bc_demo_file_path = cfg['BC']['demo_file']
        if cfg['train']['use_timestamp']:
            bc_demo_file_path = bc_demo_file_path.replace(
                'v0', 'v0_timestamp_inserted')
        bc_demo_paths = pickle.load(open(bc_demo_file_path, 'rb'))
    else:
        bc_demo_paths = demo_paths
    if 'num_demo' in cfg and cfg['num_demo']:
        demo_paths = demo_paths[:cfg['num_demo']]
    if (cfg['algorithm'] == 'DAPG_based_IRL'
            and cfg['based_IRL'].get('get_paths_for_initialisation')):
        bc_demo_paths = add_dumped_paths_for_BC(demo_paths, cfg)

    ts = timer.time()
    if bc_demo_paths is not None and cfg['BC']['epochs'] > 0:
        print("========================================")
        print("Running BC with expert demonstrations")
        print("========================================")
        bc_agent = BC(bc_demo_paths[:25],
                      policy=policy,
                      epochs=cfg['BC']['epochs'],
                      batch_size=cfg['BC']['batch_size'],
                      lr=cfg['BC']['lr'],
                      loss_type='MSE',
                      set_transforms=True)

        bc_agent.train()
        print("========================================")
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - ts))
        print("========================================")

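    # Build the learned (IRL) reward model; when seeding DAPG from a previous
    # IRL job, load that job's config and checkpointed model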
    if cfg['algorithm'] == 'IRL' or cfg['algorithm'] == 'DAPG_based_IRL':
        IRL_cfg = cfg
        if cfg['algorithm'] == 'DAPG_based_IRL':
            IRL_job_cfg_path = os.path.join("Runs",
                                            cfg['based_IRL']['IRL_job'],
                                            "config.yaml")
            IRL_cfg = yamlreader.yaml_load(IRL_job_cfg_path)

        irl_model = get_irl_model(env, demo_paths, IRL_cfg, seed)
        if cfg['algorithm'] == 'DAPG_based_IRL':
            full_irl_model_checkpoint_path = os.path.join(
                'Runs', cfg['based_IRL']['IRL_job'])
            if cfg['based_IRL']['IRL_run_no'] is not None:
                full_irl_model_checkpoint_path = os.path.join(
                    full_irl_model_checkpoint_path,
                    'run_' + str(cfg['based_IRL']['IRL_run_no']))
            if cfg['based_IRL']['IRL_step'] is not None:
                irl_model.load_iteration(
                    path=full_irl_model_checkpoint_path,
                    iteration=cfg['based_IRL']['IRL_step'])
            else:
                irl_model.load_last(path=full_irl_model_checkpoint_path)
            # evaluate once so the model is fully loaded from the given path
            # before switching to a different path during training
            irl_model.eval(demo_paths)

    if cfg['eval_rollouts'] > 0:
        score = env.evaluate_policy(policy,
                                    num_episodes=cfg['eval_rollouts'],
                                    mean_action=True)
        print("Score with behavior cloning = %f" % score[0][0])

    if not cfg['use_DAPG']:
        # We throw away the demo data when training from scratch or fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================

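    # Build the RL agent: IRL variants wrap a policy-gradient generator
    # (DAPG or PPO) via irl_training_class; otherwise use DAPG or PPO directly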
    irl_kwargs = None
    if cfg['algorithm'] == 'IRL' or cfg['algorithm'] == 'DAPG_based_IRL':
        if (cfg['algorithm'] == 'DAPG_based_IRL'
                or cfg['IRL']['generator_alg'] == 'DAPG'):
            generator_algorithm = DAPG
            generator_args = dict(
                demo_paths=demo_paths,
                normalized_step_size=cfg['RL']['step_size'],
                seed=seed,
                lam_0=cfg['RL']['lam_0'],
                lam_1=cfg['RL']['lam_1'],
                save_logs=cfg['save_logs'],
                augmentation=cfg['train']['augmentation'],
                entropy_weight=cfg['train']['entropy_weight'])
        elif cfg['IRL']['generator_alg'] == 'PPO':
            generator_algorithm = PPO
            generator_args = dict(
                demo_paths=demo_paths,
                epochs=cfg['PPO']['epochs'],
                mb_size=cfg['PPO']['batch_size'],
                target_kl_dist=cfg['PPO']['target_kl_dist'],
                seed=seed,
                lam_0=cfg['RL']['lam_0'],
                lam_1=cfg['RL']['lam_1'],
                save_logs=cfg['save_logs'],
                clip_coef=cfg['PPO']['clip_coef'],
                learn_rate=cfg['PPO']['lr'],
                augmentation=cfg['train']['augmentation'],
                entropy_weight=cfg['train']['entropy_weight'])
        else:
            raise ValueError("Generator algorithm name",
                             cfg['IRL']['generator_alg'], "not supported")
        irl_class = irl_training_class(generator_algorithm)
        rl_agent = irl_class(
            env,
            policy,
            baseline,
            train_irl=cfg['algorithm'] != 'DAPG_based_IRL',
            discr_lr=IRL_cfg['IRL']['discr']['lr'],
            irl_batch_size=IRL_cfg['IRL']['discr']['batch_size'],
            lower_lr_on_main_loop_percentage=IRL_cfg['IRL']['discr'][
                'lower_lr_on_main_loop_percentage'],
            irl_model=irl_model,
            **generator_args)
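        # Generator (policy) update schedule handed to train_agent; the cap on
        # updates drops to 0 when running DAPG on a pretrained IRL reward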
        irl_kwargs = dict(policy=dict(
            min_updates=1,
            max_updates=IRL_cfg['IRL']['max_gen_updates']
            if cfg['algorithm'] != 'DAPG_based_IRL' else 0,
            steps_till_max=IRL_cfg['IRL']['steps_till_max_gen_updates']))
    elif cfg['algorithm'] == 'DAPG':
        rl_agent = DAPG(env,
                        policy,
                        baseline,
                        demo_paths=demo_paths,
                        normalized_step_size=cfg['RL']['step_size'],
                        lam_0=cfg['RL']['lam_0'],
                        lam_1=cfg['RL']['lam_1'],
                        seed=seed,
                        save_logs=cfg['save_logs'],
                        augmentation=cfg['train']['augmentation'],
                        entropy_weight=cfg['train']['entropy_weight'])
    elif cfg['algorithm'] == 'PPO':
        rl_agent = PPO(env,
                       policy,
                       baseline,
                       demo_paths=demo_paths,
                       epochs=cfg['PPO']['epochs'],
                       mb_size=cfg['PPO']['batch_size'],
                       target_kl_dist=cfg['PPO']['target_kl_dist'],
                       seed=seed,
                       lam_0=cfg['RL']['lam_0'],
                       lam_1=cfg['RL']['lam_1'],
                       save_logs=cfg['save_logs'],
                       clip_coef=cfg['PPO']['clip_coef'],
                       learn_rate=cfg['PPO']['lr'],
                       augmentation=cfg['train']['augmentation'],
                       entropy_weight=cfg['train']['entropy_weight'])
    else:
        raise ValueError("Algorithm name", cfg['algorithm'], "not supported")

    env_kwargs = cfg['env_kwargs']
    # attach the pretrained IRL reward model when running DAPG on top of a prior IRL job
    if cfg['algorithm'] == 'DAPG_based_IRL':
        rl_agent.irl_model = irl_model

    # dump YAML config file
    job_path = os.path.join("Runs", job_name)
    if not os.path.isdir(job_path):
        os.makedirs(job_path)
    with open(os.path.join(job_path, 'config.yaml'), 'w') as f:
        dump(cfg, f)

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    ts = timer.time()
    train_agent(
        job_name=job_name,
        agent=rl_agent,
        seed=seed,
        niter=cfg['train']['steps'],
        gamma=cfg['train']['gamma'],
        gae_lambda=cfg['train']['gae_lambda'],
        num_cpu=cfg['num_cpu'],
        sample_mode='trajectories',
        num_traj=cfg['train']['num_traj'],
        save_freq=cfg['train']['save_freq'],
        evaluation_rollouts=cfg['eval_rollouts'],
        should_fresh_start=bool(cfg['IRL']['initialization_job'])
        if cfg['algorithm'] == 'IRL' else False,
        irl_kwargs=irl_kwargs,
        temperature_max=cfg['IRL']['temperature_max']
        if cfg['algorithm'] == 'IRL' else 0,
        temperature_min=cfg['IRL']['temperature_min']
        if cfg['algorithm'] == 'IRL' else 0,
        plot_keys=cfg['plot_keys'],
        run_no=run_no if multiple_runs else None,
        env_kwargs=env_kwargs,
        fixed_evaluation_init_states=cfg['fixed_evaluation_init_states'])
    print("time taken = %f" % (timer.time() - ts))
Example #4
File: bc_test.py  Project: Divye02/mjrl-1
# ------------------------------
# Get demonstrations
print("========================================")
print("Collecting expert demonstrations")
print("========================================")
expert_pol = pickle.load(
    open('swimmer_exp1/iterations/best_policy.pickle', 'rb'))
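# Roll out the loaded expert policy for 5 trajectories to build the demo set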
demo_paths = sample_paths(N=5, policy=expert_pol, env_name=e.env_id)

# ------------------------------
# Train BC
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=16,
              lr=1e-4)  # will use Adam by default
ts = timer.time()
print("========================================")
print("Running BC with expert demonstrations")
print("========================================")
bc_agent.train()
print("========================================")
print("BC training complete !!!")
print("time taken = %f" % (timer.time() - ts))
print("========================================")

# ------------------------------
# Evaluate Policies
bc_pol_score = e.evaluate_policy(policy, num_episodes=25, mean_action=True)
expert_score = e.evaluate_policy(expert_pol, num_episodes=25, mean_action=True)
print("Expert policy performance (eval mode) = %f" % expert_score[0][0])
print("BC policy performance (eval mode) = %f" % bc_pol_score[0][0])
Example #5
def experiment(variant):
    """
    This is a job script for running NPG/DAPG on hand tasks and other gym envs.
    Note that DAPG generalizes PG and BC init + PG finetuning.
    With appropriate settings of parameters, we can recover the full family.
    """
    import mj_envs

    job_data = default_job_data.copy()
    job_data.update(variant)

    env_params = ENV_PARAMS[variant['env_class']]
    job_data.update(env_params)

    assert 'algorithm' in job_data.keys()
    assert any([job_data['algorithm'] == a for a in ['NPG', 'BCRL', 'DAPG']])

    JOB_DIR = logger.get_snapshot_dir()

    # ===============================================================================
    # Train Loop
    # ===============================================================================

    seed = int(job_data['seedid'])

    e = GymEnv(job_data['env_id'])
    policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=seed)
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=job_data['vf_batch_size'],
                           epochs=job_data['vf_epochs'],
                           learn_rate=job_data['vf_learn_rate'])

    # Get demonstration data if necessary and behavior clone
    if job_data['algorithm'] != 'NPG':
        print("========================================")
        print("Collecting expert demonstrations")
        print("========================================")
        demo_paths = load_local_or_remote_file(job_data['demo_file'], 'rb')

        bc_agent = BC(demo_paths,
                      policy=policy,
                      epochs=job_data['bc_epochs'],
                      batch_size=job_data['bc_batch_size'],
                      lr=job_data['bc_learn_rate'],
                      loss_type='MSE',
                      set_transforms=False)
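        # Normalize inputs/outputs with statistics computed from the demos and
        # use the action scale to set the policy's initial variance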
        in_shift, in_scale, out_shift, out_scale = bc_agent.compute_transformations()
        bc_agent.set_transformations(in_shift, in_scale, out_shift, out_scale)
        bc_agent.set_variance_with_data(out_scale)

        ts = timer.time()
        print("========================================")
        print("Running BC with expert demonstrations")
        print("========================================")
        bc_agent.train()
        print("========================================")
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - ts))
        print("========================================")

        if job_data['eval_rollouts'] >= 1:
            score = e.evaluate_policy(policy,
                                      num_episodes=job_data['eval_rollouts'],
                                      mean_action=True)
            print("Score with behavior cloning = %f" % score[0][0])

    if job_data['algorithm'] != 'DAPG':
        # We throw away the demo data when training from scratch or fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================

    rl_agent = DAPG(e,
                    policy,
                    baseline,
                    demo_paths,
                    normalized_step_size=job_data['rl_step_size'],
                    lam_0=job_data['lam_0'],
                    lam_1=job_data['lam_1'],
                    seed=seed,
                    save_logs=True)

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    ts = timer.time()
    train_agent(job_name=JOB_DIR,
                agent=rl_agent,
                seed=seed,
                niter=job_data['rl_num_iter'],
                gamma=job_data['rl_gamma'],
                gae_lambda=job_data['rl_gae'],
                num_cpu=job_data['num_cpu'],
                sample_mode='trajectories',
                num_traj=job_data['rl_num_traj'],
                save_freq=job_data['save_freq'],
                evaluation_rollouts=job_data['eval_rollouts'])
    print("time taken = %f" % (timer.time() - ts))