# Train BC
# demo_paths: list of demonstration trajectories loaded earlier
e = GymEnv('relocate-v0')
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
bc_agent = BC(demo_paths, policy=policy, epochs=5, batch_size=32, lr=1e-3)

ts = timer.time()
print("========================================")
print("Running BC with expert demonstrations")
print("========================================")
bc_agent.train()
print("========================================")
print("BC training complete !!!")
print("time taken = %f" % (timer.time() - ts))
print("========================================")

score = e.evaluate_policy(policy, num_episodes=10, mean_action=True)
print("Score with behavior cloning = %f" % score[0][0])

# ------------------------------
# Finetune with DAPG
print("========================================")
print("Finetuning with DAPG")
baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2,
                       learn_rate=1e-3)
agent = DAPG(e, policy, baseline, demo_paths=demo_paths,
             # the original snippet is truncated here; trailing kwargs assumed
             seed=SEED, save_logs=True)
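# The fine-tuning step itself is not shown above. A minimal sketch, assuming
# the train_agent() helper used by the other job scripts in this section;
# the job name and all hyperparameter values below are illustrative, not
# taken from the repo.
print("========================================")
print("Starting reinforcement learning phase")
print("========================================")
train_agent(job_name='relocate_dapg_finetune',  # hypothetical job name
            agent=agent,
            seed=SEED,
            niter=100,                  # assumed iteration budget
            gamma=0.995,                # assumed discount factor
            gae_lambda=0.97,            # assumed GAE lambda
            num_cpu=4,
            sample_mode='trajectories',
            num_traj=40,                # assumed trajectories per iteration
            save_freq=10,
            evaluation_rollouts=10)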
def train(cfg, run_no, multiple_runs, seed):
    # ===============================================================================
    # Train Loop
    # ===============================================================================
    gpus_available = setup_gpus()
    env_name, job_name = parse_task(cfg)
    env = GymEnv(env_name, **cfg['env_kwargs'])
    policy = MLP(env.spec, hidden_sizes=tuple(cfg['policy_size']), seed=seed)
    baseline = MLPBaseline(env.spec,
                           reg_coef=1e-3,
                           batch_size=cfg['value_function']['batch_size'],
                           epochs=cfg['value_function']['epochs'],
                           learn_rate=cfg['value_function']['lr'],
                           use_gpu=False)

    # Get demonstration data if necessary and behavior clone
    print("========================================")
    print("Collecting expert demonstrations")
    print("========================================")
    demo_filename = cfg['demo_file']
    if demo_filename is not None:
        demo_paths = pickle.load(open(demo_filename, 'rb'))
    else:
        demo_paths = None

    if 'demo_file' in cfg['BC'] and cfg['BC']['demo_file'] != 'default':
        bc_demo_file_path = cfg['BC']['demo_file']
        if cfg['train']['use_timestamp']:
            bc_demo_file_path = bc_demo_file_path.replace(
                'v0', 'v0_timestamp_inserted')
        bc_demo_paths = pickle.load(open(bc_demo_file_path, 'rb'))
    else:
        bc_demo_paths = demo_paths

    if 'num_demo' in cfg and cfg['num_demo']:
        demo_paths = demo_paths[:cfg['num_demo']]

    if cfg['algorithm'] == 'DAPG_based_IRL':
        if cfg['based_IRL'].get('get_paths_for_initialisation'):
            bc_demo_paths = add_dumped_paths_for_BC(demo_paths, cfg)

    ts = timer.time()
    if bc_demo_paths is not None and cfg['BC']['epochs'] > 0:
        print("========================================")
        print("Running BC with expert demonstrations")
        print("========================================")
        bc_agent = BC(bc_demo_paths[:25],  # BC uses at most the first 25 demo paths
                      policy=policy,
                      epochs=cfg['BC']['epochs'],
                      batch_size=cfg['BC']['batch_size'],
                      lr=cfg['BC']['lr'],
                      loss_type='MSE',
                      set_transforms=True)
        bc_agent.train()
        print("========================================")
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - ts))
        print("========================================")

    if cfg['algorithm'] in ('IRL', 'DAPG_based_IRL'):
        IRL_cfg = cfg
        if cfg['algorithm'] == 'DAPG_based_IRL':
            IRL_job_cfg_path = os.path.join("Runs", cfg['based_IRL']['IRL_job'],
                                            "config.yaml")
            IRL_cfg = yamlreader.yaml_load(IRL_job_cfg_path)
        irl_model = get_irl_model(env, demo_paths, IRL_cfg, seed)
        if cfg['algorithm'] == 'DAPG_based_IRL':
            full_irl_model_checkpoint_path = os.path.join(
                'Runs', cfg['based_IRL']['IRL_job'])
            if cfg['based_IRL']['IRL_run_no'] is not None:
                full_irl_model_checkpoint_path = os.path.join(
                    full_irl_model_checkpoint_path,
                    'run_' + str(cfg['based_IRL']['IRL_run_no']))
            if cfg['based_IRL']['IRL_step'] is not None:
                irl_model.load_iteration(path=full_irl_model_checkpoint_path,
                                         iteration=cfg['based_IRL']['IRL_step'])
            else:
                irl_model.load_last(path=full_irl_model_checkpoint_path)
            # required to fully load the model from the given path before
            # changing to a different path during training
            irl_model.eval(demo_paths)

    if cfg['eval_rollouts'] > 0:
        score = env.evaluate_policy(policy,
                                    num_episodes=cfg['eval_rollouts'],
                                    mean_action=True)
        print("Score with behavior cloning = %f" % score[0][0])

    if not cfg['use_DAPG']:
        # We throw away the demo data when training from scratch or
        # fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================
    irl_kwargs = None
    if cfg['algorithm'] in ('IRL', 'DAPG_based_IRL'):
        if cfg['algorithm'] == 'DAPG_based_IRL' or cfg['IRL']['generator_alg'] == 'DAPG':
            generator_algorithm = DAPG
            generator_args = dict(demo_paths=demo_paths,
                                  normalized_step_size=cfg['RL']['step_size'],
                                  seed=seed,
                                  lam_0=cfg['RL']['lam_0'],
                                  lam_1=cfg['RL']['lam_1'],
                                  save_logs=cfg['save_logs'],
                                  augmentation=cfg['train']['augmentation'],
                                  entropy_weight=cfg['train']['entropy_weight'])
        elif cfg['IRL']['generator_alg'] == 'PPO':
            generator_algorithm = PPO
            generator_args = dict(demo_paths=demo_paths,
                                  epochs=cfg['PPO']['epochs'],
                                  mb_size=cfg['PPO']['batch_size'],
                                  target_kl_dist=cfg['PPO']['target_kl_dist'],
                                  seed=seed,
                                  lam_0=cfg['RL']['lam_0'],
                                  lam_1=cfg['RL']['lam_1'],
                                  save_logs=cfg['save_logs'],
                                  clip_coef=cfg['PPO']['clip_coef'],
                                  learn_rate=cfg['PPO']['lr'],
                                  augmentation=cfg['train']['augmentation'],
                                  entropy_weight=cfg['train']['entropy_weight'])
        else:
            raise ValueError("Generator algorithm name %s not supported" %
                             cfg['IRL']['generator_alg'])
        irl_class = irl_training_class(generator_algorithm)
        rl_agent = irl_class(
            env, policy, baseline,
            train_irl=cfg['algorithm'] != 'DAPG_based_IRL',
            discr_lr=IRL_cfg['IRL']['discr']['lr'],
            irl_batch_size=IRL_cfg['IRL']['discr']['batch_size'],
            lower_lr_on_main_loop_percentage=IRL_cfg['IRL']['discr']
            ['lower_lr_on_main_loop_percentage'],
            irl_model=irl_model,
            **generator_args)
        irl_kwargs = dict(policy=dict(
            min_updates=1,
            max_updates=IRL_cfg['IRL']['max_gen_updates']
            if cfg['algorithm'] != 'DAPG_based_IRL' else 0,
            steps_till_max=IRL_cfg['IRL']['steps_till_max_gen_updates']))
    elif cfg['algorithm'] == 'DAPG':
        rl_agent = DAPG(env, policy, baseline,
                        demo_paths=demo_paths,
                        normalized_step_size=cfg['RL']['step_size'],
                        lam_0=cfg['RL']['lam_0'],
                        lam_1=cfg['RL']['lam_1'],
                        seed=seed,
                        save_logs=cfg['save_logs'],
                        augmentation=cfg['train']['augmentation'],
                        entropy_weight=cfg['train']['entropy_weight'])
    elif cfg['algorithm'] == 'PPO':
        rl_agent = PPO(env, policy, baseline,
                       demo_paths=demo_paths,
                       epochs=cfg['PPO']['epochs'],
                       mb_size=cfg['PPO']['batch_size'],
                       target_kl_dist=cfg['PPO']['target_kl_dist'],
                       seed=seed,
                       lam_0=cfg['RL']['lam_0'],
                       lam_1=cfg['RL']['lam_1'],
                       save_logs=cfg['save_logs'],
                       clip_coef=cfg['PPO']['clip_coef'],
                       learn_rate=cfg['PPO']['lr'],
                       augmentation=cfg['train']['augmentation'],
                       entropy_weight=cfg['train']['entropy_weight'])
    else:
        raise ValueError("Algorithm name %s not supported" % cfg['algorithm'])

    # attach the IRL model when doing DAPG based on IRL
    env_kwargs = cfg['env_kwargs']
    if cfg['algorithm'] == 'DAPG_based_IRL':
        rl_agent.irl_model = irl_model

    # dump YAML config file
    job_path = os.path.join("Runs", job_name)
    if not os.path.isdir(job_path):
        os.makedirs(job_path)
    with open(os.path.join(job_path, 'config.yaml'), 'w') as f:
        dump(cfg, f)

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    ts = timer.time()
    train_agent(job_name=job_name,
                agent=rl_agent,
                seed=seed,
                niter=cfg['train']['steps'],
                gamma=cfg['train']['gamma'],
                gae_lambda=cfg['train']['gae_lambda'],
                num_cpu=cfg['num_cpu'],
                sample_mode='trajectories',
                num_traj=cfg['train']['num_traj'],
                save_freq=cfg['train']['save_freq'],
                evaluation_rollouts=cfg['eval_rollouts'],
                should_fresh_start=bool(cfg['IRL']['initialization_job'])
                if cfg['algorithm'] == 'IRL' else False,
                irl_kwargs=irl_kwargs,
                temperature_max=cfg['IRL']['temperature_max']
                if cfg['algorithm'] == 'IRL' else 0,
                temperature_min=cfg['IRL']['temperature_min']
                if cfg['algorithm'] == 'IRL' else 0,
                plot_keys=cfg['plot_keys'],
                run_no=run_no if multiple_runs else None,
                env_kwargs=env_kwargs,
                fixed_evaluation_init_states=cfg['fixed_evaluation_init_states'])
    print("time taken = %f" % (timer.time() - ts))
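# A minimal sketch of the configuration train() expects, written as a Python
# dict for illustration (in the repo it is loaded from a YAML file via
# yamlreader). Only the keys a plain DAPG run reads are listed; all values
# are assumptions, not repo defaults.
example_cfg = {
    'algorithm': 'DAPG',          # 'DAPG', 'PPO', 'IRL', or 'DAPG_based_IRL'
    'use_DAPG': True,             # keep demo_paths for the RL loop
    'env_kwargs': {},
    'policy_size': [32, 32],
    'value_function': {'batch_size': 64, 'epochs': 2, 'lr': 1e-3},
    'demo_file': 'demos/relocate-v0_demos.pickle',   # hypothetical path
    'BC': {'epochs': 5, 'batch_size': 32, 'lr': 1e-3},
    'RL': {'step_size': 0.1, 'lam_0': 1e-2, 'lam_1': 0.95},
    'train': {'steps': 100, 'gamma': 0.995, 'gae_lambda': 0.97,
              'num_traj': 40, 'save_freq': 10, 'augmentation': 0,
              'entropy_weight': 0.0, 'use_timestamp': False},
    'eval_rollouts': 10,
    'save_logs': True,
    'num_cpu': 4,
    'plot_keys': [],
    'fixed_evaluation_init_states': False,
}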
# ------------------------------
# Get demonstrations
print("========================================")
print("Collecting expert demonstrations")
print("========================================")
expert_pol = pickle.load(
    open('swimmer_exp1/iterations/best_policy.pickle', 'rb'))
demo_paths = sample_paths(N=5, policy=expert_pol, env_name=e.env_id)

# ------------------------------
# Train BC
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=16,
              lr=1e-4)  # will use Adam by default

ts = timer.time()
print("========================================")
print("Running BC with expert demonstrations")
print("========================================")
bc_agent.train()
print("========================================")
print("BC training complete !!!")
print("time taken = %f" % (timer.time() - ts))
print("========================================")

# ------------------------------
# Evaluate Policies
bc_pol_score = e.evaluate_policy(policy, num_episodes=25, mean_action=True)
expert_score = e.evaluate_policy(expert_pol, num_episodes=25, mean_action=True)
print("Expert policy performance (eval mode) = %f" % expert_score[0][0])
print("BC policy performance (eval mode) = %f" % bc_pol_score[0][0])
def experiment(variant):
    """
    This is a job script for running NPG/DAPG on hand tasks and other gym envs.
    Note that DAPG generalizes PG and BC init + PG finetuning.
    With appropriate settings of parameters, we can recover the full family.
    """
    import mj_envs
    job_data = default_job_data.copy()
    job_data.update(variant)
    env_params = ENV_PARAMS[variant['env_class']]
    job_data.update(env_params)
    assert 'algorithm' in job_data.keys()
    assert job_data['algorithm'] in ['NPG', 'BCRL', 'DAPG']
    JOB_DIR = logger.get_snapshot_dir()

    # ===============================================================================
    # Train Loop
    # ===============================================================================
    seed = int(job_data['seedid'])
    e = GymEnv(job_data['env_id'])
    policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=seed)
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=job_data['vf_batch_size'],
                           epochs=job_data['vf_epochs'],
                           learn_rate=job_data['vf_learn_rate'])

    # Get demonstration data if necessary and behavior clone
    if job_data['algorithm'] != 'NPG':
        print("========================================")
        print("Collecting expert demonstrations")
        print("========================================")
        demo_paths = load_local_or_remote_file(job_data['demo_file'], 'rb')

        bc_agent = BC(demo_paths,
                      policy=policy,
                      epochs=job_data['bc_epochs'],
                      batch_size=job_data['bc_batch_size'],
                      lr=job_data['bc_learn_rate'],
                      loss_type='MSE',
                      set_transforms=False)
        in_shift, in_scale, out_shift, out_scale = \
            bc_agent.compute_transformations()
        bc_agent.set_transformations(in_shift, in_scale, out_shift, out_scale)
        bc_agent.set_variance_with_data(out_scale)

        ts = timer.time()
        print("========================================")
        print("Running BC with expert demonstrations")
        print("========================================")
        bc_agent.train()
        print("========================================")
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - ts))
        print("========================================")

        if job_data['eval_rollouts'] >= 1:
            score = e.evaluate_policy(policy,
                                      num_episodes=job_data['eval_rollouts'],
                                      mean_action=True)
            print("Score with behavior cloning = %f" % score[0][0])

    if job_data['algorithm'] != 'DAPG':
        # We throw away the demo data when training from scratch or
        # fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================
    rl_agent = DAPG(e, policy, baseline, demo_paths,
                    normalized_step_size=job_data['rl_step_size'],
                    lam_0=job_data['lam_0'],
                    lam_1=job_data['lam_1'],
                    seed=seed,
                    save_logs=True)

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    ts = timer.time()
    train_agent(job_name=JOB_DIR,
                agent=rl_agent,
                seed=seed,
                niter=job_data['rl_num_iter'],
                gamma=job_data['rl_gamma'],
                gae_lambda=job_data['rl_gae'],
                num_cpu=job_data['num_cpu'],
                sample_mode='trajectories',
                num_traj=job_data['rl_num_traj'],
                save_freq=job_data['save_freq'],
                evaluation_rollouts=job_data['eval_rollouts'])
    print("time taken = %f" % (timer.time() - ts))
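# A minimal sketch of a variant dict for experiment(). default_job_data and
# ENV_PARAMS[variant['env_class']] supply the remaining keys (env_id,
# demo_file, policy and optimizer hyperparameters), so only overrides need
# to be passed. All values below are illustrative assumptions.
example_variant = {
    'env_class': 'relocate',   # hypothetical ENV_PARAMS key
    'algorithm': 'DAPG',       # must be one of 'NPG', 'BCRL', 'DAPG'
    'seedid': 123,
    'bc_epochs': 5,
    'rl_num_iter': 100,
    'eval_rollouts': 10,
}
experiment(example_variant)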