def train_expert_policy(config):
    print('-' * 80)
    previous_dir = os.getcwd()
    ensure_dir(GEN_DATA_DIR)
    os.chdir(GEN_DATA_DIR)
    print('Training Expert')
    e = make_gym_env(config['env_id'], config)
    policy = MLP(e.spec, hidden_sizes=(32, 32), seed=config['seed'])
    baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2,
                           learn_rate=1e-3)
    agent = NPG(e, policy, baseline, normalized_step_size=0.1,
                seed=config['seed'], save_logs=True)
    job_name = '%s_expert' % config['env_name']
    # Need to change where it dumps the policy
    train_agent(job_name=job_name,
                agent=agent,
                seed=config['seed'],
                niter=30,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='trajectories',
                num_traj=200,
                save_freq=5,
                evaluation_rollouts=5)
    os.chdir(previous_dir)
    os.rename(
        os.path.join(GEN_DATA_DIR, job_name, 'iterations/best_policy.pickle'),
        os.path.join(EXPERT_POLICIES_DIR, EXPERT_POLICIES[config['env_name']]))
    print('-' * 80)
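# Hedged usage sketch (not part of the original script): the keys below are the
# ones train_expert_policy() reads above; the concrete values are hypothetical
# placeholders, and GEN_DATA_DIR / EXPERT_POLICIES_DIR / EXPERT_POLICIES are
# assumed to be defined elsewhere in this file.
example_expert_config = {
    'env_id': 'Hopper-v2',   # hypothetical gym id passed to make_gym_env
    'env_name': 'hopper',    # assumed to be a key of EXPERT_POLICIES
    'seed': 123,
}
# train_expert_policy(example_expert_config)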
)  # only do the environment here, so different files can create the same tasks

for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    job_name_lpgftw_seed = job_name_lpgftw + '/seed_{}'.format(i)
    e = {}
    baseline_mtl = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]
        baseline_mtl[task_id] = MLPBaseline(e[task_id].spec, reg_coef=1e-3,
                                            batch_size=64, epochs=2,
                                            learn_rate=1e-3, use_gpu=True)
    policy_mtl = LinearPolicyLPGFTW(e[0].spec, k=1, max_k=5, seed=SEED)
    agent_mtl = NPGFTW(e, policy_mtl, baseline_mtl, normalized_step_size=1,
                       seed=SEED, save_logs=True, new_col_mode='max_k')
    for task_id in range(num_tasks):
        ts = timer.time()
        train_agent(job_name=job_name_lpgftw_seed,
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_linear import LinearPolicy
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import time as timer

SEED = 500

e = GymEnv('Walker2d-v2')
policy = LinearPolicy(e.spec, seed=SEED)
baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2,
                       learn_rate=1e-3)
agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED,
            save_logs=True)

ts = timer.time()
train_agent(job_name='walker_nominal',
            agent=agent,
            seed=SEED,
            niter=500,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    os.chdir(cwd)
    dirpath = os.path.join(job['save_dir'], job['job_name'])
    os.makedirs(dirpath, exist_ok=True)

    # start job
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    e = GymEnv(job['env_name'])

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip
    job_data_file = open(dirpath + '/job_data.txt', 'w')
    pprint.pprint(job, stream=job_data_file)
    job_data_file.close()

    # Make policy (???vik: sizes are hard coded)
    if 'init_policy' in job:
        policy = MLP(e.spec, init_log_std=job['init_std'],
                     hidden_sizes=(32, 32), seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print('log std values in loaded policy = ')
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        # loaded_params[-policy.m:] += 1.0
        policy.set_param_values(loaded_params)
        del job['init_policy']
    else:
        policy = MLP(e.spec, init_log_std=job['init_std'],
                     hidden_sizes=(32, 32), seed=job['seed'])

    # Agent
    agent = NPG(e, policy, baseline, seed=job['seed'],
                normalized_step_size=job['normalized_step_size'],
                save_logs=job['save_logs'],
                FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=dirpath,
        agent=agent,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job['num_traj'],
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
) else job_data['lam_1']
EXP_FILE = JOB_DIR + '/job_config.json'
with open(EXP_FILE, 'w') as f:
    json.dump(job_data, f, indent=4)

# ===============================================================================
# Train Loop
# ===============================================================================

e = GymEnv(job_data['env'])
policy = MLP(e.spec, hidden_sizes=job_data['policy_size'],
             seed=job_data['seed'])
baseline = MLPBaseline(e.spec, reg_coef=1e-3,
                       batch_size=job_data['vf_batch_size'],
                       epochs=job_data['vf_epochs'],
                       learn_rate=job_data['vf_learn_rate'])

# Get demonstration data if necessary and behavior clone
if job_data['algorithm'] != 'NPG':
    print("========================================")
    print("Collecting expert demonstrations")
    print("========================================")
    demo_paths = pickle.load(open(job_data['demo_file'], 'rb'))

    bc_agent = BC(demo_paths,
                  policy=policy,
                  epochs=job_data['bc_epochs'],
                  batch_size=job_data['bc_batch_size'],
                  lr=job_data['bc_learn_rate'],
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    # Create a directory for the job results.
    job_dir = os.path.join(job['output_dir'])
    if not os.path.isdir(job_dir):
        os.mkdir(job_dir)

    # start job
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    env_name = job['env_name']
    # adept_envs.global_config.set_config(env_name, {
    #     'robot_params': job['robot'],
    #     **job.get('env_params', {}),
    # })
    e = GymEnv(env_name)

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip
    with open(os.path.join(job_dir, 'job_data.txt'), 'w') as job_data_file:
        pprint.pprint(job, stream=job_data_file)

    if 'init_policy' in job:
        policy = MLP(e.spec, init_log_std=job['init_std'],
                     hidden_sizes=(32, 32), seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print("log std values in loaded policy = ")
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        loaded_params[-policy.m:] += job['init_std']
        policy.set_param_values(loaded_params)
        del job['init_policy']
    else:
        policy = MLP(
            e.spec,
            init_log_std=job['init_std'],
            hidden_sizes=job['hidden_sizes'],  # hidden_sizes=(32, 32),
            seed=job['seed'])

    # Agent
    agent = NPG(
        e,
        policy,
        baseline,
        seed=job['seed'],
        normalized_step_size=job['normalized_step_size'],
        save_logs=job['save_logs'],
        FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=job['job_name'],
        agent=agent,
        # save_dir=job_dir,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job.get('num_traj'),
        num_samples=job.get('num_samples'),
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
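# Hedged usage sketch (illustrative, not from the original launcher): a minimal
# job dict covering the keys single_process() reads above. Every value is a
# hypothetical placeholder rather than a recommended setting; the keyword
# FIM_invert_args is taken from the NPG call above, but its inner keys here are
# assumed.
example_job = {
    'job_name': 'example_npg_run',
    'output_dir': 'results/example_npg_run',
    'env_name': 'Hopper-v2',          # hypothetical environment id
    'seed': 0,
    'init_std': 0.0,
    'hidden_sizes': (32, 32),
    'normalized_step_size': 0.1,
    'save_logs': True,
    'FIM_invert_args': {'iters': 10, 'damping': 1e-4},
    'niter': 10,
    'gamma': 0.995,
    'gae_lambda': 0.97,
    'num_cpu': 1,
    'sample_mode': 'trajectories',
    'num_traj': 10,                   # or supply 'num_samples' instead
    'evaluation_rollouts': 2,
    'save_freq': 5,
}
# single_process(example_job)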
e.set_seed(SEED)

models = [
    DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim,
                  seed=SEED + i, **job_data)
    for i in range(job_data['num_models'])
]
policy = MLP(e.spec, seed=SEED, hidden_sizes=job_data['policy_size'],
             init_log_std=job_data['init_log_std'], min_log_std=-2.5)
baseline = MLPBaseline(
    e.spec,
    reg_coef=1e-3,
    batch_size=256,
    epochs=2,
    learn_rate=1e-3,
    use_gpu=(True if job_data['device'] == 'cuda' else False))
# baseline = QuadraticBaseline(e.spec)

agent = ModelAccelNPG(
    fitted_model=models,
    env=e,
    policy=policy,
    baseline=baseline,
    seed=SEED,
    # hvp_sample_frac=job_data['hvp_frac'],
    normalized_step_size=job_data['step_size'],
    save_logs=True)
paths = []
def launch_job(tag, variant):
    print(len(variant))
    seed, env, algo, optim, curv_type, lr, batch_size, cg_iters, \
        cg_residual_tol, cg_prev_init_coef, cg_precondition_empirical, \
        cg_precondition_regu_coef, cg_precondition_exp, shrinkage_method, \
        lanczos_amortization, lanczos_iters, approx_adaptive, betas, \
        use_nn_policy, gn_vfn_opt, total_samples = variant
    beta1, beta2 = betas

    iters = int(total_samples / batch_size)

    # NN policy
    # ==================================
    e = GymEnv(env)
    if use_nn_policy:
        policy = MLP(e.spec, hidden_sizes=(64,), seed=seed)
    else:
        policy = LinearPolicy(e.spec, seed=seed)
    vfn_batch_size = 256 if gn_vfn_opt else 64
    vfn_epochs = 2 if gn_vfn_opt else 2
    # baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
    baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=vfn_batch_size,
                           epochs=2, learn_rate=1e-3,
                           use_gauss_newton=gn_vfn_opt)

    # agent = NPG(e, policy, baseline, normalized_step_size=0.005, seed=SEED, save_logs=True)
    common_kwargs = dict(lr=lr,
                         curv_type=curv_type,
                         cg_iters=cg_iters,
                         cg_residual_tol=cg_residual_tol,
                         cg_prev_init_coef=cg_prev_init_coef,
                         cg_precondition_empirical=cg_precondition_empirical,
                         cg_precondition_regu_coef=cg_precondition_regu_coef,
                         cg_precondition_exp=cg_precondition_exp,
                         shrinkage_method=shrinkage_method,
                         lanczos_amortization=lanczos_amortization,
                         lanczos_iters=lanczos_iters,
                         batch_size=batch_size)
    if optim == 'ngd':
        optimizer = fisher_optim.NGD(policy.trainable_params, **common_kwargs)
    elif optim == 'natural_adam':
        optimizer = fisher_optim.NaturalAdam(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_adagrad':
        optimizer = fisher_optim.NaturalAdagrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_amsgrad':
        optimizer = fisher_optim.NaturalAmsgrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)

    if algo == 'trpo':
        from mjrl.algos.trpo_delta import TRPO
        agent = TRPO(e, policy, baseline, optimizer, seed=seed, save_logs=True)
        # agent = TRPO(e, policy, baseline, seed=seed, save_logs=True)
    else:
        from mjrl.algos.npg_cg_delta import NPG
        agent = NPG(e, policy, baseline, optimizer, seed=seed, save_logs=True)

    save_dir = build_log_dir(tag, variant)
    try:
        os.makedirs(save_dir)
    except:
        pass

    # print("Iters:", iters, ", num_traj: ", str(batch_size//1000))
    train_agent(job_name=save_dir,
                agent=agent,
                seed=seed,
                niter=iters,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='samples',
                num_samples=batch_size,
                save_freq=5,
                evaluation_rollouts=5,
                verbose=False)  # True)
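# Hedged usage sketch (not from the original experiment script): the tuple
# mirrors the 21-way unpacking at the top of launch_job(); every value is a
# hypothetical placeholder, including the 'fisher' curvature type and the
# 'example_tag' log-directory tag.
example_variant = (
    0,               # seed
    'Walker2d-v2',   # env
    'trpo',          # algo ('trpo' -> TRPO, anything else -> NPG)
    'ngd',           # optim
    'fisher',        # curv_type (hypothetical value)
    0.01,            # lr
    5000,            # batch_size
    10,              # cg_iters
    1e-10,           # cg_residual_tol
    0.0,             # cg_prev_init_coef
    False,           # cg_precondition_empirical
    0.0,             # cg_precondition_regu_coef
    0.0,             # cg_precondition_exp
    None,            # shrinkage_method
    0,               # lanczos_amortization
    0,               # lanczos_iters
    False,           # approx_adaptive
    (0.9, 0.99),     # betas
    True,            # use_nn_policy
    False,           # gn_vfn_opt
    1000000,         # total_samples
)
# launch_job('example_tag', example_variant)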
def train(cfg, run_no, multiple_runs, seed):
    # ===============================================================================
    # Train Loop
    # ===============================================================================
    gpus_available = setup_gpus()
    env_name, job_name = parse_task(cfg)
    env = GymEnv(env_name, **cfg['env_kwargs'])
    policy = MLP(env.spec, hidden_sizes=tuple(cfg['policy_size']), seed=seed)
    baseline = MLPBaseline(env.spec,
                           reg_coef=1e-3,
                           batch_size=cfg['value_function']['batch_size'],
                           epochs=cfg['value_function']['epochs'],
                           learn_rate=cfg['value_function']['lr'],
                           use_gpu=False)

    # Get demonstration data if necessary and behavior clone
    print("========================================")
    print("Collecting expert demonstrations")
    print("========================================")
    demo_filename = cfg['demo_file']
    if cfg['demo_file'] is not None:
        demo_paths = pickle.load(open(demo_filename, 'rb'))
    else:
        demo_paths = None

    if 'demo_file' in cfg['BC'] and cfg['BC']['demo_file'] != 'default':
        bc_demo_file_path = cfg['BC']['demo_file']
        if cfg['train']['use_timestamp']:
            bc_demo_file_path = bc_demo_file_path.replace(
                'v0', 'v0_timestamp_inserted')
        bc_demo_paths = pickle.load(open(bc_demo_file_path, 'rb'))
    else:
        bc_demo_paths = demo_paths

    if 'num_demo' in cfg and cfg['num_demo']:
        demo_paths = demo_paths[:cfg['num_demo']]

    if cfg['algorithm'] == 'DAPG_based_IRL':
        if 'get_paths_for_initialisation' in cfg['based_IRL']:
            if cfg['based_IRL']['get_paths_for_initialisation']:
                bc_demo_paths = add_dumped_paths_for_BC(demo_paths, cfg)

    ts = timer.time()
    if bc_demo_paths is not None and cfg['BC']['epochs'] > 0:
        print("========================================")
        print("Running BC with expert demonstrations")
        print("========================================")
        bc_agent = BC(bc_demo_paths[:25],
                      policy=policy,
                      epochs=cfg['BC']['epochs'],
                      batch_size=cfg['BC']['batch_size'],
                      lr=cfg['BC']['lr'],
                      loss_type='MSE',
                      set_transforms=True)
        bc_agent.train()
        print("========================================")
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - ts))
        print("========================================")

    if cfg['algorithm'] == 'IRL' or cfg['algorithm'] == 'DAPG_based_IRL':
        IRL_cfg = cfg
        if cfg['algorithm'] == 'DAPG_based_IRL':
            IRL_job_cfg_path = os.path.join("Runs", cfg['based_IRL']['IRL_job'],
                                            "config.yaml")
            IRL_cfg = yamlreader.yaml_load(IRL_job_cfg_path)
        irl_model = get_irl_model(env, demo_paths, IRL_cfg, seed)
        if cfg['algorithm'] == 'DAPG_based_IRL':
            full_irl_model_checkpoint_path = os.path.join(
                'Runs', cfg['based_IRL']['IRL_job'])
            if cfg['based_IRL']['IRL_run_no'] is not None:
                full_irl_model_checkpoint_path = os.path.join(
                    full_irl_model_checkpoint_path,
                    'run_' + str(cfg['based_IRL']['IRL_run_no']))
            if cfg['based_IRL']['IRL_step'] is not None:
                irl_model.load_iteration(
                    path=full_irl_model_checkpoint_path,
                    iteration=cfg['based_IRL']['IRL_step'])
            else:
                irl_model.load_last(path=full_irl_model_checkpoint_path)
            # required to load the model completely from the given path before
            # changing to a different path during training
            irl_model.eval(demo_paths)

    if cfg['eval_rollouts'] > 0:
        score = env.evaluate_policy(policy,
                                    num_episodes=cfg['eval_rollouts'],
                                    mean_action=True)
        print("Score with behavior cloning = %f" % score[0][0])

    if not cfg['use_DAPG']:
        # We throw away the demo data when training from scratch or
        # fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================
    irl_kwargs = None
    if cfg['algorithm'] == 'IRL' or cfg['algorithm'] == 'DAPG_based_IRL':
        if (cfg['algorithm'] == 'DAPG_based_IRL'
                or cfg['IRL']['generator_alg'] == 'DAPG'):
            generator_algorithm = DAPG
            generator_args = dict(
                demo_paths=demo_paths,
                normalized_step_size=cfg['RL']['step_size'],
                seed=seed,
                lam_0=cfg['RL']['lam_0'],
                lam_1=cfg['RL']['lam_1'],
                save_logs=cfg['save_logs'],
                augmentation=cfg['train']['augmentation'],
                entropy_weight=cfg['train']['entropy_weight'])
        elif cfg['IRL']['generator_alg'] == 'PPO':
            generator_algorithm = PPO
            generator_args = dict(
                demo_paths=demo_paths,
                epochs=cfg['PPO']['epochs'],
                mb_size=cfg['PPO']['batch_size'],
                target_kl_dist=cfg['PPO']['target_kl_dist'],
                seed=seed,
                lam_0=cfg['RL']['lam_0'],
                lam_1=cfg['RL']['lam_1'],
                save_logs=cfg['save_logs'],
                clip_coef=cfg['PPO']['clip_coef'],
                learn_rate=cfg['PPO']['lr'],
                augmentation=cfg['train']['augmentation'],
                entropy_weight=cfg['train']['entropy_weight'])
        else:
            raise ValueError("Generator algorithm name",
                             cfg['IRL']['generator_alg'], "not supported")

        irl_class = irl_training_class(generator_algorithm)
        rl_agent = irl_class(
            env,
            policy,
            baseline,
            train_irl=cfg['algorithm'] != 'DAPG_based_IRL',
            discr_lr=IRL_cfg['IRL']['discr']['lr'],
            irl_batch_size=IRL_cfg['IRL']['discr']['batch_size'],
            lower_lr_on_main_loop_percentage=IRL_cfg['IRL']['discr']
            ['lower_lr_on_main_loop_percentage'],
            irl_model=irl_model,
            **generator_args)
        irl_kwargs = dict(policy=dict(
            min_updates=1,
            max_updates=IRL_cfg['IRL']['max_gen_updates']
            if cfg['algorithm'] != 'DAPG_based_IRL' else 0,
            steps_till_max=IRL_cfg['IRL']['steps_till_max_gen_updates']))
    elif cfg['algorithm'] == 'DAPG':
        rl_agent = DAPG(env,
                        policy,
                        baseline,
                        demo_paths=demo_paths,
                        normalized_step_size=cfg['RL']['step_size'],
                        lam_0=cfg['RL']['lam_0'],
                        lam_1=cfg['RL']['lam_1'],
                        seed=seed,
                        save_logs=cfg['save_logs'],
                        augmentation=cfg['train']['augmentation'],
                        entropy_weight=cfg['train']['entropy_weight'])
    elif cfg['algorithm'] == 'PPO':
        rl_agent = PPO(env,
                       policy,
                       baseline,
                       demo_paths=demo_paths,
                       epochs=cfg['PPO']['epochs'],
                       mb_size=cfg['PPO']['batch_size'],
                       target_kl_dist=cfg['PPO']['target_kl_dist'],
                       seed=seed,
                       lam_0=cfg['RL']['lam_0'],
                       lam_1=cfg['RL']['lam_1'],
                       save_logs=cfg['save_logs'],
                       clip_coef=cfg['PPO']['clip_coef'],
                       learn_rate=cfg['PPO']['lr'],
                       augmentation=cfg['train']['augmentation'],
                       entropy_weight=cfg['train']['entropy_weight'])
    else:
        raise ValueError("Algorithm name", cfg['algorithm'], "not supported")

    # get IRL model kwargs if doing DAPG based on IRL
    env_kwargs = cfg['env_kwargs']
    if cfg['algorithm'] == 'DAPG_based_IRL':
        rl_agent.irl_model = irl_model

    # dump YAML config file
    job_path = os.path.join("Runs", job_name)
    if not os.path.isdir(job_path):
        os.makedirs(job_path)
    with open(os.path.join(job_path, 'config.yaml'), 'w') as f:
        dump(cfg, f)

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    ts = timer.time()
    train_agent(
        job_name=job_name,
        agent=rl_agent,
        seed=seed,
        niter=cfg['train']['steps'],
        gamma=cfg['train']['gamma'],
        gae_lambda=cfg['train']['gae_lambda'],
        num_cpu=cfg['num_cpu'],
        sample_mode='trajectories',
        num_traj=cfg['train']['num_traj'],
        save_freq=cfg['train']['save_freq'],
        evaluation_rollouts=cfg['eval_rollouts'],
        should_fresh_start=bool(cfg['IRL']['initialization_job'])
        if cfg['algorithm'] == 'IRL' else False,
        irl_kwargs=irl_kwargs,
        temperature_max=cfg['IRL']['temperature_max']
        if cfg['algorithm'] == 'IRL' else 0,
        temperature_min=cfg['IRL']['temperature_min']
        if cfg['algorithm'] == 'IRL' else 0,
        plot_keys=cfg['plot_keys'],
        run_no=run_no if multiple_runs else None,
        env_kwargs=env_kwargs,
        fixed_evaluation_init_states=cfg['fixed_evaluation_init_states'])
    print("time taken = %f" % (timer.time() - ts))
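# Hedged usage sketch: cfg is normally a nested dict loaded from a YAML file
# (train() dumps it back to Runs/<job_name>/config.yaml above); the config path
# and argument values below are hypothetical placeholders.
# cfg = yamlreader.yaml_load('configs/example_experiment.yaml')
# train(cfg, run_no=0, multiple_runs=False, seed=123)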
def experiment(variant):
    """
    This is a job script for running NPG/DAPG on hand tasks and other gym envs.
    Note that DAPG generalizes PG and BC init + PG finetuning.
    With appropriate settings of parameters, we can recover the full family.
    """
    import mj_envs
    job_data = default_job_data.copy()
    job_data.update(variant)
    env_params = ENV_PARAMS[variant['env_class']]
    job_data.update(env_params)
    assert 'algorithm' in job_data.keys()
    assert any([job_data['algorithm'] == a for a in ['NPG', 'BCRL', 'DAPG']])
    JOB_DIR = logger.get_snapshot_dir()

    # ===============================================================================
    # Train Loop
    # ===============================================================================
    seed = int(job_data['seedid'])
    e = GymEnv(job_data['env_id'])
    policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=seed)
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=job_data['vf_batch_size'],
                           epochs=job_data['vf_epochs'],
                           learn_rate=job_data['vf_learn_rate'])

    # Get demonstration data if necessary and behavior clone
    if job_data['algorithm'] != 'NPG':
        print("========================================")
        print("Collecting expert demonstrations")
        print("========================================")
        demo_paths = load_local_or_remote_file(job_data['demo_file'], 'rb')

        bc_agent = BC(demo_paths,
                      policy=policy,
                      epochs=job_data['bc_epochs'],
                      batch_size=job_data['bc_batch_size'],
                      lr=job_data['bc_learn_rate'],
                      loss_type='MSE',
                      set_transforms=False)
        in_shift, in_scale, out_shift, out_scale = \
            bc_agent.compute_transformations()
        bc_agent.set_transformations(in_shift, in_scale, out_shift, out_scale)
        bc_agent.set_variance_with_data(out_scale)

        ts = timer.time()
        print("========================================")
        print("Running BC with expert demonstrations")
        print("========================================")
        bc_agent.train()
        print("========================================")
        print("BC training complete !!!")
        print("time taken = %f" % (timer.time() - ts))
        print("========================================")

        if job_data['eval_rollouts'] >= 1:
            score = e.evaluate_policy(policy,
                                      num_episodes=job_data['eval_rollouts'],
                                      mean_action=True)
            print("Score with behavior cloning = %f" % score[0][0])

    if job_data['algorithm'] != 'DAPG':
        # We throw away the demo data when training from scratch or
        # fine-tuning with RL without explicit augmentation
        demo_paths = None

    # ===============================================================================
    # RL Loop
    # ===============================================================================
    rl_agent = DAPG(e, policy, baseline, demo_paths,
                    normalized_step_size=job_data['rl_step_size'],
                    lam_0=job_data['lam_0'],
                    lam_1=job_data['lam_1'],
                    seed=seed,
                    save_logs=True)

    print("========================================")
    print("Starting reinforcement learning phase")
    print("========================================")

    ts = timer.time()
    train_agent(job_name=JOB_DIR,
                agent=rl_agent,
                seed=seed,
                niter=job_data['rl_num_iter'],
                gamma=job_data['rl_gamma'],
                gae_lambda=job_data['rl_gae'],
                num_cpu=job_data['num_cpu'],
                sample_mode='trajectories',
                num_traj=job_data['rl_num_traj'],
                save_freq=job_data['save_freq'],
                evaluation_rollouts=job_data['eval_rollouts'])
    print("time taken = %f" % (timer.time() - ts))
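# Hedged usage sketch (not part of the original job script): experiment()
# merges the variant into default_job_data and ENV_PARAMS, so only the
# overrides need to appear here; the 'relocate' env class and the other values
# are hypothetical placeholders.
example_variant = {
    'env_class': 'relocate',   # assumed to be a key of ENV_PARAMS
    'algorithm': 'NPG',        # one of 'NPG', 'BCRL', 'DAPG'
    'seedid': 100,
}
# experiment(example_variant)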