def train_expert_policy(config):
    print('-' * 80)
    previous_dir = os.getcwd()
    ensure_dir(GEN_DATA_DIR)
    os.chdir(GEN_DATA_DIR)
    print('Training Expert')
    e = make_gym_env(config['env_id'], config)
    policy = MLP(e.spec, hidden_sizes=(32, 32), seed=config['seed'])
    baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2,
                           learn_rate=1e-3)
    agent = NPG(e, policy, baseline, normalized_step_size=0.1,
                seed=config['seed'], save_logs=True)
    job_name = '%s_expert' % config['env_name']
    # Need to change where it dumps the policy
    train_agent(job_name=job_name,
                agent=agent,
                seed=config['seed'],
                niter=30,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='trajectories',
                num_traj=200,
                save_freq=5,
                evaluation_rollouts=5)
    os.chdir(previous_dir)
    os.rename(
        os.path.join(GEN_DATA_DIR, job_name, 'iterations/best_policy.pickle'),
        os.path.join(EXPERT_POLICIES_DIR, EXPERT_POLICIES[config['env_name']]))
    print('-' * 80)
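# Example (illustrative only): a minimal config dict carrying the keys that
# train_expert_policy above actually reads. The values are assumptions for the
# sketch, not the project's defaults; 'env_name' must be a key of EXPERT_POLICIES.
example_config = {
    'env_id': 'Walker2d-v2',   # passed to make_gym_env (assumed value)
    'env_name': 'walker',      # used for the job name and EXPERT_POLICIES lookup (assumed)
    'seed': 500,               # seeds the policy, the agent, and train_agent
}
# train_expert_policy(example_config)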
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_linear import LinearPolicy
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import time as timer

SEED = 500

e = GymEnv('Walker2d-v2')
policy = LinearPolicy(e.spec, seed=SEED)
baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2,
                       learn_rate=1e-3)
agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED,
            save_logs=True)

ts = timer.time()
train_agent(job_name='walker_nominal',
            agent=agent,
            seed=SEED,
            niter=500,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
            sample_mode='trajectories',
            num_traj=50,
            save_freq=5,
            evaluation_rollouts=5)
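# Illustrative follow-up (a sketch, not from the source): load the saved policy
# and roll it out greedily once. The pickle path assumes train_agent's
# job_name/iterations/best_policy.pickle layout (as referenced elsewhere in this
# file) relative to the launch directory, and the 'evaluation' key of
# get_action's info dict is the mean (noise-free) action in mjrl.
import pickle

loaded_policy = pickle.load(
    open('walker_nominal/iterations/best_policy.pickle', 'rb'))
o = e.reset()
total_reward, done, t = 0.0, False, 0
while not done and t < e.horizon:
    a = loaded_policy.get_action(o)[1]['evaluation']  # mean action, no exploration noise
    o, r, done, _ = e.step(a)
    total_reward += r
    t += 1
print('greedy return:', total_reward)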
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    os.chdir(cwd)
    dirpath = os.path.join(job['save_dir'], job['job_name'])
    os.makedirs(dirpath, exist_ok=True)

    # start job
    os.chdir(cwd)
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    e = GymEnv(job['env_name'])

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip
    job_data_file = open(dirpath + '/job_data.txt', 'w')
    pprint.pprint(job, stream=job_data_file)
    job_data_file.close()

    # Make policy (???vik: sizes are hard coded)
    if 'init_policy' in job:
        policy = MLP(e.spec, init_log_std=job['init_std'],
                     hidden_sizes=(32, 32), seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print('log std values in loaded policy = ')
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        # loaded_params[-policy.m:] += 1.0
        policy.set_param_values(loaded_params)
        del job['init_policy']
    else:
        policy = MLP(e.spec, init_log_std=job['init_std'],
                     hidden_sizes=(32, 32), seed=job['seed'])

    # Agent
    agent = NPG(e, policy, baseline, seed=job['seed'],
                normalized_step_size=job['normalized_step_size'],
                save_logs=job['save_logs'],
                FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=dirpath,
        agent=agent,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job['num_traj'],
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
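# Example (illustrative only): a job dict with the keys single_process reads
# above. The environment name, directories, and every hyperparameter value are
# assumptions chosen to make the sketch concrete.
example_job = {
    'job_name': 'walker_npg',
    'save_dir': 'results',
    'env_name': 'Walker2d-v2',
    'seed': 123,
    'init_std': -0.5,                    # initial log std of the Gaussian policy
    'normalized_step_size': 0.1,
    'save_logs': True,
    'FIM_invert_args': {'iters': 10, 'damping': 1e-4},  # assumed values
    'niter': 100,
    'gamma': 0.995,
    'gae_lambda': 0.97,
    'num_cpu': 4,
    'sample_mode': 'trajectories',
    'num_traj': 50,
    'evaluation_rollouts': 5,
    'save_freq': 5,
    # optionally: 'init_policy': 'path/to/policy.pickle',
}
# single_process(example_job)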
forward_transfer_results = {}
for task_id in range(num_tasks):
    f = open(job_name_stl_seed + '/iterations/task_{}/'.format(task_id)
             + 'policy_0.pickle', 'rb')
    policy_stl = pickle.load(f)
    f.close()
    f = open(job_name_stl_seed + '/iterations/task_{}/'.format(task_id)
             + 'baseline_0.pickle', 'rb')
    baseline_stl = pickle.load(f)
    f.close()
    agent_stl = NPG(e[task_id], policy_stl, baseline_stl,
                    normalized_step_size=0.01, seed=SEED, save_logs=False)
    agent_stl.set_task(task_id)
    eval_paths = trajectory_sampler.sample_paths_parallel(
        N=10,
        policy=policy_stl,
        num_cpu=num_cpu,
        env_name=e[task_id].env_id,
        mode='evaluation',
        pegasus_seed=SEED)
    forward_transfer_results[task_id] = np.mean(
        [np.sum(path['rewards']) for path in eval_paths])
result_file = open(job_name_stl_seed + '/start_results.txt', 'w')
grads_stl = pickle.load(f)
f.close()
f = open(job_name_stl_seed + '/trained_stl_hess.pickle', 'rb')
hess_stl = pickle.load(f)
f.close()
f = open(job_name_stl_seed + '/task_order.pickle', 'rb')
task_order = pickle.load(f)
f.close()

e = {}
agent_stl = {}
for task_id in range(num_tasks):
    e[task_id] = e_unshuffled[task_order[task_id]]
    agent_stl[task_id] = NPG(e[task_id], policy_stl[task_id],
                             baseline_stl[task_id],
                             normalized_step_size=0.01, seed=SEED,
                             save_logs=True)
    agent_stl[task_id].grad = grads_stl[task_id]
    agent_stl[task_id].hess = hess_stl[task_id]

k = 5
n = policy_stl[0].n
m = policy_stl[0].m
d = (n + 1) * m
A = np.zeros((d * k, d * k))
b = np.zeros((d * k, 1))
S = np.zeros((k, num_tasks))
L = np.zeros((d, k))
Theta = np.zeros((d, num_tasks))
policy_mtl = LinearPolicyLPGFTW(e[0].spec, k=k, max_k=k, seed=SEED)
for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    job_name_stl_seed = job_name_stl + '/seed_{}'.format(i)

    e = {}
    baseline_stl = {}
    policy_stl = {}
    agent_stl = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]
        baseline_stl[task_id] = MLPBaseline(e[task_id].spec, reg_coef=1e-3,
                                            batch_size=64, epochs=2,
                                            learn_rate=1e-3, use_gpu=True)
        policy_stl[task_id] = LinearPolicy(e[task_id].spec, seed=SEED)
        agent_stl[task_id] = NPG(e[task_id], policy_stl[task_id],
                                 baseline_stl[task_id],
                                 normalized_step_size=0.1, seed=SEED,
                                 save_logs=True)

    loggers_stl = {}
    grads_stl = {}
    hess_stl = {}
    for task_id in range(num_tasks):
        ts = timer.time()
        train_agent(job_name=job_name_stl_seed,
                    agent=agent_stl[task_id],
                    seed=SEED,
                    niter=200,
                    gamma=0.995,
                    gae_lambda=0.97,
                    num_cpu=num_cpu,
                    sample_mode='trajectories',
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    # Create a directory for the job results.
    job_dir = os.path.join(job['output_dir'])
    if not os.path.isdir(job_dir):
        os.mkdir(job_dir)

    # start job
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    env_name = job['env_name']
    # adept_envs.global_config.set_config(env_name, {
    #     'robot_params': job['robot'],
    #     **job.get('env_params', {}),
    # })
    e = GymEnv(env_name)

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip
    with open(os.path.join(job_dir, 'job_data.txt'), 'w') as job_data_file:
        pprint.pprint(job, stream=job_data_file)

    # Make policy
    if 'init_policy' in job:
        policy = MLP(e.spec, init_log_std=job['init_std'],
                     hidden_sizes=(32, 32), seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print("log std values in loaded policy = ")
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        loaded_params[-policy.m:] += job['init_std']
        policy.set_param_values(loaded_params)
        del job['init_policy']
    else:
        policy = MLP(
            e.spec,
            init_log_std=job['init_std'],
            hidden_sizes=job['hidden_sizes'],  # hidden_sizes=(32, 32),
            seed=job['seed'])

    # Agent
    agent = NPG(
        e, policy, baseline,
        seed=job['seed'],
        normalized_step_size=job['normalized_step_size'],
        save_logs=job['save_logs'],
        FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=job['job_name'],
        agent=agent,
        # save_dir=job_dir,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job.get('num_traj'),
        num_samples=job.get('num_samples'),
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
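# Illustrative dispatch sketch (not from the source): running several job dicts
# through single_process, one worker process per job. The `jobs` list is an
# assumption; single_process un-daemonizes itself so that train_agent can spawn
# its own sampling workers inside each job process.
import multiprocessing as mp

def run_jobs(jobs):
    procs = [mp.Process(target=single_process, args=(job,)) for job in jobs]
    for p in procs:
        p.start()
    for p in procs:
        p.join()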
             init_log_std=job_data['init_log_std'])
baseline = MLPBaseline(e.spec, reg_coef=1e-3,
                       batch_size=job_data['vf_batch_size'],
                       hidden_sizes=job_data['vf_hidden_size'],
                       epochs=job_data['vf_epochs'],
                       learn_rate=job_data['vf_learn_rate'])

# Construct the algorithm
if job_data['algorithm'] == 'NPG':
    # Other hyperparameters (like number of CG steps) can be specified in the
    # config for pass-through, or default hyperparameters will be used
    agent = NPG(e, policy, baseline,
                normalized_step_size=job_data['rl_step_size'],
                seed=job_data['seed'], save_logs=True,
                **job_data['alg_hyper_params'])
elif job_data['algorithm'] == 'VPG':
    agent = BatchREINFORCE(e, policy, baseline,
                           learn_rate=job_data['rl_step_size'],
                           seed=job_data['seed'], save_logs=True,
                           **job_data['alg_hyper_params'])
elif job_data['algorithm'] == 'NVPG':
    agent = BatchREINFORCE(e,
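# Example (illustrative only): a job_data dict with the keys the fragment above
# reads. Values are assumptions for the sketch, not recommended settings.
example_job_data = {
    'algorithm': 'NPG',          # or 'VPG' / 'NVPG'
    'seed': 123,
    'rl_step_size': 0.05,
    'init_log_std': -0.5,
    'vf_batch_size': 64,
    'vf_hidden_size': (128, 128),
    'vf_epochs': 2,
    'vf_learn_rate': 1e-3,
    'alg_hyper_params': {},      # passed through to the agent constructor
}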
def launch_job(tag, variant):
    print(len(variant))
    seed, env, algo, optim, curv_type, lr, batch_size, cg_iters, \
        cg_residual_tol, cg_prev_init_coef, cg_precondition_empirical, \
        cg_precondition_regu_coef, cg_precondition_exp, shrinkage_method, \
        lanczos_amortization, lanczos_iters, approx_adaptive, betas, \
        use_nn_policy, gn_vfn_opt, total_samples = variant
    beta1, beta2 = betas

    iters = int(total_samples / batch_size)

    # NN policy
    # ==================================
    e = GymEnv(env)
    if use_nn_policy:
        policy = MLP(e.spec, hidden_sizes=(64,), seed=seed)
    else:
        policy = LinearPolicy(e.spec, seed=seed)
    vfn_batch_size = 256 if gn_vfn_opt else 64
    vfn_epochs = 2 if gn_vfn_opt else 2
    # baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2,
    #                        learn_rate=1e-3)
    baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=vfn_batch_size,
                           epochs=2, learn_rate=1e-3,
                           use_gauss_newton=gn_vfn_opt)
    # agent = NPG(e, policy, baseline, normalized_step_size=0.005, seed=SEED,
    #             save_logs=True)

    common_kwargs = dict(lr=lr,
                         curv_type=curv_type,
                         cg_iters=cg_iters,
                         cg_residual_tol=cg_residual_tol,
                         cg_prev_init_coef=cg_prev_init_coef,
                         cg_precondition_empirical=cg_precondition_empirical,
                         cg_precondition_regu_coef=cg_precondition_regu_coef,
                         cg_precondition_exp=cg_precondition_exp,
                         shrinkage_method=shrinkage_method,
                         lanczos_amortization=lanczos_amortization,
                         lanczos_iters=lanczos_iters,
                         batch_size=batch_size)
    if optim == 'ngd':
        optimizer = fisher_optim.NGD(policy.trainable_params, **common_kwargs)
    elif optim == 'natural_adam':
        optimizer = fisher_optim.NaturalAdam(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_adagrad':
        optimizer = fisher_optim.NaturalAdagrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_amsgrad':
        optimizer = fisher_optim.NaturalAmsgrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)

    if algo == 'trpo':
        from mjrl.algos.trpo_delta import TRPO
        agent = TRPO(e, policy, baseline, optimizer, seed=seed, save_logs=True)
        # agent = TRPO(e, policy, baseline, seed=seed, save_logs=True)
    else:
        from mjrl.algos.npg_cg_delta import NPG
        agent = NPG(e, policy, baseline, optimizer, seed=seed, save_logs=True)

    save_dir = build_log_dir(tag, variant)
    try:
        os.makedirs(save_dir)
    except:
        pass

    # print("Iters:", iters, ", num_traj:", str(batch_size // 1000))
    train_agent(job_name=save_dir,
                agent=agent,
                seed=seed,
                niter=iters,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='samples',
                num_samples=batch_size,
                save_freq=5,
                evaluation_rollouts=5,
                verbose=False)  # True
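# Example (illustrative only): a variant tuple in the exact order launch_job
# unpacks it above. Every value below is an assumption chosen to make the
# sketch concrete, not a recommended or default setting.
example_variant = (
    0,                 # seed
    'Walker2d-v2',     # env
    'trpo',            # algo ('trpo', anything else falls through to NPG)
    'natural_adam',    # optim
    'fisher',          # curv_type (assumed)
    0.01,              # lr
    5000,              # batch_size
    10,                # cg_iters
    1e-10,             # cg_residual_tol
    0.0,               # cg_prev_init_coef
    False,             # cg_precondition_empirical
    0.0,               # cg_precondition_regu_coef
    0.0,               # cg_precondition_exp
    None,              # shrinkage_method
    0,                 # lanczos_amortization
    0,                 # lanczos_iters
    False,             # approx_adaptive
    (0.9, 0.99),       # betas -> (beta1, beta2)
    True,              # use_nn_policy
    False,             # gn_vfn_opt
    1000000,           # total_samples
)
# launch_job('demo', example_variant)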