from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_linear import LinearPolicy
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import time as timer

SEED = 500

# Nominal Walker2d run: linear Gaussian policy, MLP value baseline, NPG updates.
e = GymEnv('Walker2d-v2')
policy = LinearPolicy(e.spec, seed=SEED)
baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True)

ts = timer.time()
train_agent(job_name='walker_nominal',
            agent=agent,
            seed=SEED,
            niter=500,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
            # sampling kwargs below are assumed typical values; the original call was truncated here
            sample_mode='trajectories',
            num_traj=100,
            save_freq=5,
            evaluation_rollouts=5)
print("time taken = %f" % (timer.time() - ts))
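# Minimal setup sketch for the lifelong-learning fragments below. Every value here is a
# hypothetical placeholder: the per-seed loops assume a fixed list of task environments
# (e_unshuffled) and a few experiment constants defined before the loops.
import numpy as np
import torch

num_seeds = 5                            # hypothetical: number of random seeds
num_cpu = 4                              # hypothetical: sampler processes
job_name_stl = 'results/stl_experiment'  # hypothetical: output directory for the STL runs
# Hypothetical stand-in task set; in practice these would be per-task variants of one
# domain that share observation/action spaces, so a single policy spec fits all tasks.
task_env_ids = ['Walker2d-v2'] * 3
e_unshuffled = [GymEnv(env_id) for env_id in task_env_ids]
num_tasks = len(e_unshuffled)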
# Single-task learning (STL) baseline: an independent linear policy, MLP baseline, and
# NPG agent per task, trained one task at a time (uses the names from the setup sketch
# above: num_seeds, num_tasks, num_cpu, SEED, job_name_stl, and e_unshuffled).
for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    job_name_stl_seed = job_name_stl + '/seed_{}'.format(i)

    # Shuffle the task order for this seed and build one agent per task.
    e = {}
    baseline_stl = {}
    policy_stl = {}
    agent_stl = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]
        baseline_stl[task_id] = MLPBaseline(e[task_id].spec, reg_coef=1e-3, batch_size=64,
                                            epochs=2, learn_rate=1e-3, use_gpu=True)
        policy_stl[task_id] = LinearPolicy(e[task_id].spec, seed=SEED)
        agent_stl[task_id] = NPG(e[task_id], policy_stl[task_id], baseline_stl[task_id],
                                 normalized_step_size=0.1, seed=SEED, save_logs=True)

    loggers_stl = {}
    grads_stl = {}
    hess_stl = {}
    for task_id in range(num_tasks):
        ts = timer.time()
        train_agent(job_name=job_name_stl_seed,
                    agent=agent_stl[task_id],
                    seed=SEED,
                    niter=200,
                    gamma=0.995,
                    gae_lambda=0.97,
                    num_cpu=num_cpu,
                    # sampling kwargs below are assumed; the original call was truncated here
                    sample_mode='trajectories',
                    num_traj=50,
                    save_freq=5,
                    evaluation_rollouts=5)
        print("time taken = %f" % (timer.time() - ts))
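# Additional names assumed by the EWC fragment below (hypothetical values): the sweep
# of EWC regularization strengths and the output directory. NPGEWC, the EWC-regularized
# NPG agent used there, is assumed to be provided by the surrounding project rather
# than stock mjrl, so no import path is shown here.
lambda_range = [1e-6, 1e-4, 1e-2, 1e0]   # hypothetical sweep over ewc_lambda
job_name_ewc = 'results/ewc_experiment'  # hypothetical output directory for the EWC runs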
# Elastic Weight Consolidation (EWC) baseline: one shared policy is trained on the
# tasks sequentially with an EWC penalty, swept over the strengths in lambda_range.
# The fragment is assumed to run per seed, mirroring the STL loop above.
for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    job_name_ewc_seed = job_name_ewc + '/seed_{}'.format(i)

    # Same shuffled task-order construction as in the STL block.
    e = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]

    for ewc_lambda in lambda_range:
        # Fresh per-task baselines, but a single shared policy and agent across tasks.
        baseline_ewc = {}
        for task_id in range(num_tasks):
            baseline_ewc[task_id] = MLPBaseline(e[task_id].spec, reg_coef=1e-3, batch_size=64,
                                                epochs=2, learn_rate=1e-3, use_gpu=True)
        policy_ewc = LinearPolicy(e[0].spec, seed=SEED)
        agent_ewc = NPGEWC(e, policy_ewc, baseline_ewc, ewc_lambda=ewc_lambda,
                           scaled_lambda=False, normalized_step_size=0.1,
                           seed=SEED, save_logs=True)
        job_name_ewc_seed_lambda = job_name_ewc_seed + '/lambda{}'.format(ewc_lambda)

        # Train the shared EWC agent on each task in sequence.
        for task_id in range(num_tasks):
            ts = timer.time()
            train_agent(job_name=job_name_ewc_seed_lambda,
                        agent=agent_ewc,
                        seed=SEED,
                        niter=50,
                        gamma=0.995,
                        gae_lambda=0.97,
                        num_cpu=num_cpu,
                        sample_mode='trajectories',
                        # remaining sampling kwargs assumed; the original call was truncated here
                        num_traj=50,
                        save_freq=5,
                        evaluation_rollouts=5)
            print("time taken = %f" % (timer.time() - ts))
import os

from mjrl.policies.gaussian_mlp import MLP
# fisher_optim (the natural-gradient optimizers) and build_log_dir are assumed to be
# provided by the surrounding project.


def launch_job(tag, variant):
    seed, env, algo, optim, curv_type, lr, batch_size, cg_iters, cg_residual_tol, cg_prev_init_coef, \
        cg_precondition_empirical, cg_precondition_regu_coef, cg_precondition_exp, \
        shrinkage_method, lanczos_amortization, lanczos_iters, approx_adaptive, betas, \
        use_nn_policy, gn_vfn_opt, total_samples = variant
    beta1, beta2 = betas
    iters = int(total_samples / batch_size)

    # Environment and policy (MLP or linear Gaussian policy)
    e = GymEnv(env)
    if use_nn_policy:
        policy = MLP(e.spec, hidden_sizes=(64,), seed=seed)
    else:
        policy = LinearPolicy(e.spec, seed=seed)

    # Value-function baseline; larger batches when Gauss-Newton optimization is used
    vfn_batch_size = 256 if gn_vfn_opt else 64
    vfn_epochs = 2
    baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=vfn_batch_size, epochs=vfn_epochs,
                           learn_rate=1e-3, use_gauss_newton=gn_vfn_opt)

    # Natural-gradient optimizer variants share the conjugate-gradient / shrinkage settings
    common_kwargs = dict(lr=lr,
                         curv_type=curv_type,
                         cg_iters=cg_iters,
                         cg_residual_tol=cg_residual_tol,
                         cg_prev_init_coef=cg_prev_init_coef,
                         cg_precondition_empirical=cg_precondition_empirical,
                         cg_precondition_regu_coef=cg_precondition_regu_coef,
                         cg_precondition_exp=cg_precondition_exp,
                         shrinkage_method=shrinkage_method,
                         lanczos_amortization=lanczos_amortization,
                         lanczos_iters=lanczos_iters,
                         batch_size=batch_size)
    if optim == 'ngd':
        optimizer = fisher_optim.NGD(policy.trainable_params, **common_kwargs)
    elif optim == 'natural_adam':
        optimizer = fisher_optim.NaturalAdam(policy.trainable_params, **common_kwargs,
                                             betas=(beta1, beta2),
                                             assume_locally_linear=approx_adaptive)
    elif optim == 'natural_adagrad':
        optimizer = fisher_optim.NaturalAdagrad(policy.trainable_params, **common_kwargs,
                                                betas=(beta1, beta2),
                                                assume_locally_linear=approx_adaptive)
    elif optim == 'natural_amsgrad':
        optimizer = fisher_optim.NaturalAmsgrad(policy.trainable_params, **common_kwargs,
                                                betas=(beta1, beta2),
                                                assume_locally_linear=approx_adaptive)
    else:
        raise ValueError('Unknown optimizer: {}'.format(optim))

    if algo == 'trpo':
        from mjrl.algos.trpo_delta import TRPO
        agent = TRPO(e, policy, baseline, optimizer, seed=seed, save_logs=True)
    else:
        from mjrl.algos.npg_cg_delta import NPG
        agent = NPG(e, policy, baseline, optimizer, seed=seed, save_logs=True)

    save_dir = build_log_dir(tag, variant)
    os.makedirs(save_dir, exist_ok=True)

    train_agent(job_name=save_dir,
                agent=agent,
                seed=seed,
                niter=iters,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='samples',
                num_samples=batch_size,
                save_freq=5,
                evaluation_rollouts=5,
                verbose=False)
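# Illustrative invocation of launch_job: a minimal sketch in which every value is a
# hypothetical placeholder chosen only to show the expected tuple layout, in the same
# order as the unpacking at the top of launch_job.
if __name__ == '__main__':
    example_variant = (
        123,              # seed
        'Walker2d-v2',    # env
        'trpo',           # algo: 'trpo', otherwise NPG is used
        'natural_adam',   # optim: 'ngd', 'natural_adam', 'natural_adagrad', or 'natural_amsgrad'
        'fisher',         # curv_type (passed through to fisher_optim; value is assumed)
        0.01,             # lr
        5000,             # batch_size (samples per update)
        10,               # cg_iters
        1e-10,            # cg_residual_tol
        0.0,              # cg_prev_init_coef
        False,            # cg_precondition_empirical
        0.0,              # cg_precondition_regu_coef
        0.0,              # cg_precondition_exp
        None,             # shrinkage_method
        0,                # lanczos_amortization
        0,                # lanczos_iters
        False,            # approx_adaptive
        (0.9, 0.99),      # betas
        True,             # use_nn_policy
        False,            # gn_vfn_opt
        1000000,          # total_samples
    )
    launch_job('example_tag', example_variant)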