Example #1
def train_expert_policy(config):
    print('-' * 80)
    previous_dir = os.getcwd()
    ensure_dir(GEN_DATA_DIR)
    os.chdir(GEN_DATA_DIR)

    print('Training Expert')
    e = make_gym_env(config['env_id'], config)
    policy = MLP(e.spec, hidden_sizes=(32, 32), seed=config['seed'])
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=64,
                           epochs=2,
                           learn_rate=1e-3)
    agent = NPG(e,
                policy,
                baseline,
                normalized_step_size=0.1,
                seed=config['seed'],
                save_logs=True)

    job_name = '%s_expert' % config['env_name']
    # Need to change where it dumps the policy
    train_agent(job_name=job_name,
                agent=agent,
                seed=config['seed'],
                niter=30,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='trajectories',
                num_traj=200,
                save_freq=5,
                evaluation_rollouts=5)
    os.chdir(previous_dir)
    os.rename(
        os.path.join(GEN_DATA_DIR, job_name, 'iterations/best_policy.pickle'),
        os.path.join(EXPERT_POLICIES_DIR, EXPERT_POLICIES[config['env_name']]))
    print('-' * 80)
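For reference, a minimal sketch of how this helper could be invoked. The config keys are exactly the ones train_expert_policy reads above; the values are placeholders, and make_gym_env, GEN_DATA_DIR, EXPERT_POLICIES_DIR and EXPERT_POLICIES are assumed to be defined at module level as in the snippet.

# Hypothetical driver; keys mirror the reads in train_expert_policy, values are placeholders.
example_config = {
    'env_id': 'Hopper-v2',   # gym id passed to make_gym_env
    'env_name': 'hopper',    # key used for the job name and the EXPERT_POLICIES lookup
    'seed': 123,
}
train_expert_policy(example_config)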
Example #2
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_linear import LinearPolicy
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import time as timer

SEED = 500

e = GymEnv('Walker2d-v2')
policy = LinearPolicy(e.spec, seed=SEED)
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=64,
                       epochs=2,
                       learn_rate=1e-3)
agent = NPG(e,
            policy,
            baseline,
            normalized_step_size=0.1,
            seed=SEED,
            save_logs=True)

ts = timer.time()
train_agent(job_name='walker_nominal',
            agent=agent,
            seed=SEED,
            niter=500,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
            sample_mode='trajectories',
            num_traj=50,
            save_freq=5,
            evaluation_rollouts=5)
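As Example #1 above suggests, train_agent saves the best policy under <job_name>/iterations/best_policy.pickle. Below is a minimal sketch of loading it back and running one greedy rollout; it assumes the usual mjrl policy interface, where get_action returns the sampled action plus an info dict whose 'evaluation' entry is the mean action, and is illustrative rather than part of the script above.

import pickle

# Load the best policy saved by train_agent (path pattern as in Example #1).
with open('walker_nominal/iterations/best_policy.pickle', 'rb') as f:
    best_policy = pickle.load(f)

# One evaluation rollout with the greedy (mean) action.
obs = e.reset()
done, total_reward = False, 0.0
while not done:
    action = best_policy.get_action(obs)[1]['evaluation']
    obs, reward, done, _ = e.step(action)
    total_reward += reward
print('evaluation return = %f' % total_reward)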
Example #3
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    os.chdir(cwd)
    dirpath = os.path.join(job['save_dir'], job['job_name'])
    os.makedirs(dirpath, exist_ok=True)

    # start job
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    e = GymEnv(job['env_name'])

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip
    with open(os.path.join(dirpath, 'job_data.txt'), 'w') as job_data_file:
        pprint.pprint(job, stream=job_data_file)

    # Make policy (NOTE: hidden sizes are hard-coded)
    if 'init_policy' in job:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print('log std values in loaded policy = ')
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say < -2.0; it is problem dependent and intuition should be used)
        # then we need to bump it up so that the policy explores, e.g.:
        # loaded_params[-policy.m:] += 1.0
        policy.set_param_values(loaded_params)
        del job['init_policy']

    else:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
    # Agent
    agent = NPG(e,
                policy,
                baseline,
                seed=job['seed'],
                normalized_step_size=job['normalized_step_size'],
                save_logs=job['save_logs'],
                FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=dirpath,
        agent=agent,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job['num_traj'],
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
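A minimal sketch of the job dictionary this worker expects: the keys are the ones single_process reads above (cwd is assumed to be a module-level working-directory variable), while every value here is a placeholder rather than a setting from the original experiments.

# Hypothetical job specification; values are placeholders only.
example_job = {
    'job_name': 'hopper_npg_demo',
    'save_dir': 'results',
    'env_name': 'Hopper-v2',
    'seed': 0,
    'init_std': -0.5,        # initial log std of the Gaussian policy
    'normalized_step_size': 0.1,
    'save_logs': True,
    'FIM_invert_args': {'iters': 10, 'damping': 1e-4},
    'niter': 100,
    'gamma': 0.995,
    'gae_lambda': 0.97,
    'num_cpu': 1,
    'sample_mode': 'trajectories',
    'num_traj': 50,
    'evaluation_rollouts': 5,
    'save_freq': 5,
}
single_process(example_job)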
Example #4
    forward_transfer_results = {}
    for task_id in range(num_tasks):
        with open(
                job_name_stl_seed + '/iterations/task_{}/'.format(task_id) +
                'policy_0.pickle', 'rb') as f:
            policy_stl = pickle.load(f)
        with open(
                job_name_stl_seed + '/iterations/task_{}/'.format(task_id) +
                'baseline_0.pickle', 'rb') as f:
            baseline_stl = pickle.load(f)

        agent_stl = NPG(e[task_id],
                        policy_stl,
                        baseline_stl,
                        normalized_step_size=0.01,
                        seed=SEED,
                        save_logs=False)
        agent_stl.set_task(task_id)
        eval_paths = trajectory_sampler.sample_paths_parallel(
            N=10,
            policy=policy_stl,
            num_cpu=num_cpu,
            env_name=e[task_id].env_id,
            mode='evaluation',
            pegasus_seed=SEED)

        forward_transfer_results[task_id] = np.mean(
            [np.sum(path['rewards']) for path in eval_paths])

    result_file = open(job_name_stl_seed + '/start_results.txt', 'w')
Example #5
    grads_stl = pickle.load(f)
    f.close()
    with open(job_name_stl_seed + '/trained_stl_hess.pickle', 'rb') as f:
        hess_stl = pickle.load(f)
    with open(job_name_stl_seed + '/task_order.pickle', 'rb') as f:
        task_order = pickle.load(f)

    e = {}
    agent_stl = {}
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]
        agent_stl[task_id] = NPG(e[task_id],
                                 policy_stl[task_id],
                                 baseline_stl[task_id],
                                 normalized_step_size=0.01,
                                 seed=SEED,
                                 save_logs=True)
        agent_stl[task_id].grad = grads_stl[task_id]
        agent_stl[task_id].hess = hess_stl[task_id]

    k = 5                             # number of shared latent components
    n = policy_stl[0].n               # observation dimension
    m = policy_stl[0].m               # action dimension
    d = (n + 1) * m                   # parameters per linear policy (weights + bias)
    A = np.zeros((d * k, d * k))
    b = np.zeros((d * k, 1))
    S = np.zeros((k, num_tasks))      # per-task coefficients over the k components
    L = np.zeros((d, k))              # shared basis of policy parameters
    Theta = np.zeros((d, num_tasks))  # per-task (column-wise) policy parameters
    policy_mtl = LinearPolicyLPGFTW(e[0].spec, k=k, max_k=k, seed=SEED)
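To make the shapes above concrete, here is a small numpy sketch of the factorization they set up: each task's linear policy parameters form a column of Theta, represented as a combination of the k shared columns of L. The weight/bias split below only illustrates why d = (n + 1) * m; the actual parameter layout inside mjrl policies may differ.

import numpy as np

# Illustrative dimensions only (not taken from the experiments above).
n, m, k, num_tasks = 11, 3, 5, 10
d = (n + 1) * m                    # m x n weight matrix plus an m-dimensional bias

L = np.random.randn(d, k)          # shared basis
S = np.random.randn(k, num_tasks)  # per-task coefficients
Theta = L @ S                      # column t approximates task t's policy parameters

theta_0 = Theta[:, 0]
W, bias = theta_0[:n * m].reshape(m, n), theta_0[n * m:]
print(W.shape, bias.shape)         # (3, 11) (3,)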
Example #6
for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    job_name_stl_seed = job_name_stl + '/seed_{}'.format(i)

    e = {}
    baseline_stl = {}
    policy_stl = {}
    agent_stl = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]
        baseline_stl[task_id] = MLPBaseline(e[task_id].spec,
                                            reg_coef=1e-3,
                                            batch_size=64,
                                            epochs=2,
                                            learn_rate=1e-3,
                                            use_gpu=True)
        policy_stl[task_id] = LinearPolicy(e[task_id].spec, seed=SEED)
        agent_stl[task_id] = NPG(e[task_id],
                                 policy_stl[task_id],
                                 baseline_stl[task_id],
                                 normalized_step_size=0.1,
                                 seed=SEED,
                                 save_logs=True)

    loggers_stl = {}
    grads_stl = {}
    hess_stl = {}
    for task_id in range(num_tasks):
        ts = timer.time()

        train_agent(job_name=job_name_stl_seed,
                    agent=agent_stl[task_id],
                    seed=SEED,
                    niter=200,
                    gamma=0.995,
                    gae_lambda=0.97,
                    num_cpu=num_cpu,
                    sample_mode='trajectories',
Example #7
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    # Create a directory for the job results.
    job_dir = job['output_dir']
    if not os.path.isdir(job_dir):
        os.mkdir(job_dir)

    # start job
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    env_name = job['env_name']
    # adept_envs.global_config.set_config(env_name, {
    #     'robot_params': job['robot'],
    #     **job.get('env_params', {}),
    # })
    e = GymEnv(env_name)

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip

    with open(os.path.join(job_dir, 'job_data.txt'), 'w') as job_data_file:
        pprint.pprint(job, stream=job_data_file)

    if 'init_policy' in job:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print("log std values in loaded policy = ")
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small 
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        loaded_params[-policy.m:] += job['init_std']
        policy.set_param_values(loaded_params)
        del job['init_policy']

    else:
        policy = MLP(
            e.spec,
            init_log_std=job['init_std'],
            hidden_sizes=job['hidden_sizes'],
            # hidden_sizes=(32, 32),
            seed=job['seed'])

    # Agent
    agent = NPG(
        e,
        policy,
        baseline,
        seed=job['seed'],
        normalized_step_size=job['normalized_step_size'],
        save_logs=job['save_logs'],
        FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=job['job_name'],
        agent=agent,
        # save_dir=job_dir,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job.get('num_traj'),
        num_samples=job.get('num_samples'),
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
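This variant of single_process reads slightly different keys than Example #3 ('output_dir' and 'hidden_sizes' instead of 'save_dir', with 'num_traj'/'num_samples' optional via job.get). Since it clears the daemon flag of mp.current_process(), it is presumably launched from worker processes; the dispatch loop below is a hypothetical sketch with placeholder jobs, not the launcher used in the original project.

import multiprocessing as mp

# Placeholder base job; the keys mirror the reads in single_process above.
base_job = {
    'job_name': 'hopper_npg_demo', 'output_dir': 'results/hopper_npg_demo',
    'env_name': 'Hopper-v2', 'seed': 0, 'init_std': -0.5,
    'hidden_sizes': (32, 32), 'normalized_step_size': 0.1, 'save_logs': True,
    'FIM_invert_args': {'iters': 10, 'damping': 1e-4},
    'niter': 100, 'gamma': 0.995, 'gae_lambda': 0.97, 'num_cpu': 1,
    'sample_mode': 'trajectories', 'num_traj': 50,
    'evaluation_rollouts': 5, 'save_freq': 5,
}
jobs = [dict(base_job,
             seed=s,
             job_name='hopper_npg_seed%d' % s,
             output_dir='results/hopper_npg_seed%d' % s) for s in range(3)]

if __name__ == '__main__':
    processes = [mp.Process(target=single_process, args=(j,)) for j in jobs]
    for p in processes:
        p.start()
    for p in processes:
        p.join()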
Example #8
             init_log_std=job_data['init_log_std'])
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=job_data['vf_batch_size'],
                       hidden_sizes=job_data['vf_hidden_size'],
                       epochs=job_data['vf_epochs'],
                       learn_rate=job_data['vf_learn_rate'])

# Construct the algorithm
if job_data['algorithm'] == 'NPG':
    # Other hyperparameters (e.g. the number of CG steps) can be specified in the
    # config and passed through; otherwise the algorithm's defaults are used.
    agent = NPG(e,
                policy,
                baseline,
                normalized_step_size=job_data['rl_step_size'],
                seed=job_data['seed'],
                save_logs=True,
                **job_data['alg_hyper_params'])

elif job_data['algorithm'] == 'VPG':
    agent = BatchREINFORCE(e,
                           policy,
                           baseline,
                           learn_rate=job_data['rl_step_size'],
                           seed=job_data['seed'],
                           save_logs=True,
                           **job_data['alg_hyper_params'])

elif job_data['algorithm'] == 'NVPG':
    agent = BatchREINFORCE(e,
Example #9
def launch_job(tag, variant):

    print(len(variant))
    (seed, env, algo, optim, curv_type, lr, batch_size, cg_iters,
     cg_residual_tol, cg_prev_init_coef, cg_precondition_empirical,
     cg_precondition_regu_coef, cg_precondition_exp, shrinkage_method,
     lanczos_amortization, lanczos_iters, approx_adaptive, betas,
     use_nn_policy, gn_vfn_opt, total_samples) = variant
    beta1, beta2 = betas

    iters = int(total_samples / batch_size)

    # NN policy
    # ==================================
    e = GymEnv(env)
    if use_nn_policy:
        policy = MLP(e.spec, hidden_sizes=(64, ), seed=seed)
    else:
        policy = LinearPolicy(e.spec, seed=seed)
    vfn_batch_size = 256 if gn_vfn_opt else 64
    vfn_epochs = 2  # same number of value-function epochs in both settings
    # baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
    baseline = MLPBaseline(e.spec,
                           reg_coef=1e-3,
                           batch_size=vfn_batch_size,
                           epochs=vfn_epochs,
                           learn_rate=1e-3,
                           use_gauss_newton=gn_vfn_opt)
    # agent = NPG(e, policy, baseline, normalized_step_size=0.005, seed=SEED, save_logs=True)

    common_kwargs = dict(lr=lr,
                         curv_type=curv_type,
                         cg_iters=cg_iters,
                         cg_residual_tol=cg_residual_tol,
                         cg_prev_init_coef=cg_prev_init_coef,
                         cg_precondition_empirical=cg_precondition_empirical,
                         cg_precondition_regu_coef=cg_precondition_regu_coef,
                         cg_precondition_exp=cg_precondition_exp,
                         shrinkage_method=shrinkage_method,
                         lanczos_amortization=lanczos_amortization,
                         lanczos_iters=lanczos_iters,
                         batch_size=batch_size)

    if optim == 'ngd':
        optimizer = fisher_optim.NGD(policy.trainable_params, **common_kwargs)
    elif optim == 'natural_adam':
        optimizer = fisher_optim.NaturalAdam(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_adagrad':
        optimizer = fisher_optim.NaturalAdagrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    elif optim == 'natural_amsgrad':
        optimizer = fisher_optim.NaturalAmsgrad(
            policy.trainable_params,
            **common_kwargs,
            betas=(beta1, beta2),
            assume_locally_linear=approx_adaptive)
    else:
        raise ValueError('unrecognized optim: %s' % optim)

    if algo == 'trpo':
        from mjrl.algos.trpo_delta import TRPO
        agent = TRPO(e, policy, baseline, optimizer, seed=seed, save_logs=True)
        # agent = TRPO(e, policy, baseline, seed=seed, save_logs=True)
    else:
        from mjrl.algos.npg_cg_delta import NPG
        agent = NPG(e, policy, baseline, optimizer, seed=seed, save_logs=True)

    save_dir = build_log_dir(tag, variant)
    os.makedirs(save_dir, exist_ok=True)

    # print ("Iters:", iters, ", num_traj: ", str(batch_size//1000))
    train_agent(job_name=save_dir,
                agent=agent,
                seed=seed,
                niter=iters,
                gamma=0.995,
                gae_lambda=0.97,
                num_cpu=1,
                sample_mode='samples',
                num_samples=batch_size,
                save_freq=5,
                evaluation_rollouts=5,
                verbose=False)
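For reference, the variant tuple must supply the 21 fields in the exact order they are unpacked at the top of launch_job. The tuple below is a hypothetical example with placeholder values (including the 'fisher' curvature type and the step sizes), not a configuration from the original experiments.

# Hypothetical variant; field order matches the unpacking in launch_job.
variant = (
    0,               # seed
    'Walker2d-v2',   # env
    'trpo',          # algo ('trpo' -> trpo_delta.TRPO, otherwise npg_cg_delta.NPG)
    'natural_adam',  # optim
    'fisher',        # curv_type (placeholder)
    0.01,            # lr
    5000,            # batch_size
    10,              # cg_iters
    1e-10,           # cg_residual_tol
    0.0,             # cg_prev_init_coef
    False,           # cg_precondition_empirical
    0.0,             # cg_precondition_regu_coef
    0.0,             # cg_precondition_exp
    None,            # shrinkage_method
    0,               # lanczos_amortization
    0,               # lanczos_iters
    False,           # approx_adaptive
    (0.9, 0.99),     # betas
    True,            # use_nn_policy
    False,           # gn_vfn_opt
    1000000,         # total_samples
)
launch_job('demo_tag', variant)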