Example #1
# (fragment begins partway through the policy constructor call)
        hidden_sizes=args.hidden_sizes,
        hidden_nonlinearity=nonlin)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=args.batch_size,
    max_path_length=args.max_traj_len,
    n_itr=args.n_itr,
    discount=0.99,
    step_size=0.01,
    #force_batch_sampler=True,
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5)))
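
# Not shown in this fragment: once the log directory below is registered with
# rllab's logger, the run is typically started with BatchPolopt's training
# loop, i.e.
#
#     algo.train()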

# use date and time to create new logging directory for each run
date = calendar.datetime.date.today().strftime('%y-%m-%d')
if date not in os.listdir(model_path):
    os.mkdir(model_path + '/' + date)

c = 0
exp_name = args.environment + '-' + str(c)

while exp_name in os.listdir(model_path + '/' + date + '/'):
    c += 1
    exp_name = args.environment + '-' + str(c)

exp_dir = date + '/' + exp_name
log_dir = osp.join(config.LOG_DIR, exp_dir)
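
# The dated, uniquely numbered run directory above is a recurring pattern in
# these examples.  A reusable sketch of the same idea (illustrative only: the
# helper name make_run_dir, and the fact that it also creates the run folder,
# are not part of the original script):
import datetime
import os
import os.path as osp

def make_run_dir(root, prefix):
    """Return root/<yy-mm-dd>/<prefix>-<n> for the smallest unused n."""
    day_dir = osp.join(root, datetime.date.today().strftime('%y-%m-%d'))
    if not osp.isdir(day_dir):
        os.makedirs(day_dir)
    c = 0
    while osp.isdir(osp.join(day_dir, '{}-{}'.format(prefix, c))):
        c += 1
    run_dir = osp.join(day_dir, '{}-{}'.format(prefix, c))
    os.makedirs(run_dir)
    return run_dir
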
Example #2
            decay_rate=args.decay_rate,
            decay_steps=args.decay_steps,
            act_mean=initial_act_mean,
            act_std=initial_act_std,
            freeze_upper=args.freeze_upper,
            freeze_lower=args.freeze_lower,
            fo_optimizer_cls=tf.train.AdamOptimizer,
            load_params_args=None,
            temporal_indices=temporal_indices,
            temporal_noise_thresh=args.temporal_noise_thresh,
            fo_optimizer_args=dict(learning_rate=args.adam_lr,
                                   beta1=args.adam_beta1,
                                   beta2=args.adam_beta2,
                                   epsilon=args.adam_epsilon),
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

# Load checkpoint if desired
if len(args.ckpt_name) > 0:
    policy.load_params(args.ckpt_name, args.ckpt_itr, [])
    baseline.load_params(args.ckpt_name, args.ckpt_itr, [])
    reward.load_params(args.ckpt_name, args.ckpt_itr, [])
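
# Equivalent, slightly more compact form of the restore block above (a sketch
# only; it relies on the same load_params(name, itr, exclude) signature used
# by these classes in this codebase):
#
#     if len(args.ckpt_name) > 0:
#         for model in (policy, baseline, reward):
#             model.load_params(args.ckpt_name, args.ckpt_itr, [])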

# use date and time to create new logging directory for each run
date = calendar.datetime.date.today().strftime('%y-%m-%d')
if date not in os.listdir(model_path):
    os.mkdir(model_path + '/' + date)

c = 0
exp_name = args.exp_name + '-' + str(c)
Example #3
def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # create the environment
    env = _create_env(args)

    # create expert data
    expert_data_T, expert_data_V = _create_expert_data(args)
    expert_data = dict(
        train=expert_data_T,
        valid=expert_data_V,
    )

    # create policy
    policy, init_ops = _create_policy(args, env)

    # create auxiliary networks (invdyn, reward, variational posterior)
    invdyn_model, reward_model, info_model, env = _create_aux_networks(args, env)

    # create baseline
    if args.baseline_type == "linear":
        baseline = LinearFeatureBaseline(env_spec=None)
    else:
        raise ValueError('unsupported baseline_type: {}'.format(args.baseline_type))
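
    # The dispatch above could be extended with other rllab baselines, for
    # example (illustrative sketch; GaussianMLPBaseline lives in
    # rllab.baselines.gaussian_mlp_baseline in the upstream repo):
    #
    #     elif args.baseline_type == "mlp":
    #         baseline = GaussianMLPBaseline(env_spec=env.spec)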

    # use date and time to create new logging directory for each run
    date = calendar.datetime.date.today().strftime('%y-%m-%d')
    if date not in os.listdir(model_path):
        os.mkdir(model_path + '/' + date)

    c = 0
    exp_name = '{}-{}'.format(args.exp_name, c)

    while exp_name in os.listdir(model_path + '/' + date + '/'):
        c += 1
        exp_name = '{}-{}'.format(args.exp_name, c)

    exp_dir = date + '/' + exp_name
    log_dir = osp.join(config.LOG_DIR, exp_dir)

    policy.set_log_dir(log_dir)
    if info_model is not None:
        info_model.set_log_dir(log_dir)

    _create_log(args)

    # run GAIL algorithm
    models = {"policy": policy, "info": info_model, "reward": reward_model}
    bpo_args = dict(
        n_itr=args.n_itr,
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.trpo_batch_size,
        max_path_length=args.max_path_length,
        discount=args.discount,
        step_size=args.trpo_step_size,
        force_batch_sampler=True,
        whole_paths=True,
        init_ops=init_ops,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        save_models=[models[model_name] for model_name in args.save_models]
        )
    vae_args = dict(kl_weight=args.kl_weight)
    curriculum = dict(
        start=args.curr_start,
        add=args.curr_add,
        step=args.curr_step,
    )
    if not args.model_all:
        curriculum = {}
    # merge the TRPO/BPO kwargs with the VAE kwargs; concatenating dict.items()
    # only works on Python 2, so build the merged dict directly instead
    kwargs = dict(bpo_args, **vae_args)
    algo = GAIL(
                args.exp_name,
                exp_name,
                expert_data,
                reward_model,
                args.gail_batch_size,
                invdyn_model=invdyn_model,
                info_model=info_model,
                debug=args.debug,
                model_all=args.model_all,
                curriculum=curriculum,
                rew_aug=args.rew_aug,
                use_replay_buffer=args.use_replay_buffer,
                **kwargs
                )

    runner = RLLabRunner(algo, args, exp_dir)
    runner.train()
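
# The fragment above assumes a module-level `args` namespace (argparse) and
# helper functions (_create_env, _create_expert_data, _create_policy, ...)
# defined elsewhere in the script.  An illustrative parser covering a few of
# the flags referenced above might look like this (defaults are placeholders,
# not the original values):
#
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--exp_name', type=str, default='gail')
#     parser.add_argument('--seed', type=int, default=0)
#     parser.add_argument('--n_itr', type=int, default=500)
#     parser.add_argument('--trpo_batch_size', type=int, default=50000)
#     parser.add_argument('--max_path_length', type=int, default=1000)
#     parser.add_argument('--discount', type=float, default=0.99)
#     parser.add_argument('--trpo_step_size', type=float, default=0.01)
#     args = parser.parse_args()
#
#     if __name__ == '__main__':
#         main()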