Example #1
def main(exp_name=None, fusion=False, latent_dim=3):
    max_path_length = 100
    info_coeff = 0.1
    imitation_coeff = 0.01
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = 1000
    entropy_weight = 1.0
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    # load ~2 iterations' worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # contextual policy pi(a|s,m)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    # approximate posterior q(m|tau)
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(
                np.concatenate((env.observation_space.low[:-latent_dim],
                                env.action_space.low)), max_path_length),
            np.tile(
                np.concatenate((env.observation_space.high[:-latent_dim],
                                env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts,
                              policy,
                              context_encoder,
                              env,
                              latent_dim,
                              batch_size=400,
                              kl_weight=0.1,
                              epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0

    irl_model = InfoAIRL(env=env,
                         policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts,
                         state_only=True,
                         max_path_length=max_path_length,
                         fusion=fusion,
                         max_itrs=max_itrs,
                         meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff,
                         latent_dim=latent_dim)

    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=3000,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    if fusion:
        dirname = 'data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    else:
        dirname = 'data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)

    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
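A minimal sketch of how the entry point above might be invoked if the file is run directly; the experiment name is an illustrative placeholder, not a value from the original code.

if __name__ == '__main__':
    main(exp_name='maze_left_run0', fusion=False, latent_dim=3)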
Example #2
def main(exp_name=None, params_folder='data/ant_state_irl'):
    # env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=True, record_log=True,force_reset=True))
    env = TfEnv(
        CustomGymEnv('DisabledAnt-v0',
                     record_video=False,
                     record_log=False,
                     force_reset=False))

    irl_itr = 90  # earlier IRL iterations overfit less; 80 or 90 usually works well, but I typically search over 60, 65, 70, ..., 100
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr))
    prior_params = load_prior_params(params_file)
    '''q_itr = 400  # earlier IRL iterations overfit less; 100 seems to work well.
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (q_itr))
    prior_params_q = load_prior_params(params_file)'''

    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    qvar = GaussianMLPInversePolicy(name='qvar_model',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=None, max_itrs=10)
    irl_model = EAIRL(env=env,
                      expert_trajs=experts,
                      state_only=False,
                      score_discrim=False)
    empw_model = Empowerment(env=env, max_itrs=1)
    t_empw_model = Empowerment(env=env,
                               scope='t_efn',
                               max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    algo = IRLTRPO(
        init_irl_params=prior_params['irl_params'],
        init_empw_params=None,  #prior_params['empw_params'],
        init_qvar_params=None,  #prior_params['qvar_params'],
        init_policy_params=prior_params['policy_params'],  #None
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=True,
        train_empw=True,
        train_qvar=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
        # plot=True,
    )

    with rllab_logdir(algo=algo, dirname='data/ant_transfer'):  #%s'%exp_name):
        #with rllab_logdir(algo=algo, dirname='data/ant_transfer%s'%exp_name):
        with tf.Session():
            algo.train()
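A hedged sketch of the manual checkpoint search described in the irl_itr comment above, assuming the candidate iteration were exposed as a knob; it only reports which itr_*.pkl files exist under DATA_DIR.

import os  # already imported by the snippet above

for candidate_itr in range(60, 101, 5):
    candidate_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % candidate_itr)
    if os.path.exists(candidate_file):
        print('candidate IRL checkpoint:', candidate_file)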
Example #3
from sandbox.rocky.tf.baselines.gaussian_conv_baseline import GaussianConvBaseline
from sandbox.rocky.tf.policies.conv_nn_policy import ConvNNPolicy
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.envs.gym_env import GymEnv
import itertools

stub(globals())

# Param ranges
seeds = range(5)

for seed in seeds:
    mdp = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v2', record_video=False,
                                     log_dir='/tmp/gym_test', record_log=False)))
    
    policy = ConvNNPolicy(
        "conv_policy",
        env_spec=mdp.spec,
        conv_filters=(32, 32, 32, 32),
        conv_filter_sizes=((3,3),(3,3),(3,3),(3,3)),
        conv_strides=(2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256,),
    )

    baseline = GaussianConvBaseline(
        mdp.spec,
        regressor_args={
            'conv_filters':(32, 32, 32, 32),
Example #4


for l2loss_std_mult in l2loss_std_mult_list:
    for post_std_modifier_train in post_std_modifier_train_list:
        for post_std_modifier_test in post_std_modifier_test_list:
            for pre_std_modifier in pre_std_modifier_list:
                for fast_learning_rate in fast_learning_rates:
                    for beta_steps in beta_steps_list:
                        for bas in baselines:
                            stub(globals())

                            seed = 1
                            #env = TfEnv(normalize(GymEnv("Pusher-v0", force_reset=True, record_video=False)))  #TODO: force_reset was True
                            #xml_filepath ='home/kevin/rllab_copy/vendor/local_mujoco_models/ensure_woodtable_distractor_pusher%s.xml' % seed
                            env = TfEnv(normalize(ReacherEnv()))

#                            expert_policy = pickle.load()

                            policy = MAMLGaussianMLPPolicy(
                                name="policy",
                                env_spec=env.spec,
                                grad_step_size=fast_learning_rate,
                                hidden_nonlinearity=tf.nn.relu,
                                hidden_sizes=(100, 100),
                            )
                            if bas == 'zero':
                                baseline = ZeroBaseline(env_spec=env.spec)
                            elif 'linear' in bas:
                                baseline = LinearFeatureBaseline(env_spec=env.spec)
                            else:
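                                # hedged completion of the truncated else-branch (an assumption,
                                # mirroring the identical if/elif/else in Example #21):
                                baseline = GaussianMLPBaseline(env_spec=env.spec)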
Example #5
from __future__ import print_function
from __future__ import absolute_import

from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
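    # hedged completion of the truncated call above; step_size and the
    # run_experiment_lite arguments below are assumptions, consistent with
    # the TRPO / run_experiment_lite settings used elsewhere in this collection
    step_size=0.01,
)

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
)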
Example #6
 for ism in importance_sampling_modifier_list:
     for limit_demos_num in limit_demos_num_list:
         for l2loss_std_mult in l2loss_std_mult_list:
             for post_std_modifier_train in post_std_modifier_train_list:
                 for post_std_modifier_test in post_std_modifier_test_list:
                     for pre_std_modifier in pre_std_modifier_list:
                         for fast_learning_rate in fast_learning_rates:
                             for beta_steps, adam_steps in beta_adam_steps_list:
                                 for bas in baselines:
                                     stub(globals())
                                     tf.set_random_seed(
                                         seed)
                                     np.random.seed(seed)
                                     rd.seed(seed)
                                     env = TfEnv(
                                         normalize(
                                             Reacher7DofMultitaskEnv(
                                             )))
                                     exp_name = str(
                                         'R7_IL'
                                         # +time.strftime("%D").replace("/", "")[0:4]
                                         + goals_suffix +
                                         "_" + str(seed)
                                         # + str(envseed)
                                         + (""
                                            if use_corr_term
                                            else "nocorr")
                                         # + str(int(use_maml))
                                         + ('_fbs' + str(
                                             fast_batch_size
                                         ) if
                                            fast_batch_size
Example #7
def experiment(variant, comet_logger=None):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    
    from sandbox.rocky.tf.algos.vpg import VPG as vpg_basic
    from sandbox.rocky.tf.algos.vpg_biasADA import VPG as vpg_biasADA
    from sandbox.rocky.tf.algos.vpg_fullADA import VPG as vpg_fullADA
    from sandbox.rocky.tf.algos.vpg_conv import VPG as vpg_conv
    from sandbox.rocky.tf.algos.ppo import PPO as ppo
    
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.envs.mujoco.sawyer_xyz.multi_domain.push_door import Sawyer_MultiDomainEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_coffee import SawyerCoffeeEnv
    
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler
    # import gym
    
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_ppo import \
            MAMLGaussianMLPPolicy as PPO_policy
    
    import pickle
    import argparse
    from sandbox.rocky.tf.envs.base import TfEnv
    import csv
    import joblib
    import numpy as np
    import pickle
    import tensorflow as tf
    
    print("%%%%%%%%%%%%%%%%%", comet_logger)
    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']

    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType
    print("$$$$$$$$$$$$$$$ RL-TASK: ", str(tasks[taskIndex]), " $$$$$$$$$$$$$$$")
    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)


    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'Biped' in envType:
        # import terrainRLSim
        # from simAdapter import terrainRLSim
        import simAdapter
        import gym
        env = gym.make("PD_Biped2D_Gaps_Terrain-v0")
        env = TfEnv(normalize(env))
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)

    else:
        raise AssertionError('Unknown envType: %s' % envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    # baseline = ZeroBaseline(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    batch_size = variant['batch_size']

    if policyType == 'fullAda_Bias':

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),

            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger
        )

    elif policyType == 'biasAda_Bias':

        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir
        )
        
    elif policyType == 'PPO':

        policy = PPO_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=variant['init_flr'],
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(128, 128),
            init_flr_full=variant['init_flr'],
            latent_dim=variant['ldim'],
            learn_std=False
        )
        
        algo = ppo(
            env=env,
            policy=policy,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt = True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger
        )

    elif policyType == 'basic':

        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),

            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={'init_learning_rate': default_step,
                            'tf_optimizer_args': {'learning_rate': 0.5 * default_step},
                            'tf_optimizer_cls': tf.train.GradientDescentOptimizer},
            log_dir=log_dir
            # extra_input="onehot_exploration", # added by RK 6/19
            # extra_input_dim=5, # added by RK 6/19
        )


    elif 'conv' in policyType:

        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt = True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir

        )

    else:
        raise AssertionError('policyType must be one of: fullAda_Bias, biasAda_Bias, PPO, basic, or a conv variant')

    algo.train()
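For reference, a hypothetical variant dictionary covering the keys this function reads; every value below is an illustrative placeholder rather than a setting from the original experiments.

example_variant = dict(
    seed=1,
    log_dir='/tmp/maml_rl_finetune',          # placeholder
    n_parallel=1,
    init_file='/path/to/init_policy.pkl',     # placeholder checkpoint
    taskIndex=0,
    n_itr=100,
    default_step=0.01,
    policyType='biasAda_Bias',   # fullAda_Bias, biasAda_Bias, PPO, basic, or a conv variant
    envType='PickPlace',         # MultiDomain, Push, PickPlace, Door, Ant, Biped, or Coffee
    tasksFile='pickPlace_20X20_v1',           # resolved under multiworld/envs/goals/
    max_path_length=150,
    batch_size=10000,
    init_flr=0.05,               # only read by the PPO branch
    ldim=4,                      # only read by the PPO branch
)
experiment(example_variant)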
Example #8
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str, help="name of gym env")
    parser.add_argument('dataset_path',
                        type=str,
                        help="path of training and validation dataset")
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--tfmodel_path', type=str, default='/tmp/tfmodels')
    # Training parameters
    parser.add_argument('--val_ratio',
                        type=float,
                        default=0.1,
                        help="ratio of validation sets")
    parser.add_argument('--num_itr', type=int, default=10000000)
    parser.add_argument('--val_freq', type=int, default=1000)
    parser.add_argument('--log_freq', type=int, default=200)
    parser.add_argument('--save_freq', type=int, default=5000)

    # ICM parameters
    parser.add_argument('--init_lr', type=float, default=1e-4)
    parser.add_argument('--forward_weight',
                        type=float,
                        default=0.8,
                        help="the ratio of forward loss vs inverse loss")
    parser.add_argument('--cos_forward',
                        action='store_true',
                        help="whether to use cosine forward loss")
    # parser.add_argument('--norm_input', action='store_true',
    # 					help="whether to normalize observation input")

    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(args.env_name, record_video=False,
                                     log_dir='/tmp/gym_test', record_log=False)))

    # Get dataset
    dataset_names = list(
        map(lambda file_name: osp.join(args.dataset_path, file_name),
            listdir(args.dataset_path)))
    val_set_names = dataset_names[:int(len(dataset_names) * args.val_ratio)]
    train_set_names = dataset_names[int(len(dataset_names) * args.val_ratio):]

    train_queue = tf.train.string_input_producer(train_set_names,
                                                 num_epochs=None)
    val_queue = tf.train.string_input_producer(val_set_names, num_epochs=None)

    train_obs, train_next_obs, train_action = read_and_decode(
        train_queue, env.observation_space.shape, env.action_space.shape)
    val_obs, val_next_obs, val_action = read_and_decode(
        val_queue, env.observation_space.shape, env.action_space.shape)

    # Build ICM model
    # if args.norm_input:
    # 	train_obs = train_obs * (1./255) - 0.5
    # 	train_next_obs = train_next_obs *(1./255) - 0.5
    # 	val_obs = val_obs * (1./255) - 0.5
    # 	val_next_obs = val_next_obs * (1./255) - 0.5
    # 	train_obs = tf.cast(train_obs, tf.float32) / 255.0 - 0.5
    # 	train_next_obs = tf.cast(train_next_obs, tf.float32) / 255.0 - 0.5
    # 	val_obs = tf.cast(val_obs, tf.float32) / 255.0 - 0.5
    # 	val_next_obs = tf.cast(val_next_obs, tf.float32) / 255.0 - 0.5
    # else:
    # 	train_obs = tf.cast(train_obs, tf.float32)
    # 	train_next_obs = tf.cast(train_next_obs, tf.float32)
    # 	val_obs = tf.cast(val_obs, tf.float32)
    # 	val_next_obs = tf.cast(val_next_obs, tf.float32)

    _encoder = ConvEncoder(
        feature_dim=256,
        input_shape=env.observation_space.shape,
        conv_filters=(64, 64, 64, 32),
        conv_filter_sizes=((5, 5), (5, 5), (5, 5), (3, 3)),
        conv_strides=(3, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256, ),
        hidden_activation=tf.nn.elu,
    )
    _inverse_model = InverseModel(
        feature_dim=256,
        env_spec=env.spec,
        hidden_sizes=(256, ),
        hidden_activation=tf.nn.tanh,
        output_activation=tf.nn.tanh,
    )
    _forward_model = ForwardModel(
        feature_dim=256,
        env_spec=env.spec,
        hidden_sizes=(256, ),
        hidden_activation=tf.nn.elu,
    )

    sess = tf.Session()
    _encoder.sess = sess
    _inverse_model.sess = sess
    _forward_model.sess = sess

    with sess.as_default():
        # Initialize variables for get_copy to work
        sess.run(tf.global_variables_initializer())

        train_encoder1 = _encoder.get_weight_tied_copy(
            observation_input=train_obs)
        train_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=train_next_obs)
        train_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=train_encoder1.output,
            feature_input2=train_encoder2.output)
        train_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=train_encoder1.output, action_input=train_action)

        val_encoder1 = _encoder.get_weight_tied_copy(observation_input=val_obs)
        val_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=val_next_obs)
        val_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=val_encoder1.output,
            feature_input2=val_encoder2.output)
        val_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=val_encoder1.output, action_input=val_action)
        if args.cos_forward:
            train_forward_loss = cos_loss(train_encoder2.output,
                                          train_forward_model.output)
            val_forward_loss = cos_loss(val_encoder2.output,
                                        val_forward_model.output)
        else:
            train_forward_loss = tf.reduce_mean(
                tf.square(train_encoder2.output - train_forward_model.output))
            val_forward_loss = tf.reduce_mean(
                tf.square(val_encoder2.output - val_forward_model.output))

        train_inverse_loss = tf.reduce_mean(
            tf.square(train_action - train_inverse_model.output))
        val_inverse_loss = tf.reduce_mean(
            tf.square(val_action - val_inverse_model.output))
        train_total_loss = args.forward_weight * train_forward_loss + (
            1. - args.forward_weight) * train_inverse_loss
        val_total_loss = args.forward_weight * val_forward_loss + (
            1. - args.forward_weight) * val_inverse_loss
        icm_opt = tf.train.AdamOptimizer(
            args.init_lr).minimize(train_total_loss)

        # Setup summaries
        summary_writer = tf.summary.FileWriter(args.tfboard_path,
                                               graph=tf.get_default_graph())

        train_inverse_loss_summ = tf.summary.scalar("train/icm_inverse_loss",
                                                    train_inverse_loss)
        train_forward_loss_summ = tf.summary.scalar("train/icm_forward_loss",
                                                    train_forward_loss)
        train_total_loss_summ = tf.summary.scalar("train/icm_total_loss",
                                                  train_total_loss)
        val_inverse_loss_summ = tf.summary.scalar("val/icm_inverse_loss",
                                                  val_inverse_loss)
        val_forward_loss_summ = tf.summary.scalar("val/icm_forward_loss",
                                                  val_forward_loss)
        val_total_loss_summ = tf.summary.scalar("val/icm_total_loss",
                                                val_total_loss)

        train_summary_op = tf.summary.merge([
            train_inverse_loss_summ, train_forward_loss_summ,
            train_total_loss_summ
        ])
        val_summary_op = tf.summary.merge([
            val_inverse_loss_summ, val_forward_loss_summ, val_total_loss_summ
        ])

        logger.log("Finished creating ICM model")

        sess.run(tf.global_variables_initializer())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            for timestep in range(args.num_itr):
                if timestep % args.log_freq == 0:
                    logger.log("Start itr {}".format(timestep))
                    _, train_summary = sess.run([icm_opt, train_summary_op])
                else:
                    sess.run(icm_opt)

                if timestep % args.log_freq == 0:
                    summary_writer.add_summary(train_summary, timestep)
                if timestep % args.save_freq == 0:
                    save_snapshot(_encoder, _inverse_model, _forward_model,
                                  args.tfmodel_path)

                if timestep % args.val_freq == 0:
                    val_summary = sess.run(val_summary_op)
                    summary_writer.add_summary(val_summary, timestep)

        except KeyboardInterrupt:
            print("End training...")
            pass

        coord.join(threads)
        sess.close()
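A hypothetical command line for the argument parser above; the script name, dataset path, and env id are placeholders (the env id is borrowed from Example #3).

# python train_icm.py Box3dReachPixel-v2 /path/to/tfrecord_dataset \
#     --tfboard_path /tmp/tfboard --tfmodel_path /tmp/tfmodels \
#     --init_lr 1e-4 --forward_weight 0.8 --cos_forward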
Example #9
def experiment(variant):

    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']

    setup(seed, n_parallel, log_dir)
    expertDataLoc = variant['expertDataLoc']
    expertDataItr = variant['expertDataItr']

    fast_learning_rate = variant['flr']

    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2]; 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    regionSize = variant['regionSize']

    if regionSize == '20X20':
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_20X20_v1.pkl'

    else:
        assert regionSize == '60X30'
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/PickPlace_60X30.pkl'

    tasks = pickle.load(open(tasksFile, 'rb'))
    envType = variant['envType']

    if envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'

        baseEnv = SawyerPickPlaceEnv(tasks=tasks)

    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
    env = TfEnv(NormalizedBoxEnv(env))
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
        numExpertPolicies=20,
        expertDataInfo={
            'expert_loc': expertDataLoc,
            'expert_itr': expertDataItr
        })

    algo.train()
Example #10
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to snapshot file')
    parser.add_argument('--pixel', action='store_true')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--multistep', action='store_true')
    parser.add_argument('--step_size', type=int, default=5)
    parser.add_argument('--zero_action', action='store_true')
    parser.add_argument('--gt_action', action='store_true')

    args = parser.parse_args()

    with tf.Session() as sess:
        data = joblib.load(args.file)
        _encoder = data['encoder']
        _inverse_model = data['inverse_model']
        _forward_model = data['forward_model']

        if args.pixel:
            env = TfEnv(normalize(env=GymEnv(PIXEL_ENV, record_video=False,
                                             log_dir='/tmp/gym_test',
                                             record_log=False)))
        else:
            env = TfEnv(normalize(env=GymEnv(STATE_ENV, record_video=False,
                                             log_dir='/tmp/gym_test',
                                             record_log=False)))

        # Rebuild models
        act_space = env.action_space
        obs_space = env.observation_space
        qpos_dim = env.wrapped_env._wrapped_env.env.env.init_qpos.shape[0]

        s1_ph = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        s2_ph = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        a_ph = tf.placeholder(tf.float32, [None, act_space.flat_dim])
        
        clipped_a = tf.clip_by_value(a_ph, -1.0, 1.0)
        encoder1 = _encoder.get_weight_tied_copy(observation_input=s1_ph)
        encoder2 = _encoder.get_weight_tied_copy(observation_input=s2_ph)
        inverse_model = _inverse_model.get_weight_tied_copy(feature_input1=encoder1.output, 
                                                            feature_input2=encoder2.output)
        forward_model = _forward_model.get_weight_tied_copy(feature_input=encoder1.output,
                                                            action_input=clipped_a)

        # Load test data
        dataset_paths, datasets = load_dataset(args.pixel, args.multistep)


        env.reset()
        for dataset_path, data_dict in zip(dataset_paths, datasets):
            
            ef_xyz_pred_diff = []
            ef_xyz_diff = []
            action_diff = []
            qpos_diff = []
            qpos_pred_diff = []
            if args.multistep:
                print ("===== Using multisteping testing, stepsize: %d" % args.step_size)
            
            print ("========================================")
            print ("===== Evaluating inverse model on %s" % dataset_path)
            # states = data_dict['states']
            # next_states = data_dict['next_states']
            # obs = data_dict['obs']
            # next_obs = data_dict['next_obs']
            # actions = data_dict['actions']
            if args.multistep:
                states, next_states, obs, next_obs, actions = load_data_multistep(data_dict, pixel=args.pixel, step_size=args.step_size)
            else:
                states, next_states, obs, next_obs, actions = load_data(data_dict, args.pixel)
            actions = np.clip(actions, -1.0, 1.0)

            if args.render:
                fig, [ax1, ax2, ax3] = plt.subplots(1, 3)
                plt.ion()
                ax1.set_title("t=0")
                ax2.set_title("t=1 after action")
                ax3.set_title("t=1 after predicted action")


            for state, next_state, ob, next_ob, action in zip(states, next_states, obs, next_obs, actions):
                # print (state.shape)
                if args.multistep:
                    # Set state, get real img1
                    set_state(env, state[0], qpos_dim)
                    _end_ef_pos = get_ef_pos(env)
                    _qpos = get_qpos(env)
                    if args.render:
                        img = get_render_img(env)

                    o = ob[0]
                    # next_o = next_ob[0]
                    next_o = next_ob[-1]
                    for _ in range(args.step_size):
                        # Get predicted action from inverse model
                        pred_action = sess.run(inverse_model.output, {
                            s1_ph: [o],
                            s2_ph: [next_o],
                        })[0]
                        if args.gt_action:
                            pred_action = action[_]

                        if args.zero_action:
                            pred_action = np.zeros_like(action[_])

                        # ob = next_o
                        # next_o = next_ob[_]

                        # Step predicted action
                        o, r, d, env_info = env.step(pred_action)

                    # Get sim_img2 and sim ef position
                    s_end_ef_pos = get_ef_pos(env)
                    s_qpos = get_qpos(env)
                    if args.render:
                        s_img = get_render_img(env)


                    # Get real img2 and real ef position
                    set_state(env, next_state[args.step_size-1], qpos_dim)
                    o_end_ef_pos = get_ef_pos(env)
                    o_qpos = get_qpos(env)
                    if args.render:
                        o_img = get_render_img(env)


                else:
                    # Set state, get real img1
                    # import pdb; pdb.set_trace()
                    set_state(env, state, qpos_dim)
                    _end_ef_pos = get_ef_pos(env)
                    # print ("Real: ", _end_ef_pos)
                    _qpos = get_qpos(env)
                    if args.render:
                        img = get_render_img(env)

                    # Get predicted action from inverse model
                    pred_action = sess.run(inverse_model.output, {
                        s1_ph: [ob],
                        s2_ph: [next_ob],
                    })[0]

                    if args.zero_action:
                        pred_action = np.zeros_like(pred_action)
                    if args.gt_action:
                        pred_action = action


                    # Step action
                    env.step(pred_action)

                    # print (np.linalg.norm(next_state - get_state(env)))

                    # Get sim_img2 and sim ef position
                    s_end_ef_pos = get_ef_pos(env)
                    # print ("Sim pos", s_end_ef_pos)
                    s_qpos = get_qpos(env)
                    if args.render:
                        s_img = get_render_img(env)

                    # Get real img2 and real ef position
                    set_state(env, next_state, qpos_dim)
                    o_end_ef_pos = get_ef_pos(env)
                    o_qpos = get_qpos(env)

                    # print (np.linalg.norm(s_qpos - o_qpos))
                    
                    # print (np.linalg.norm(o_end_ef_pos - s_end_ef_pos))

                if args.render:
                    o_img = get_render_img(env)
                
                if args.render:
                    ax1.imshow(img)
                    ax2.imshow(o_img)
                    ax3.imshow(s_img)
                    plt.show()
                    plt.pause(0.1)

                    # print ("Actual action: ", action)
                    # print ("Predicted action: ", pred_action)

                ef_xyz_pred_diff.append(np.linalg.norm(o_end_ef_pos - s_end_ef_pos))
                ef_xyz_diff.append(np.linalg.norm(o_end_ef_pos - _end_ef_pos))
                qpos_pred_diff.append(np.linalg.norm(o_qpos - s_qpos))
                qpos_diff.append(np.linalg.norm(o_qpos - _qpos))
                
                action_diff.append(((action - pred_action)**2).mean())

            # print ("===== 1. real s1, real s2 end effector position L2 distance       mean:  %.5f, std: %.5f" % (np.mean(ef_xyz_diff), np.std(ef_xyz_diff)))
            # print ("===== 2. real s2, sim  s2 end effector position L2 distance       mean:  %.5f, std: %.5f" % (np.mean(ef_xyz_pred_diff), np.std(ef_xyz_pred_diff)))
            # print ("===== 3. real s1, real s2 joint position        L2 distance       mean:  %.5f, std: %.5f" % (np.mean(qpos_diff), np.std(qpos_diff)))
            # print ("===== 4. real s2, sim  s2 joint position        L2 distance       mean:  %.5f, std: %.5f" % (np.mean(qpos_pred_diff), np.std(qpos_pred_diff)))
            # if not args.multistep:
            #     print ("===== 5. action - pred_action (per dim)      sq L2 distance       mean:  %.5f, std: %.5f" % (np.mean(action_diff), np.std(action_diff)))
            # print ("===== 6. action                                                   mean:  %.5f, std: %.5f" % (np.mean(np.abs(actions).mean(axis=1)), np.std(actions.mean(axis=1))))

            print ("===== 1. real s1, real s2 end effector position L2 distance       med:  %.5f, std: %.5f" % (np.median(ef_xyz_diff), np.std(ef_xyz_diff)))
            print ("===== 2. real s2, sim  s2 end effector position L2 distance       med:  %.5f, std: %.5f" % (np.median(ef_xyz_pred_diff), np.std(ef_xyz_pred_diff)))
            print ("===== 3. real s1, real s2 joint position        L2 distance       med:  %.5f, std: %.5f" % (np.median(qpos_diff), np.std(qpos_diff)))
            print ("===== 4. real s2, sim  s2 joint position        L2 distance       med:  %.5f, std: %.5f" % (np.median(qpos_pred_diff), np.std(qpos_pred_diff)))
            if not args.multistep:
                    print ("===== 5. action - pred_action (per dim)      sq L2 distance       med:  %.5f, std: %.5f" % (np.median(action_diff), np.std(action_diff)))
            print ("===== 6. action                                                   med:  %.5f, std: %.5f" % (np.median(np.abs(np.median(actions, axis=1))), np.std(np.median(actions, axis=1))))
Example #11
# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    direc = v['direc']
    oracle = v['oracle']

    if direc:
        if oracle:
            env = TfEnv(normalize(HalfCheetahEnvDirecOracle()))
        else:
            env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
    else:
        if oracle:
            env = TfEnv(normalize(HalfCheetahEnvOracle()))
        else:
            env = TfEnv(normalize(HalfCheetahEnvRand()))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
Example #12
def main(
    exp_name,
    rundir='data',
    irl_pkl='',
    ent_wt=1.0,
    trpo_anneal_steps=None,
    trpo_anneal_init_ent=None,
    trpo_step=0.01,
    init_pol_std=1.0,
    method=None,
    hid_size=None,
    hid_layers=None,
    switch_env=None,
):
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(
        CustomGymEnv(this_env_name, record_video=False, record_log=False))

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(orig_env_name)
    env_trpo_params = irltrpo_params_for(orig_env_name, 'retrain')

    folder = os.path.dirname(irl_pkl)

    prior_params = load_prior_params(irl_pkl)
    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5)

    # For some reason IRLTRPO is responsible for setting weights in this code.
    # It would equally be possible to run global_variables_initializer()
    # ourselves and then do irl_model.set_params(prior_params) if we just
    # wanted to query energy, reward, etc. from the trained AIRL model without
    # using IRLTRPO.
    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        irl_model = AIRL(env=env,
                         expert_trajs=experts,
                         state_only=True,
                         freeze=True,
                         vairl=method == 'vairl',
                         vairl_beta=1e-4,
                         discrim_arch_args=disc_net_kwargs,
                         fitted_value_fn_arch_args=disc_net_kwargs)
    elif method in {'gail', 'vail'}:
        irl_model = GAIL(env,
                         expert_trajs=experts,
                         discrim_arch_args=disc_net_kwargs,
                         name=method,
                         freeze=True,
                         vail=method == 'vail')
    else:
        raise NotImplementedError("Don't know how to handle method '%s'" %
                                  method)

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes,
                               init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=ent_wt,  # should be 1.0 but 0.1 seems to work better
        step_size=trpo_step,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        init_irl_params=prior_params,
        force_batch_sampler=True,
        entropy_anneal_init_weight=trpo_anneal_init_ent,
        entropy_anneal_steps=trpo_anneal_steps,
        retraining=True)
    irltrpo_kwargs.update(env_trpo_params)
    algo = IRLTRPO(**irltrpo_kwargs)
    folder_suffix = ''
    if switch_env is not None:
        # append lower case environment name to retrain folder path
        folder_suffix = '_%s' % switch_env.lower()
    with rllab_logdir(algo=algo,
                      dirname='%s/retrain%s' % (folder, folder_suffix)):
        with tf.Session():
            algo.train()
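A hypothetical call into the retraining entry point above; the pickle path is a placeholder, and method must be one of 'airl', 'vairl', 'gail', or 'vail' per the branches in the function.

main(
    exp_name='retrain_run0',                     # placeholder
    rundir='data',
    irl_pkl='data/env_placeholder/itr_100.pkl',  # placeholder AIRL checkpoint
    ent_wt=0.1,
    trpo_step=0.01,
    method='airl',                               # or 'vairl', 'gail', 'vail'
)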
Example #13
from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.envs.gym_env import GymEnv

#env = TfEnv(normalize(CartpoleEnv(record_video=True, force_reset=True)))
envir = 'CartPole-v0'
env = TfEnv(normalize(GymEnv(envir, record_video=True, force_reset=True)))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
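    # hedged completion of the truncated call; step_size is an assumption
    # consistent with the TRPO settings in the other examples here
    step_size=0.01,
)
algo.train()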
Example #14
def get_env(env_name, record_video=True, record_log=True, normalize_obs=False, **kwargs):
    env = TfEnv(normalize(GymEnv(env_name, record_video=record_video,
        record_log=record_log), normalize_obs=normalize_obs))
    return env
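A brief usage sketch of the helper above; the env id is illustrative.

env = get_env('CartPole-v0', record_video=False, record_log=False)
print(env.observation_space, env.action_space)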
Example #15
def main():

    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--fw_ratio', type=float, default=0.1)
    parser.add_argument('--init_lr', type=float, default=5e-4)

    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=0.99)

    args = parser.parse_args()

    # Param ranges
    seeds = range(2)

    for seed in seeds:
        mdp = TfEnv(normalize(env=GymEnv('Box3dReach-v17', record_video=False,
                                         log_dir='/tmp/gym_test',
                                         record_log=False),
                              normalize_obs=True))

        name = 'trpo-state-v17-tf-icm-fw{}-initlr-{}-norm'.format(
            args.fw_ratio, args.init_lr)

        policy = GaussianMLPPolicy(
            "mlp_policy",
            env_spec=mdp.spec,
            hidden_sizes=(64, 64, 32),
            output_nonlinearity=tf.nn.tanh,
            clip_action=False,
        )

        baseline = LinearFeatureBaseline(mdp.spec, )

        batch_size = 50000
        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=1000,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
            sampler_cls=BatchSampler,
        )

        algorithm = ICM(
            mdp,
            algo,
            args.tfboard_path + "/%s_%d" % (name, seed),
            feature_dim=mdp.spec.observation_space.flat_dim,
            forward_weight=args.fw_ratio,
            external_reward_weight=0.0,
            replay_pool_size=1000000,
            init_learning_rate=args.init_lr,
            n_updates_per_iter=1000,
        )

        run_experiment_lite(algorithm.train(),
                            exp_prefix=name,
                            n_parallel=8,
                            snapshot_mode="gap",
                            snapshot_gap=200,
                            seed=seed,
                            mode="local")
Example #16
log_dir = "./MultiEnv/Data"

# generate TrainENV file
# TrainEnvNum = 500
# env = TfEnv(GridBase(params))
# env._wrapped_env.generate_grid=True
# env._wrapped_env.generate_b0_start_goal=True

# for i in range(TrainEnvNum):
#     env.reset()
#     params = dict(
#         env=env,
#     )
#     joblib.dump(params,log_dir+'/TrainEnv'+'/env_'+str(i)+'.pkl')
#     plot_env(env,save=True,path=log_dir+'/TrainEnv'+'/Map_'+str(i)+'.pdf')

# generate TestENV file
TestEnvNum = 50
env = TfEnv(GridBase(params))
env._wrapped_env.generate_grid = True
env._wrapped_env.generate_b0_start_goal = True

for i in range(TestEnvNum):
    env.reset()
    params = dict(env=env, )
    joblib.dump(params, log_dir + '/TestEnv2' + '/env_' + str(i) + '.pkl')
    plot_env(env,
             save=True,
             path=log_dir + '/TestEnv2' + '/Map_' + str(i) + '.pdf')
Example #17
params['obs_len'] = len(params['observe_directions'])
params['num_state'] = params['grid_n'] * params['grid_m']
params['traj_limit'] = 4 * (params['grid_n'] * params['grid_m'])  # 4 * (params['grid_n'] + params['grid_m'])
params['R_step'] = [params['R_step']] * params['num_action']
params['R_step'][params['stayaction']] = params['R_stay']

env_ref = joblib.load('./env.pkl')['env']
grid = env_ref._wrapped_env.grid
b0 = env_ref._wrapped_env.b0
start_state = env_ref._wrapped_env.start_state
goal_state = env_ref._wrapped_env.goal_state
env = TfEnv(
    GridBase(params,
             grid=grid,
             b0=b0,
             start_state=start_state,
             goal_state=goal_state))
env._wrapped_env.generate_grid = False
env._wrapped_env.generate_b0_start_goal = False
env.reset()

log_dir = "./Data/obs_1goal20step0stay_1_gru"

tabular_log_file = osp.join(log_dir, "progress.csv")
text_log_file = osp.join(log_dir, "debug.log")
params_log_file = osp.join(log_dir, "params.json")
pkl_file = osp.join(log_dir, "params.pkl")

logger.add_text_output(text_log_file)
logger.add_tabular_output(tabular_log_file)
Example #18
        return [2]


# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml=True

for v in variants:
    task_var = v['task_var']

    if task_var == 0:
        env = TfEnv(normalize(AntEnvRandDirec()))
        task_var = 'direc'
    elif task_var == 1:
        env = TfEnv(normalize(AntEnvRand()))
        task_var = 'vel'
    elif task_var == 2:
        env = TfEnv(normalize(AntEnvRandGoal()))
        task_var = 'pos'
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100,100),
    )
Example #19
def experiment(variant):

    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']

    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']

    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']

    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant[
        'tasksFile'] + '.pkl'

    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length)

    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks,
                                image=use_images,
                                mpl=max_path_length,
                                rewMode='l2Sparse')

    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks,
                                     image=use_images,
                                     mpl=max_path_length)

    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks,
                                    image=use_images,
                                    mpl=max_path_length)

    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))

    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())

    else:
        assert False, 'unknown envType: %s' % envType

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy is not None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)

    elif 'fullAda_Bias' in policyType:

        policy = fullAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'biasAda_Bias' in policyType:

        policy = biasAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)

    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    elif 'conv' in policyType:

        baseline = ZeroBaseline(env_spec=env.spec)

        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=1,  #100
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        # metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc)

    algo.train()
post_std_modifier_train_list = [1.0]
post_std_modifier_test_list = [0.00001]

use_maml = True

for post_std_modifier_train in post_std_modifier_train_list:
    for post_std_modifier_test in post_std_modifier_test_list:
        for pre_std_modifier in pre_std_modifier_list:
            for fast_learning_rate in fast_learning_rates:
                for bas in baselines:
                    stub(globals())

                    seed = 4
                    #env = TfEnv(normalize(GymEnv("Pusher-v0", force_reset=True, record_video=False)))  #TODO: force_reset was True
                    #xml_filepath ='home/rosen/rllab_copy/vendor/local_mujoco_models/ensure_woodtable_distractor_pusher%s.xml' % seed
                    env = TfEnv(normalize(PusherEnv(distractors=True)))

                    # policy = MAMLGaussianMLPPolicy(
                    #     name="policy",
                    #     env_spec=env.spec,
                    #     grad_step_size=fast_learning_rate,
                    #     hidden_nonlinearity=HIDDEN_NONLINEARITY[nonlinearity_option],
                    #     hidden_sizes=(net_size, net_size),
                    #     output_nonlinearity=OUTPUT_NONLINEARITY[nonlinearity_option],
                    #     std_modifier=pre_std_modifier,
                    # )
                    if bas == 'zero':
                        baseline = ZeroBaseline(env_spec=env.spec)
                    elif 'linear' in bas:
                        baseline = LinearFeatureBaseline(env_spec=env.spec)
                    else:
                        baseline = GaussianMLPBaseline(env_spec=env.spec)
Example #21
0
                                    rd.seed(seed)

                                    ###
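                                    # Fix the python, numpy, and TensorFlow RNG seeds for reproducibility.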
                                    seed %= 4294967294
                                    global seed_
                                    seed_ = seed
                                    rd.seed(seed)
                                    np.random.seed(seed)
                                    try:
                                        import tensorflow as tf

                                        tf.set_random_seed(seed)
                                    except Exception as e:
                                        print(e)
                                    print('using seed %s' % (str(seed)))
                                    env = TfEnv(normalize(PointEnvRandGoal()))
                                    policy = MAMLGaussianMLPPolicy(
                                        name="policy",
                                        env_spec=env.spec,
                                        grad_step_size=fast_learning_rate,
                                        hidden_nonlinearity=tf.nn.relu,
                                        hidden_sizes=(100, 100),
                                        std_modifier=pre_std_modifier,
                                    )
                                    if bas == 'zero':
                                        baseline = ZeroBaseline(env_spec=env.spec)
                                    elif 'linear' in bas:
                                        baseline = LinearFeatureBaseline(env_spec=env.spec)
                                    else:
                                        baseline = GaussianMLPBaseline(env_spec=env.spec)
                                    #expert_policy = PointEnvExpertPolicy(env_spec=env.spec)
Example #22
0
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.bradly.third_person.policy.random_policy import RandomPolicy
from sandbox.bradly.third_person.algos.cyberpunk_trainer import CyberPunkTrainer
from sandbox.bradly.third_person.policy.expert_reacher import load_expert_reacher
from sandbox.bradly.third_person.envs.reacher import ReacherEnv
from sandbox.bradly.third_person.envs.reacher_two import ReacherTwoEnv

from sandbox.bradly.third_person.discriminators.discriminator import DomainConfusionDiscriminator
from sandbox.bradly.third_person.discriminators.discriminator import DomainConfusionVelocityDiscriminator

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.algos.trpo import TRPO

import tensorflow as tf


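# Third-person imitation setup: an expert ReacherEnv domain, a second (novice)
# ReacherTwoEnv domain, and a random "failure" policy for the domain-confusion
# discriminators imported above.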
expert_env = TfEnv(normalize(ReacherEnv()))
novice_env = TfEnv(normalize(ReacherTwoEnv(), normalize_obs=True))
expert_fail_pol = RandomPolicy(expert_env.spec)

policy = GaussianMLPPolicy(
    name="novice_policy",
    env_spec=novice_env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=expert_env.spec)

algo = TRPO(
    env=novice_env,
    policy=policy,
post_std_modifier_test_list = [0.00001]
l2loss_std_mult_list = [1.0]

use_maml = True
for goals_suffix in goals_suffixes:
    for l2loss_std_mult in l2loss_std_mult_list:
        for post_std_modifier_train in post_std_modifier_train_list:
            for post_std_modifier_test in post_std_modifier_test_list:
                for pre_std_modifier in pre_std_modifier_list:
                    for fast_learning_rate in fast_learning_rates:
                        for beta_steps, adam_steps in beta_adam_steps_list:
                            for bas in baselines:
                                stub(globals())

                                seed = 1
                                env = TfEnv(
                                    normalize(HalfCheetahEnvRandSparse()))

                                policy = MAMLGaussianMLPPolicy(
                                    name="policy",
                                    env_spec=env.spec,
                                    grad_step_size=fast_learning_rate,
                                    hidden_nonlinearity=tf.nn.relu,
                                    hidden_sizes=(100, 100),
                                    std_modifier=pre_std_modifier,
                                )
                                if bas == 'zero':
                                    baseline = ZeroBaseline(env_spec=env.spec)
                                elif 'linear' in bas:
                                    baseline = LinearFeatureBaseline(
                                        env_spec=env.spec)
                                else:
                                    baseline = GaussianMLPBaseline(
                                        env_spec=env.spec)
from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv

from sandbox.rocky.tf.envs.base import TfEnv
from multiworld.core.flat_goal_env import FlatGoalEnv
from multiworld.core.finn_maml_env import FinnMamlEnv
from multiworld.core.wrapper_env import NormalizedBoxEnv

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.misc.instrument import stub
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy

stub(globals())
rate = 0.01
mode = 'local'

import tensorflow as tf
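# Train a separate TRPO policy for each push goal (1..99), presumably to
# collect per-goal expert policies.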
for goal in range(1, 100):
    baseEnv = FlatGoalEnv(SawyerPushEnv(tasks=None),
                          obs_keys=['state_observation'])
    env = NormalizedBoxEnv(FinnMamlEnv(baseEnv, reset_mode='task'))
    #env = WheeledEnvGoal()

    env = TfEnv(env)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_nonlinearity=tf.nn.relu,
                               hidden_sizes=(100, 100))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=150,
exp_names = [gen_name + name for name in names]

all_avg_returns = []
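# For each step size / pretrained parameter file, evaluate on every goal: the
# oracle variant runs a single VPG iteration, while the non-oracle policy is
# fine-tuned for a few iterations; average returns are recorded.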
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []
    for goal in goals:
        goal = list(goal)


        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(PointEnvRandGoalOracle(goal=goal))
            n_itr = 1
        else:
            env = normalize(PointEnvRandGoal(goal=goal))
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )


        if initial_params_file is not None:
            policy = None

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(
            env=env,
            policy=policy,
            load_policy=initial_params_file,
Example #26
0
]

other_env_class_map = {"Cartpole": CartpoleEnv}
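# Use the Gym wrapper for supported Gym environments; otherwise instantiate the
# corresponding rllab environment class directly.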

if args.env in supported_gym_envs:
    gymenv = GymEnv(args.env,
                    force_reset=True,
                    record_video=False,
                    record_log=False)
    # gymenv.env.seed(1)
else:
    gymenv = other_env_class_map[args.env]()

#TODO: assert continuous space

env = TfEnv(normalize(gymenv))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The policy network has three hidden layers with 100, 50, and 25 units.
    hidden_sizes=(100, 50, 25),
    hidden_nonlinearity=tf.nn.relu,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=5000,
Example #27
0
variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

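# task_var selects the Ant task family: 0 = random direction, 1 = random
# velocity, 2 = random goal position; `oracle` presumably picks the variant
# that observes the task directly.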
for v in variants:
    task_var = v['task_var']
    oracle = v['oracle']

    if task_var == 0:
        task_var = 'direc'
        exp_prefix = 'bugfix_trpo_maml_antdirec' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvDirecOracle()))
        else:
            env = TfEnv(normalize(AntEnvRandDirec()))
    elif task_var == 1:
        task_var = 'vel'
        exp_prefix = 'posticml_trpo_maml_ant' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvOracle()))
        else:
            env = TfEnv(normalize(AntEnvRand()))
    elif task_var == 2:
        print("HERE")
        task_var = 'pos'
        exp_prefix = 'posticml_trpo_maml_antpos_' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvRandGoalOracle()))
Example #28
0
key_path = '/home/ubuntu/.ssh/id_rsa_dl'

port = 22
# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 500
num_grad_updates = 1
use_maml = True

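# task_var selects which direction-randomized CellRobot environment to train
# on; the chosen label is folded into the experiment name below.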
for v in variants:
    task_var = v['task_var']

    if task_var == 0:
        env = TfEnv(normalize(CellRobotRandDirectpi4Env()))
        task_var = 'directpi-4'
    elif task_var == 1:
        env = TfEnv(normalize(CellRobotRandDirectEnv()))
        task_var = 'direc'
    elif task_var == 2:
        env = TfEnv(normalize(CellRobotRandDirect2Env()))
        task_var = 'direc2'
    elif task_var == 3:
        env = TfEnv(normalize(CellRobotRandDirectpi4Env2()))  # -pi/4, fixed body
        task_var = 'direcpi-4-2'
    elif task_var == 4:
        env = TfEnv(normalize(CellRobotRandDirectBodyEnv()))  # use the body position as the state
        task_var = 'direc-body'

    exp_name = 'Cellrobot_trpo_maml' + task_var + '_' + str(
Example #29
0
    1e-3
]  # 1e-3 works well for 1 step, trying lower for 2 step, trying 1e-2 for large batch
fast_learning_rates = [0.001]  # 0.5 works for [0.1, 0.2], too high for 2 step
baselines = ['linear']
fast_batch_size = 20  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]
meta_batch_size = 20  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
max_path_length = 500
num_grad_updates = 1
use_sensitive = True

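# Sweep inner (fast) and outer learning rates and baseline types; "Sensitive"
# appears to be an earlier name for MAML in this codebase.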
for fast_learning_rate in fast_learning_rates:
    for learning_rate in learning_rates:
        for bas in baselines:
            stub(globals())

            env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
            policy = SensitiveGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif bas == 'linear':
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            algo = SensitiveTRPO(
                #algo = SensitiveVPG(
Example #30
0
from envs.bullet.cartpole_bullet import CartPoleBulletEnv
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize

env = TfEnv(normalize(GymEnv("CartPoleBulletEnv-v0")))

policy = GaussianMLPPolicy(
    name="tf_gaussian_mlp",
    env_spec=env.spec,
    # The policy network has a single hidden layer with 8 units.
    hidden_sizes=(8, ))

baseline = LinearFeatureBaseline(env_spec=env.spec)

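# max_path_length=env.horizon reuses the episode length limit exposed by the
# wrapped Gym environment.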
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.999,
    step_size=0.01,
    force_batch_sampler=True,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    #plot=True,