def main(exp_name=None, fusion=False, latent_dim=3):
    max_path_length = 100
    info_coeff = 0.1
    imitation_coeff = 0.01
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = 1000
    entropy_weight = 1.0
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    # Load roughly two iterations' worth of data from each forward RL experiment as demos.
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # Contextual policy pi(a|s,m).
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))

    # Approximate posterior q(m|tau).
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
            np.tile(np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts, policy, context_encoder, env, latent_dim,
                              batch_size=400, kl_weight=0.1, epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0

    irl_model = InfoAIRL(env=env,
                         policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts,
                         state_only=True,
                         max_path_length=max_path_length,
                         fusion=fusion,
                         max_itrs=max_itrs,
                         meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff,
                         latent_dim=latent_dim)

    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=3000,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    if fusion:
        dirname = 'data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    else:
        dirname = 'data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)

    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
def main(exp_name=None, params_folder='data/ant_state_irl'):
    # env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=True, record_log=True, force_reset=True))
    env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False,
                             record_log=False, force_reset=False))

    # Earlier IRL iterations overfit less; either 80 or 90 seems to work well,
    # but it is usually worth searching 60, 65, 70, ..., up to 100.
    irl_itr = 90
    # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr))
    prior_params = load_prior_params(params_file)

    '''q_itr = 400  # earlier IRL iterations overfit less; 100 seems to work well.
    # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (q_itr))
    prior_params_q = load_prior_params(params_file)'''

    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    qvar = GaussianMLPInversePolicy(name='qvar_model', env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=None, max_itrs=10)
    irl_model = EAIRL(env=env, expert_trajs=experts, state_only=False,
                      score_discrim=False)
    empw_model = Empowerment(env=env, max_itrs=1)
    t_empw_model = Empowerment(env=env, scope='t_efn', max_itrs=2,
                               name='empowerment2')
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))

    algo = IRLTRPO(
        init_irl_params=prior_params['irl_params'],
        init_empw_params=None,  # prior_params['empw_params'],
        init_qvar_params=None,  # prior_params['qvar_params'],
        init_policy_params=prior_params['policy_params'],  # None
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=True,
        train_empw=True,
        train_qvar=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
        # plot=True,
    )

    with rllab_logdir(algo=algo, dirname='data/ant_transfer'):  # 'data/ant_transfer%s' % exp_name
        # with rllab_logdir(algo=algo, dirname='data/ant_transfer%s' % exp_name):
        with tf.Session():
            algo.train()
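

# Illustrative sketch (not part of the original script): the comment in main() above
# suggests searching IRL checkpoints 60, 65, ..., 100 for the one that transfers best.
# One possible way to enumerate the candidate parameter files; the step size and the
# surrounding evaluation loop are assumptions, not part of the original code.
def candidate_irl_checkpoints(start=60, stop=100, step=5):
    for itr in range(start, stop + 1, step):
        path = os.path.join(DATA_DIR, 'itr_%d.pkl' % itr)
        if os.path.exists(path):
            yield itr, path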
from sandbox.rocky.tf.baselines.gaussian_conv_baseline import GaussianConvBaseline
from sandbox.rocky.tf.policies.conv_nn_policy import ConvNNPolicy
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.misc.instrument import stub, run_experiment_lite
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.envs.gym_env import GymEnv
import itertools

stub(globals())

# Param ranges
seeds = range(5)

for seed in seeds:
    mdp = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v2', record_video=False,
                                     log_dir='/tmp/gym_test', record_log=False)))

    policy = ConvNNPolicy(
        "conv_policy",
        env_spec=mdp.spec,
        conv_filters=(32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256,),
    )

    baseline = GaussianConvBaseline(
        mdp.spec,
        regressor_args={
            'conv_filters': (32, 32, 32, 32),
for l2loss_std_mult in l2loss_std_mult_list:
    for post_std_modifier_train in post_std_modifier_train_list:
        for post_std_modifier_test in post_std_modifier_test_list:
            for pre_std_modifier in pre_std_modifier_list:
                for fast_learning_rate in fast_learning_rates:
                    for beta_steps in beta_steps_list:
                        for bas in baselines:
                            stub(globals())
                            seed = 1
                            # env = TfEnv(normalize(GymEnv("Pusher-v0", force_reset=True, record_video=False)))  # TODO: force_reset was True
                            # xml_filepath = 'home/kevin/rllab_copy/vendor/local_mujoco_models/ensure_woodtable_distractor_pusher%s.xml' % seed
                            env = TfEnv(normalize(ReacherEnv()))
                            # expert_policy = pickle.load()
                            policy = MAMLGaussianMLPPolicy(
                                name="policy",
                                env_spec=env.spec,
                                grad_step_size=fast_learning_rate,
                                hidden_nonlinearity=tf.nn.relu,
                                hidden_sizes=(100, 100),
                            )
                            if bas == 'zero':
                                baseline = ZeroBaseline(env_spec=env.spec)
                            elif 'linear' in bas:
                                baseline = LinearFeatureBaseline(env_spec=env.spec)
                            else:
from __future__ import print_function
from __future__ import absolute_import

from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
for ism in importance_sampling_modifier_list:
    for limit_demos_num in limit_demos_num_list:
        for l2loss_std_mult in l2loss_std_mult_list:
            for post_std_modifier_train in post_std_modifier_train_list:
                for post_std_modifier_test in post_std_modifier_test_list:
                    for pre_std_modifier in pre_std_modifier_list:
                        for fast_learning_rate in fast_learning_rates:
                            for beta_steps, adam_steps in beta_adam_steps_list:
                                for bas in baselines:
                                    stub(globals())
                                    tf.set_random_seed(seed)
                                    np.random.seed(seed)
                                    rd.seed(seed)
                                    env = TfEnv(normalize(Reacher7DofMultitaskEnv()))
                                    exp_name = str(
                                        'R7_IL'
                                        # + time.strftime("%D").replace("/", "")[0:4]
                                        + goals_suffix + "_" + str(seed)
                                        # + str(envseed)
                                        + ("" if use_corr_term else "nocorr")
                                        # + str(int(use_maml))
                                        + ('_fbs' + str(fast_batch_size) if fast_batch_size
def experiment(variant, comet_logger=None):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.algos.vpg import VPG as vpg_basic
    from sandbox.rocky.tf.algos.vpg_biasADA import VPG as vpg_biasADA
    from sandbox.rocky.tf.algos.vpg_fullADA import VPG as vpg_fullADA
    from sandbox.rocky.tf.algos.vpg_conv import VPG as vpg_conv
    from sandbox.rocky.tf.algos.ppo import PPO as ppo
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.envs.mujoco.sawyer_xyz.multi_domain.push_door import Sawyer_MultiDomainEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_coffee import SawyerCoffeeEnv
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler
    # import gym
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_ppo import \
        MAMLGaussianMLPPolicy as PPO_policy
    from sandbox.rocky.tf.envs.base import TfEnv

    import argparse
    import csv
    import joblib
    import numpy as np
    import pickle
    import tensorflow as tf

    print("%%%%%%%%%%%%%%%%%", comet_logger)

    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']
    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = path_to_multiworld + '/multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))
    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType

    print("$$$$$$$$$$$$$$$ RL-TASK: ", str(tasks[taskIndex]), " $$$$$$$$$$$$$$$")

    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'Biped' in envType:
        # import terrainRLSim
        # from simAdapter import terrainRLSim
        import simAdapter
        import gym
        env = gym.make("PD_Biped2D_Gaps_Terrain-v0")
        env = TfEnv(normalize(env))
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)
    else:
        raise AssertionError('Unknown envType: %s' % envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(
            FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    # baseline = ZeroBaseline(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    batch_size = variant['batch_size']

    if policyType == 'fullAda_Bias':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger,
        )

    elif policyType == 'biasAda_Bias':
        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
        )

    elif policyType == 'PPO':
        policy = PPO_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=variant['init_flr'],
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(128, 128),
            init_flr_full=variant['init_flr'],
            latent_dim=variant['ldim'],
            learn_std=False,
        )
        algo = ppo(
            env=env,
            policy=policy,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger,
        )

    elif policyType == 'basic':
        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={'init_learning_rate': default_step,
                            'tf_optimizer_args': {'learning_rate': 0.5 * default_step},
                            'tf_optimizer_cls': tf.train.GradientDescentOptimizer},
            log_dir=log_dir,
            # extra_input="onehot_exploration",  # added by RK 6/19
            # extra_input_dim=5,  # added by RK 6/19
        )

    elif 'conv' in policyType:
        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt=True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
        )

    else:
        raise AssertionError(
            "policyType must be 'fullAda_Bias', 'biasAda_Bias', 'PPO', 'basic', or a 'conv' variant")

    algo.train()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str, help="name of gym env")
    parser.add_argument('dataset_path', type=str,
                        help="path of training and validation dataset")
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--tfmodel_path', type=str, default='/tmp/tfmodels')
    # Training parameters
    parser.add_argument('--val_ratio', type=float, default=0.1,
                        help="ratio of validation sets")
    parser.add_argument('--num_itr', type=int, default=10000000)
    parser.add_argument('--val_freq', type=int, default=1000)
    parser.add_argument('--log_freq', type=int, default=200)
    parser.add_argument('--save_freq', type=int, default=5000)
    # ICM parameters
    parser.add_argument('--init_lr', type=float, default=1e-4)
    parser.add_argument('--forward_weight', type=float, default=0.8,
                        help="the ratio of forward loss vs inverse loss")
    parser.add_argument('--cos_forward', action='store_true',
                        help="whether to use cosine forward loss")
    # parser.add_argument('--norm_input', action='store_true',
    #                     help="whether to normalize observation input")
    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(args.env_name, record_video=False,
                                     log_dir='/tmp/gym_test', record_log=False)))

    # Get dataset
    dataset_names = list(
        map(lambda file_name: osp.join(args.dataset_path, file_name),
            listdir(args.dataset_path)))
    val_set_names = dataset_names[:int(len(dataset_names) * args.val_ratio)]
    train_set_names = dataset_names[int(len(dataset_names) * args.val_ratio):]

    train_queue = tf.train.string_input_producer(train_set_names, num_epochs=None)
    val_queue = tf.train.string_input_producer(val_set_names, num_epochs=None)
    train_obs, train_next_obs, train_action = read_and_decode(
        train_queue, env.observation_space.shape, env.action_space.shape)
    val_obs, val_next_obs, val_action = read_and_decode(
        val_queue, env.observation_space.shape, env.action_space.shape)

    # Build ICM model
    # if args.norm_input:
    #     train_obs = train_obs * (1. / 255) - 0.5
    #     train_next_obs = train_next_obs * (1. / 255) - 0.5
    #     val_obs = val_obs * (1. / 255) - 0.5
    #     val_next_obs = val_next_obs * (1. / 255) - 0.5
    #     train_obs = tf.cast(train_obs, tf.float32) / 255.0 - 0.5
    #     train_next_obs = tf.cast(train_next_obs, tf.float32) / 255.0 - 0.5
    #     val_obs = tf.cast(val_obs, tf.float32) / 255.0 - 0.5
    #     val_next_obs = tf.cast(val_next_obs, tf.float32) / 255.0 - 0.5
    # else:
    #     train_obs = tf.cast(train_obs, tf.float32)
    #     train_next_obs = tf.cast(train_next_obs, tf.float32)
    #     val_obs = tf.cast(val_obs, tf.float32)
    #     val_next_obs = tf.cast(val_next_obs, tf.float32)

    _encoder = ConvEncoder(
        feature_dim=256,
        input_shape=env.observation_space.shape,
        conv_filters=(64, 64, 64, 32),
        conv_filter_sizes=((5, 5), (5, 5), (5, 5), (3, 3)),
        conv_strides=(3, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256,),
        hidden_activation=tf.nn.elu,
    )
    _inverse_model = InverseModel(
        feature_dim=256,
        env_spec=env.spec,
        hidden_sizes=(256,),
        hidden_activation=tf.nn.tanh,
        output_activation=tf.nn.tanh,
    )
    _forward_model = ForwardModel(
        feature_dim=256,
        env_spec=env.spec,
        hidden_sizes=(256,),
        hidden_activation=tf.nn.elu,
    )

    sess = tf.Session()
    _encoder.sess = sess
    _inverse_model.sess = sess
    _forward_model.sess = sess

    with sess.as_default():
        # Initialize variables for get_copy to work
        sess.run(tf.initialize_all_variables())
        train_encoder1 = _encoder.get_weight_tied_copy(observation_input=train_obs)
        train_encoder2 = _encoder.get_weight_tied_copy(observation_input=train_next_obs)
        train_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=train_encoder1.output,
            feature_input2=train_encoder2.output)
        train_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=train_encoder1.output, action_input=train_action)
        val_encoder1 = _encoder.get_weight_tied_copy(observation_input=val_obs)
        val_encoder2 = _encoder.get_weight_tied_copy(observation_input=val_next_obs)
        val_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=val_encoder1.output,
            feature_input2=val_encoder2.output)
        val_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=val_encoder1.output, action_input=val_action)

        if args.cos_forward:
            train_forward_loss = cos_loss(train_encoder2.output,
                                          train_forward_model.output)
            val_forward_loss = cos_loss(val_encoder2.output,
                                        val_forward_model.output)
        else:
            train_forward_loss = tf.reduce_mean(
                tf.square(train_encoder2.output - train_forward_model.output))
            val_forward_loss = tf.reduce_mean(
                tf.square(val_encoder2.output - val_forward_model.output))
        train_inverse_loss = tf.reduce_mean(
            tf.square(train_action - train_inverse_model.output))
        val_inverse_loss = tf.reduce_mean(
            tf.square(val_action - val_inverse_model.output))
        train_total_loss = args.forward_weight * train_forward_loss + (
            1. - args.forward_weight) * train_inverse_loss
        val_total_loss = args.forward_weight * val_forward_loss + (
            1. - args.forward_weight) * val_inverse_loss
        icm_opt = tf.train.AdamOptimizer(args.init_lr).minimize(train_total_loss)

        # Setup summaries
        summary_writer = tf.summary.FileWriter(args.tfboard_path,
                                               graph=tf.get_default_graph())
        train_inverse_loss_summ = tf.summary.scalar("train/icm_inverse_loss",
                                                    train_inverse_loss)
        train_forward_loss_summ = tf.summary.scalar("train/icm_forward_loss",
                                                    train_forward_loss)
        train_total_loss_summ = tf.summary.scalar("train/icm_total_loss",
                                                  train_total_loss)
        val_inverse_loss_summ = tf.summary.scalar("val/icm_inverse_loss",
                                                  val_inverse_loss)
        val_forward_loss_summ = tf.summary.scalar("val/icm_forward_loss",
                                                  val_forward_loss)
        val_total_loss_summ = tf.summary.scalar("val/icm_total_loss",
                                                val_total_loss)
        train_summary_op = tf.summary.merge([
            train_inverse_loss_summ, train_forward_loss_summ, train_total_loss_summ
        ])
        val_summary_op = tf.summary.merge([
            val_inverse_loss_summ, val_forward_loss_summ, val_total_loss_summ
        ])

        logger.log("Finished creating ICM model")
        sess.run(tf.initialize_all_variables())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for timestep in range(args.num_itr):
                if timestep % args.log_freq == 0:
                    logger.log("Start itr {}".format(timestep))
                    _, train_summary = sess.run([icm_opt, train_summary_op])
                else:
                    sess.run(icm_opt)
                if timestep % args.log_freq == 0:
                    summary_writer.add_summary(train_summary, timestep)
                if timestep % args.save_freq == 0:
                    save_snapshot(_encoder, _inverse_model, _forward_model,
                                  args.tfmodel_path)
                if timestep % args.val_freq == 0:
                    val_summary = sess.run(val_summary_op)
                    summary_writer.add_summary(val_summary, timestep)
        except KeyboardInterrupt:
            print("End training...")

        coord.join(threads)
        sess.close()
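

# Illustrative only (not part of the original script): how the --forward_weight flag
# above combines the two ICM losses. With the default of 0.8 the forward
# (feature-prediction) term dominates the objective.
def icm_total_loss(forward_loss, inverse_loss, forward_weight=0.8):
    # total = w * L_forward + (1 - w) * L_inverse, matching the expression above
    return forward_weight * forward_loss + (1.0 - forward_weight) * inverse_loss

# e.g. icm_total_loss(0.5, 1.0) == 0.6 under the default weighting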
def experiment(variant):
    seed = variant['seed']
    n_parallel = variant['n_parallel']
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    expertDataLoc = variant['expertDataLoc']
    expertDataItr = variant['expertDataItr']

    fast_learning_rate = variant['flr']
    fast_batch_size = variant['fbs']  # 10 works for [0.1, 0.2]; 20 doesn't improve much for [0, 0.2]
    meta_batch_size = 20  # 10 also works, but much less stable; 20 is fairly stable, 40 is more stable
    max_path_length = 150
    num_grad_updates = 1
    meta_step_size = variant['mlr']

    regionSize = variant['regionSize']
    if regionSize == '20X20':
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_20X20_v1.pkl'
    else:
        assert regionSize == '60X30'
        tasksFile = '/root/code/multiworld/multiworld/envs/goals/PickPlace_60X30.pkl'
    tasks = pickle.load(open(tasksFile, 'rb'))

    envType = variant['envType']
    if envType == 'Push':
        baseEnv = SawyerPushEnv(tasks=tasks)
    else:
        assert envType == 'PickPlace'
        baseEnv = SawyerPickPlaceEnv(tasks=tasks)
    env = FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=['state_observation']))
    env = TfEnv(NormalizedBoxEnv(env))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_learning_rate,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=variant['hidden_sizes'],
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        n_itr=1000,
        use_maml=True,
        step_size=meta_step_size,
        plot=False,
        numExpertPolicies=20,
        expertDataInfo={
            'expert_loc': expertDataLoc,
            'expert_itr': expertDataItr
        })
    algo.train()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to snapshot file')
    parser.add_argument('--pixel', action='store_true')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--multistep', action='store_true')
    parser.add_argument('--step_size', type=int, default=5)
    parser.add_argument('--zero_action', action='store_true')
    parser.add_argument('--gt_action', action='store_true')
    args = parser.parse_args()

    with tf.Session() as sess:
        data = joblib.load(args.file)
        _encoder = data['encoder']
        _inverse_model = data['inverse_model']
        _forward_model = data['forward_model']

        if args.pixel:
            env = TfEnv(normalize(env=GymEnv(PIXEL_ENV, record_video=False,
                                             log_dir='/tmp/gym_test', record_log=False)))
        else:
            env = TfEnv(normalize(env=GymEnv(STATE_ENV, record_video=False,
                                             log_dir='/tmp/gym_test', record_log=False)))

        # Rebuild models
        act_space = env.action_space
        obs_space = env.observation_space
        qpos_dim = env.wrapped_env._wrapped_env.env.env.init_qpos.shape[0]
        s1_ph = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        s2_ph = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        a_ph = tf.placeholder(tf.float32, [None, act_space.flat_dim])
        clipped_a = tf.clip_by_value(a_ph, -1.0, 1.0)
        encoder1 = _encoder.get_weight_tied_copy(observation_input=s1_ph)
        encoder2 = _encoder.get_weight_tied_copy(observation_input=s2_ph)
        inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=encoder1.output, feature_input2=encoder2.output)
        forward_model = _forward_model.get_weight_tied_copy(
            feature_input=encoder1.output, action_input=clipped_a)

        # Load test data
        dataset_paths, datasets = load_dataset(args.pixel, args.multistep)
        env.reset()

        for dataset_path, data_dict in zip(dataset_paths, datasets):
            ef_xyz_pred_diff = []
            ef_xyz_diff = []
            action_diff = []
            qpos_diff = []
            qpos_pred_diff = []

            if args.multistep:
                print("===== Using multistep testing, step size: %d" % args.step_size)
            print("========================================")
            print("===== Evaluating inverse model on %s" % dataset_path)

            # states = data_dict['states']
            # next_states = data_dict['next_states']
            # obs = data_dict['obs']
            # next_obs = data_dict['next_obs']
            # actions = data_dict['actions']
            if args.multistep:
                states, next_states, obs, next_obs, actions = load_data_multistep(
                    data_dict, pixel=args.pixel, step_size=args.step_size)
            else:
                states, next_states, obs, next_obs, actions = load_data(data_dict, args.pixel)
            actions = np.clip(actions, -1.0, 1.0)

            if args.render:
                fig, [ax1, ax2, ax3] = plt.subplots(1, 3)
                plt.ion()
                ax1.set_title("t=0")
                ax2.set_title("t=1 after action")
                ax3.set_title("t=1 after predicted action")

            for state, next_state, ob, next_ob, action in zip(states, next_states,
                                                              obs, next_obs, actions):
                # print(state.shape)
                if args.multistep:
                    # Set state, get real img1
                    set_state(env, state[0], qpos_dim)
                    _end_ef_pos = get_ef_pos(env)
                    _qpos = get_qpos(env)
                    if args.render:
                        img = get_render_img(env)
                    o = ob[0]
                    # next_o = next_ob[0]
                    next_o = next_ob[-1]
                    for _ in range(args.step_size):
                        # Get predicted action from inverse model
                        pred_action = sess.run(inverse_model.output, {
                            s1_ph: [o],
                            s2_ph: [next_o],
                        })[0]
                        if args.gt_action:
                            pred_action = action[_]
                        if args.zero_action:
                            pred_action = np.zeros_like(action[_])
                        # ob = next_o
                        # next_o = next_ob[_]
                        # Step predicted action
                        o, r, d, env_info = env.step(pred_action)
                    # Get sim_img2 and sim ef position
                    s_end_ef_pos = get_ef_pos(env)
                    s_qpos = get_qpos(env)
                    if args.render:
                        s_img = get_render_img(env)
                    # Get real img2 and real ef position
                    set_state(env, next_state[args.step_size - 1], qpos_dim)
                    o_end_ef_pos = get_ef_pos(env)
                    o_qpos = get_qpos(env)
                    if args.render:
                        o_img = get_render_img(env)
                else:
                    # Set state, get real img1
                    # import pdb; pdb.set_trace()
                    set_state(env, state, qpos_dim)
                    _end_ef_pos = get_ef_pos(env)
                    # print("Real: ", _end_ef_pos)
                    _qpos = get_qpos(env)
                    if args.render:
                        img = get_render_img(env)
                    # Get predicted action from inverse model
                    pred_action = sess.run(inverse_model.output, {
                        s1_ph: [ob],
                        s2_ph: [next_ob],
                    })[0]
                    if args.zero_action:
                        pred_action = np.zeros_like(pred_action)
                    if args.gt_action:
                        pred_action = action
                    # Step action
                    env.step(pred_action)
                    # print(np.linalg.norm(next_state - get_state(env)))
                    # Get sim_img2 and sim ef position
                    s_end_ef_pos = get_ef_pos(env)
                    # print("Sim pos", s_end_ef_pos)
                    s_qpos = get_qpos(env)
                    if args.render:
                        s_img = get_render_img(env)
                    # Get real img2 and real ef position
                    set_state(env, next_state, qpos_dim)
                    o_end_ef_pos = get_ef_pos(env)
                    o_qpos = get_qpos(env)
                    # print(np.linalg.norm(s_qpos - o_qpos))
                    # print(np.linalg.norm(o_end_ef_pos - s_end_ef_pos))
                    if args.render:
                        o_img = get_render_img(env)

                if args.render:
                    ax1.imshow(img)
                    ax2.imshow(o_img)
                    ax3.imshow(s_img)
                    plt.show()
                    plt.pause(0.1)

                # print("Actual action: ", action)
                # print("Predicted action: ", pred_action)
                ef_xyz_pred_diff.append(np.linalg.norm(o_end_ef_pos - s_end_ef_pos))
                ef_xyz_diff.append(np.linalg.norm(o_end_ef_pos - _end_ef_pos))
                qpos_pred_diff.append(np.linalg.norm(o_qpos - s_qpos))
                qpos_diff.append(np.linalg.norm(o_qpos - _qpos))
                action_diff.append(((action - pred_action) ** 2).mean())

            print("===== 1. real s1, real s2 end effector position L2 distance med: %.5f, std: %.5f"
                  % (np.median(ef_xyz_diff), np.std(ef_xyz_diff)))
            print("===== 2. real s2, sim s2 end effector position L2 distance med: %.5f, std: %.5f"
                  % (np.median(ef_xyz_pred_diff), np.std(ef_xyz_pred_diff)))
            print("===== 3. real s1, real s2 joint position L2 distance med: %.5f, std: %.5f"
                  % (np.median(qpos_diff), np.std(qpos_diff)))
            print("===== 4. real s2, sim s2 joint position L2 distance med: %.5f, std: %.5f"
                  % (np.median(qpos_pred_diff), np.std(qpos_pred_diff)))
            if not args.multistep:
                print("===== 5. action - pred_action (per dim) sq L2 distance med: %.5f, std: %.5f"
                      % (np.median(action_diff), np.std(action_diff)))
            print("===== 6. action med: %.5f, std: %.5f"
                  % (np.median(np.abs(np.median(actions, axis=1))),
                     np.std(np.median(actions, axis=1))))
# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    direc = v['direc']
    oracle = v['oracle']
    if direc:
        if oracle:
            env = TfEnv(normalize(HalfCheetahEnvDirecOracle()))
        else:
            env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
    else:
        if oracle:
            env = TfEnv(normalize(HalfCheetahEnvOracle()))
        else:
            env = TfEnv(normalize(HalfCheetahEnvRand()))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
def main(
        exp_name,
        rundir='data',
        irl_pkl='',
        ent_wt=1.0,
        trpo_anneal_steps=None,
        trpo_anneal_init_ent=None,
        trpo_step=0.01,
        init_pol_std=1.0,
        method=None,
        hid_size=None,
        hid_layers=None,
        switch_env=None,
):
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(CustomGymEnv(this_env_name, record_video=False, record_log=False))

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(orig_env_name)
    env_trpo_params = irltrpo_params_for(orig_env_name, 'retrain')

    folder = os.path.dirname(irl_pkl)
    prior_params = load_prior_params(irl_pkl)
    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5)

    # For some reason IRLTRPO is responsible for setting weights in this code.
    # It would equally be possible to run global_variables_initializer()
    # ourselves and then do irl_model.set_params(prior_params) if we just
    # wanted to query energy, reward, etc. from the trained AIRL model without
    # using IRLTRPO.
    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        irl_model = AIRL(env=env,
                         expert_trajs=experts,
                         state_only=True,
                         freeze=True,
                         vairl=method == 'vairl',
                         vairl_beta=1e-4,
                         discrim_arch_args=disc_net_kwargs,
                         fitted_value_fn_arch_args=disc_net_kwargs)
    elif method in {'gail', 'vail'}:
        irl_model = GAIL(env,
                         expert_trajs=experts,
                         discrim_arch_args=disc_net_kwargs,
                         name=method,
                         freeze=True,
                         vail=method == 'vail')
    else:
        raise NotImplementedError("Don't know how to handle method '%s'" % method)

    pol_hid_sizes = (hid_size,) * hid_layers
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes,
                               init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=ent_wt,  # should be 1.0, but 0.1 seems to work better
        step_size=trpo_step,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        init_irl_params=prior_params,
        force_batch_sampler=True,
        entropy_anneal_init_weight=trpo_anneal_init_ent,
        entropy_anneal_steps=trpo_anneal_steps,
        retraining=True)
    irltrpo_kwargs.update(env_trpo_params)
    algo = IRLTRPO(**irltrpo_kwargs)

    folder_suffix = ''
    if switch_env is not None:
        # append lower-case environment name to the retrain folder path
        folder_suffix = '_%s' % switch_env.lower()
    with rllab_logdir(algo=algo,
                      dirname='%s/retrain%s' % (folder, folder_suffix)):
        with tf.Session():
            algo.train()
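

# Illustrative helper (not part of the original script): the comment in main() above
# notes that one could initialize the TF graph and load the trained AIRL parameters
# directly, instead of letting IRLTRPO restore them. A minimal sketch under the
# assumption that irl_model.set_params() and irl_model.eval() behave as in this
# codebase; `paths` is whatever sampled trajectories you want to score.
def query_learned_reward(irl_model, prior_params, paths):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        irl_model.set_params(prior_params)  # restore trained reward/discriminator weights
        return irl_model.eval(paths)        # per-path rewards under the learned reward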
from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.envs.gym_env import GymEnv

# env = TfEnv(normalize(CartpoleEnv(record_video=True, force_reset=True)))
envir = 'CartPole-v0'
env = TfEnv(normalize(GymEnv(envir, record_video=True, force_reset=True)))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
def get_env(env_name, record_video=True, record_log=True, normalize_obs=False, **kwargs):
    env = TfEnv(normalize(GymEnv(env_name,
                                 record_video=record_video,
                                 record_log=record_log),
                          normalize_obs=normalize_obs))
    return env
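

if __name__ == '__main__':
    # Illustrative usage of get_env only (not part of the original module); the
    # environment id is arbitrary and assumed to be registered with gym.
    env = get_env('HalfCheetah-v1', record_video=False, record_log=False,
                  normalize_obs=True)
    print(env.spec.observation_space, env.spec.action_space)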
def main():
    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--fw_ratio', type=float, default=0.1)
    parser.add_argument('--init_lr', type=float, default=5e-4)
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=0.99)
    args = parser.parse_args()

    # Param ranges
    seeds = range(2)
    for seed in seeds:
        mdp = TfEnv(normalize(env=GymEnv('Box3dReach-v17', record_video=False,
                                         log_dir='/tmp/gym_test', record_log=False),
                              normalize_obs=True))
        name = 'trpo-state-v17-tf-icm-fw{}-initlr-{}-norm'.format(
            args.fw_ratio, args.init_lr)

        policy = GaussianMLPPolicy(
            "mlp_policy",
            env_spec=mdp.spec,
            hidden_sizes=(64, 64, 32),
            output_nonlinearity=tf.nn.tanh,
            clip_action=False,
        )

        baseline = LinearFeatureBaseline(mdp.spec)

        batch_size = 50000
        algo = TRPO(
            env=mdp,
            policy=policy,
            baseline=baseline,
            batch_size=batch_size,
            whole_paths=True,
            max_path_length=1000,
            n_itr=1000,
            step_size=0.01,
            subsample_factor=1.0,
            sampler_cls=BatchSampler,
        )

        algorithm = ICM(
            mdp,
            algo,
            args.tfboard_path + "/%s_%d" % (name, seed),
            feature_dim=mdp.spec.observation_space.flat_dim,
            forward_weight=args.fw_ratio,
            external_reward_weight=0.0,
            replay_pool_size=1000000,
            init_learning_rate=args.init_lr,
            n_updates_per_iter=1000,
        )

        run_experiment_lite(algorithm.train(),
                            exp_prefix=name,
                            n_parallel=8,
                            snapshot_mode="gap",
                            snapshot_gap=200,
                            seed=seed,
                            mode="local")
log_dir = "./MultiEnv/Data" # generate TrainENV file # TrainEnvNum = 500 # env = TfEnv(GridBase(params)) # env._wrapped_env.generate_grid=True # env._wrapped_env.generate_b0_start_goal=True # for i in range(TrainEnvNum): # env.reset() # params = dict( # env=env, # ) # joblib.dump(params,log_dir+'/TrainEnv'+'/env_'+str(i)+'.pkl') # plot_env(env,save=True,path=log_dir+'/TrainEnv'+'/Map_'+str(i)+'.pdf') # generate TestENV file TestEnvNum = 50 env = TfEnv(GridBase(params)) env._wrapped_env.generate_grid = True env._wrapped_env.generate_b0_start_goal = True for i in range(TestEnvNum): env.reset() params = dict(env=env, ) joblib.dump(params, log_dir + '/TestEnv2' + '/env_' + str(i) + '.pkl') plot_env(env, save=True, path=log_dir + '/TestEnv2' + '/Map_' + str(i) + '.pdf')
params['obs_len'] = len(params['observe_directions'])
params['num_state'] = params['grid_n'] * params['grid_m']
params['traj_limit'] = 4 * (params['grid_n'] * params['grid_m'])  # 4 * (params['grid_n'] + params['grid_m'])
params['R_step'] = [params['R_step']] * params['num_action']
params['R_step'][params['stayaction']] = params['R_stay']

env_ref = joblib.load('./env.pkl')['env']
grid = env_ref._wrapped_env.grid
b0 = env_ref._wrapped_env.b0
start_state = env_ref._wrapped_env.start_state
goal_state = env_ref._wrapped_env.goal_state

env = TfEnv(GridBase(params, grid=grid, b0=b0,
                     start_state=start_state, goal_state=goal_state))
env._wrapped_env.generate_grid = False
env._wrapped_env.generate_b0_start_goal = False
env.reset()

log_dir = "./Data/obs_1goal20step0stay_1_gru"
tabular_log_file = osp.join(log_dir, "progress.csv")
text_log_file = osp.join(log_dir, "debug.log")
params_log_file = osp.join(log_dir, "params.json")
pkl_file = osp.join(log_dir, "params.pkl")

logger.add_text_output(text_log_file)
logger.add_tabular_output(tabular_log_file)
        return [2]


# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']

    if task_var == 0:
        env = TfEnv(normalize(AntEnvRandDirec()))
        task_var = 'direc'
    elif task_var == 1:
        env = TfEnv(normalize(AntEnvRand()))
        task_var = 'vel'
    elif task_var == 2:
        env = TfEnv(normalize(AntEnvRandGoal()))
        task_var = 'pos'
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )
def experiment(variant):
    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']
    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']

    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']

    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant['tasksFile'] + '.pkl'
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks)
    tasks = all_tasks[:meta_batch_size]

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images,
                                mpl=max_path_length, rewMode='l2Sparse')
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())
    else:
        raise AssertionError('Unsupported envType: %s' % envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(NormalizedBoxEnv(
            FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']

    if load_policy is not None:
        policy = None
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)
    elif 'fullAda_Bias' in policyType:
        policy = fullAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)
    elif 'biasAda_Bias' in policyType:
        policy = biasAda_Bias_policy(name="policy",
                                     env_spec=env.spec,
                                     grad_step_size=init_flr,
                                     hidden_nonlinearity=tf.nn.relu,
                                     hidden_sizes=(100, 100),
                                     init_flr_full=init_flr,
                                     latent_dim=ldim)
    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )
    elif 'conv' in policyType:
        baseline = ZeroBaseline(env_spec=env.spec)
        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        )

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=1,  # 100
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc)

    algo.train()
post_std_modifier_train_list = [1.0]
post_std_modifier_test_list = [0.00001]
use_maml = True

for post_std_modifier_train in post_std_modifier_train_list:
    for post_std_modifier_test in post_std_modifier_test_list:
        for pre_std_modifier in pre_std_modifier_list:
            for fast_learning_rate in fast_learning_rates:
                for bas in baselines:
                    stub(globals())
                    seed = 4
                    # env = TfEnv(normalize(GymEnv("Pusher-v0", force_reset=True, record_video=False)))  # TODO: force_reset was True
                    # xml_filepath = 'home/rosen/rllab_copy/vendor/local_mujoco_models/ensure_woodtable_distractor_pusher%s.xml' % seed
                    env = TfEnv(normalize(PusherEnv(distractors=True)))
                    # policy = MAMLGaussianMLPPolicy(
                    #     name="policy",
                    #     env_spec=env.spec,
                    #     grad_step_size=fast_learning_rate,
                    #     hidden_nonlinearity=HIDDEN_NONLINEARITY[nonlinearity_option],
                    #     hidden_sizes=(net_size, net_size),
                    #     output_nonlinearity=OUTPUT_NONLINEARITY[nonlinearity_option],
                    #     std_modifier=pre_std_modifier,
                    # )
                    if bas == 'zero':
                        baseline = ZeroBaseline(env_spec=env.spec)
                    elif 'linear' in bas:
                        baseline = LinearFeatureBaseline(env_spec=env.spec)
                    else:
rd.seed(seed)  ###
seed %= 4294967294
global seed_
seed_ = seed
rd.seed(seed)
np.random.seed(seed)
try:
    import tensorflow as tf
    tf.set_random_seed(seed)
except Exception as e:
    print(e)
print('using seed %s' % (str(seed)))

env = TfEnv(normalize(PointEnvRandGoal()))
policy = MAMLGaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    grad_step_size=fast_learning_rate,
    hidden_nonlinearity=tf.nn.relu,
    hidden_sizes=(100, 100),
    std_modifier=pre_std_modifier,
)
if bas == 'zero':
    baseline = ZeroBaseline(env_spec=env.spec)
elif 'linear' in bas:
    baseline = LinearFeatureBaseline(env_spec=env.spec)
else:
    baseline = GaussianMLPBaseline(env_spec=env.spec)
# expert_policy = PointEnvExpertPolicy(env_spec=env.spec)
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.bradly.third_person.policy.random_policy import RandomPolicy
from sandbox.bradly.third_person.algos.cyberpunk_trainer import CyberPunkTrainer
from sandbox.bradly.third_person.policy.expert_reacher import load_expert_reacher
from sandbox.bradly.third_person.envs.reacher import ReacherEnv
from sandbox.bradly.third_person.envs.reacher_two import ReacherTwoEnv
from sandbox.bradly.third_person.discriminators.discriminator import DomainConfusionDiscriminator
from sandbox.bradly.third_person.discriminators.discriminator import DomainConfusionVelocityDiscriminator

import tensorflow as tf

expert_env = TfEnv(normalize(ReacherEnv()))
novice_env = TfEnv(normalize(ReacherTwoEnv(), normalize_obs=True))
expert_fail_pol = RandomPolicy(expert_env.spec)

policy = GaussianMLPPolicy(
    name="novice_policy",
    env_spec=novice_env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=expert_env.spec)

algo = TRPO(
    env=novice_env,
    policy=policy,
post_std_modifier_test_list = [0.00001]
l2loss_std_mult_list = [1.0]
use_maml = True

for goals_suffix in goals_suffixes:
    for l2loss_std_mult in l2loss_std_mult_list:
        for post_std_modifier_train in post_std_modifier_train_list:
            for post_std_modifier_test in post_std_modifier_test_list:
                for pre_std_modifier in pre_std_modifier_list:
                    for fast_learning_rate in fast_learning_rates:
                        for beta_steps, adam_steps in beta_adam_steps_list:
                            for bas in baselines:
                                stub(globals())
                                seed = 1
                                env = TfEnv(normalize(HalfCheetahEnvRandSparse()))
                                policy = MAMLGaussianMLPPolicy(
                                    name="policy",
                                    env_spec=env.spec,
                                    grad_step_size=fast_learning_rate,
                                    hidden_nonlinearity=tf.nn.relu,
                                    hidden_sizes=(100, 100),
                                    std_modifier=pre_std_modifier,
                                )
                                if bas == 'zero':
                                    baseline = ZeroBaseline(env_spec=env.spec)
                                elif 'linear' in bas:
                                    baseline = LinearFeatureBaseline(env_spec=env.spec)
                                else:
from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
from sandbox.rocky.tf.envs.base import TfEnv
from multiworld.core.flat_goal_env import FlatGoalEnv
from multiworld.core.finn_maml_env import FinnMamlEnv
from multiworld.core.wrapper_env import NormalizedBoxEnv

stub(globals())

rate = 0.01
mode = 'local'

import tensorflow as tf

for goal in range(1, 100):
    baseEnv = FlatGoalEnv(SawyerPushEnv(tasks=None), obs_keys=['state_observation'])
    env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(baseEnv, reset_mode='task')))
    # env = WheeledEnvGoal()
    # env = TfEnv(env)  # only needed when the unwrapped env above is used instead
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_nonlinearity=tf.nn.relu,
                               hidden_sizes=(100, 100))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=150,
exp_names = [gen_name + name for name in names]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []
    for goal in goals:
        goal = list(goal)

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(PointEnvRandGoalOracle(goal=goal))
            n_itr = 1
        else:
            env = normalize(PointEnvRandGoal(goal=goal))
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )

        if initial_params_file is not None:
            policy = None

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(
            env=env,
            policy=policy,
            load_policy=initial_params_file,
]
other_env_class_map = {"Cartpole": CartpoleEnv}

if args.env in supported_gym_envs:
    gymenv = GymEnv(args.env, force_reset=True,
                    record_video=False, record_log=False)
    # gymenv.env.seed(1)
else:
    gymenv = other_env_class_map[args.env]()
    # TODO: assert continuous action space

env = TfEnv(normalize(gymenv))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The policy network has three hidden layers with 100, 50, and 25 units.
    hidden_sizes=(100, 50, 25),
    hidden_nonlinearity=tf.nn.relu,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=5000,
variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']
    oracle = v['oracle']

    if task_var == 0:
        task_var = 'direc'
        exp_prefix = 'bugfix_trpo_maml_antdirec' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvDirecOracle()))
        else:
            env = TfEnv(normalize(AntEnvRandDirec()))
    elif task_var == 1:
        task_var = 'vel'
        exp_prefix = 'posticml_trpo_maml_ant' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvOracle()))
        else:
            env = TfEnv(normalize(AntEnvRand()))
    elif task_var == 2:
        print("HERE")
        task_var = 'pos'
        exp_prefix = 'posticml_trpo_maml_antpos_' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvRandGoalOracle()))
key_path = '/home/ubuntu/.ssh/id_rsa_dl'
port = 22

# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 500
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']

    if task_var == 0:
        env = TfEnv(normalize(CellRobotRandDirectpi4Env()))
        task_var = 'directpi-4'
    elif task_var == 1:
        env = TfEnv(normalize(CellRobotRandDirectEnv()))
        task_var = 'direc'
    elif task_var == 2:
        env = TfEnv(normalize(CellRobotRandDirect2Env()))
        task_var = 'direc2'
    elif task_var == 3:
        env = TfEnv(normalize(CellRobotRandDirectpi4Env2()))  # body fixed at -pi/4
        task_var = 'direcpi-4-2'
    elif task_var == 4:
        env = TfEnv(normalize(CellRobotRandDirectBodyEnv()))  # use the body position as the state
        task_var = 'direc-body'
    exp_name = 'Cellrobot_trpo_maml' + task_var + '_' + str(
    1e-3
]  # 1e-3 works well for 1 step; trying lower for 2 steps, and 1e-2 for large batches
fast_learning_rates = [0.001]  # 0.5 works for [0.1, 0.2], too high for 2 steps
baselines = ['linear']
fast_batch_size = 20  # 10 works for [0.1, 0.2]; 20 doesn't improve much for [0, 0.2]
meta_batch_size = 20  # 10 also works, but much less stable; 20 is fairly stable, 40 is more stable
max_path_length = 500
num_grad_updates = 1
use_sensitive = True

for fast_learning_rate in fast_learning_rates:
    for learning_rate in learning_rates:
        for bas in baselines:
            stub(globals())
            env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
            policy = SensitiveGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif bas == 'linear':
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            algo = SensitiveTRPO(
                # algo = SensitiveVPG(
from envs.bullet.cartpole_bullet import CartPoleBulletEnv
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize

env = TfEnv(normalize(GymEnv("CartPoleBulletEnv-v0")))

policy = GaussianMLPPolicy(
    name="tf_gaussian_mlp",
    env_spec=env.spec,
    # The policy network has a single hidden layer with 8 units.
    hidden_sizes=(8, ))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.999,
    step_size=0.01,
    force_batch_sampler=True,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,