def run_task(vv):
    env = TfEnv(
        normalize(
            GymEnv('HalfCheetah-v1', record_video=False, record_log=False)))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        name="policy")

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=vv["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_eval_task(vv):
    # Load policy and baseline. Warning: this resets the tf graph and also
    # returns the tensorflow session, which must be used by all subsequent code.
    policy, baseline, env, sess = eval.load_saved_objects(vv)

    # fix the mujoco parameters
    env_class = eval.get_env_class(env)
    env = TfEnv(
        normalize(
            env_class(log_scale_limit=vv["log_scale_limit"],
                      fix_params=True,
                      random_seed=vv['env_param_seed'])))
    # TODO: maybe adjust log_scale_limit of the environment

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['batch_size'],
        max_path_length=vv['path_length'],
        n_itr=30,
        discount=vv['discount'],
        step_size=vv["step_size"],
    )
    algo.train(sess=sess)
def run_task(args, *_):
    metaworld_train_env = benchmark.get_train_tasks()
    wrapped_train_env = MetaworldWrapper(metaworld_train_env)
    env = TfEnv(wrapped_train_env)

    metaworld_test_env = benchmark.get_test_tasks()
    wrapped_test_env = MetaworldWrapper(metaworld_test_env)
    test_env = TfEnv(wrapped_test_env)

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        test_env=test_env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        # batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
def run_task(v):
    env = TfEnv(normalize(Reacher7DofMultitaskEnvOracle(distractors=True)))
    # policy = GaussianMLPPolicy(
    #     name="policy",
    #     env_spec=env.spec,
    #     hidden_nonlinearity=tf.nn.relu,
    #     hidden_sizes=(256, 256),
    # )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        # policy=policy,
        policy=None,
        # To load a saved policy, the definition of policy above must stay commented out.
        load_policy='/home/kevin/maml_rl/data/local/R7DOF-ET-E3.3/R7DOF_ET_E3.3_2018_01_01_14_27_38_0001/itr_-140.pkl',
        baseline=baseline,
        batch_size=10 * 30,  # 400 * 200; divided by the number of envs on every iteration
        batch_size_expert_traj=100 * 30,
        max_path_length=30,
        start_itr=-1,
        n_itr=201,  # the last iteration number, not the total number of iterations
        discount=0.99,
        step_size=0.00,  # 0.01
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        action_noise_train=0.0,
        action_noise_test=0.1,
        make_video=True,
        save_expert_traj_dir=EXPERT_TRAJ_LOCATION_DICT[env_option + ".local_test"],
        goals_pool_to_load=R7DOF_GOALS_LOCATION,
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env_class'](
        fix_goal=vv['fix_goal'],
    )))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        adaptive_std=vv['adaptive_policy_std']
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['batch_size'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        force_batch_sampler=True
    )
    algo.train()
def generate_expert_dp():
    env = TfEnv(normalize(InvertedPendulumEnv()))

    policy = GaussianMLPPolicy(
        name="expert_policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64),
        std_hidden_sizes=(64, 64),
        adaptive_std=True,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=64,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)),
        gae_lambda=0.97,
    )

    with tf.Session() as sess:
        algo.train(sess=sess)
        t = rollout(env=env, agent=policy, max_path_length=100, animated=False)
        print(sum(t['rewards']))
        with open('expert_dp.pickle', 'wb') as handle:
            pickle.dump(policy, handle)
        while True:
            rollout(env=env, agent=policy, max_path_length=100, animated=False)
def run_task(v):
    env = TfEnv(normalize(HalfCheetahEnvOracle()))
    # policy = GaussianMLPPolicy(
    #     name="policy",
    #     env_spec=env.spec,
    #     hidden_nonlinearity=tf.nn.relu,
    #     hidden_sizes=(256, 256),
    # )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        # policy=policy,
        policy=None,
        load_policy='/home/rosen/maml_rl/data/local/CH-TRPO-inc/CH_TRPO_inc_2018_08_29_17_04_16_0001/itr_-20.pkl',
        baseline=baseline,
        batch_size=400 * 200,  # divided by the number of envs on every iteration
        batch_size_expert_traj=2000 * 200,
        max_path_length=200,
        start_itr=-1,
        n_itr=43,  # the last iteration number, not the total number of iterations
        discount=0.99,
        step_size=0.01,  # 0.01
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        action_noise_train=0.0,
        action_noise_test=0.0,
        make_video=True,
        save_expert_traj_dir=EXPERT_TRAJ_LOCATION_DICT[env_option + "." + mode + "_sparse_1"],
        goals_pool_to_load=CHEETAH_GOALS_LOC_SPARSE,
    )
    algo.train()
def run_task(*_):
    env = TfEnv(
        normalize(GymEnv("Reacher-v1", force_reset=True, record_video=True)))
    # env = TfEnv(normalize(PusherEnv()))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 128 hidden units.
        hidden_sizes=(128, 128))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100 * 500,
        max_path_length=100,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    )
    algo.train()
def run_task(v):
    env = TfEnv(normalize(ReacherEnvOracleNoise(option='g200nfj', noise=0.0)))
    # policy = GaussianMLPPolicy(
    #     name="policy",
    #     env_spec=env.spec,
    #     hidden_nonlinearity=tf.nn.relu,
    #     hidden_sizes=(100, 100),
    # )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        # policy=policy,
        policy=None,
        load_policy='/home/rosen/maml_rl/data/local/RE-ET-B1/RE_ET_B1_2017_10_09_17_28_33_0001/itr_-20.pkl',
        baseline=baseline,
        batch_size=200 * 50,  # 100 * 500; divided by the number of envs on every iteration
        batch_size_expert_traj=40 * 50,
        max_path_length=50,
        start_itr=-2,
        n_itr=1000,  # the last iteration number, not the total number of iterations
        discount=0.99,
        step_size=0.008,  # 0.01
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        action_noise_train=0.0,
        action_noise_test=0.1,
        save_expert_traj_dir=EXPERT_TRAJ_LOCATION_DICT[env_option + ".local.small"],
        goals_pool_to_load=GOALS_LOCATION,
    )
    algo.train()
def run_task(v):
    env = TfEnv(normalize(HalfCheetahEnvOracle()))
    # policy = GaussianMLPPolicy(
    #     name="policy",
    #     env_spec=env.spec,
    #     hidden_nonlinearity=tf.nn.relu,
    #     hidden_sizes=(256, 256),
    # )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        # policy=policy,
        policy=None,
        # To load a saved policy, the definition of policy above must stay commented out.
        load_policy='/home/kevin/maml_rl/data/local/CH-ET-D5.6/CH_ET_D5.6_2017_10_24_18_32_56_0001/itr_-20.pkl',
        baseline=baseline,
        batch_size=400 * 200,  # divided by the number of envs on every iteration
        batch_size_expert_traj=20 * 200,
        max_path_length=200,
        start_itr=-1,
        n_itr=1001,  # the last iteration number, not the total number of iterations
        discount=0.99,
        step_size=0.01,  # 0.01
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        action_noise_train=0.0,
        action_noise_test=0.1,
        save_expert_traj_dir=EXPERT_TRAJ_LOCATION_DICT[env_option + ".local.noise0.1.small"],
        goals_pool_to_load=CHEETAH_GOALS_LOCATION,
    )
    algo.train()
def run_task(args, *_):
    # env = TfEnv(normalize(dnc_envs.create_stochastic('pick')))  # Cannot be solved easily by TRPO
    # env = TfEnv(normalize(CartpoleEnv()))
    env = TfEnv(CartpoleEnv())

    # metaworld_env = ML1.get_train_tasks("pick-place-v1")
    # tasks = metaworld_env.sample_tasks(1)
    # metaworld_env.set_task(tasks[0])
    # metaworld_env._observation_space = convert_gym_space(metaworld_env.observation_space)
    # metaworld_env._action_space = convert_gym_space(metaworld_env.action_space)
    # env = TfEnv(normalize(metaworld_env))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        # batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(
        normalize(vv['env_class'](
            fix_goal=vv['fix_goal'],
            reward_type=vv['reward_type'],
            init_puck_low=INIT_PUCK_TARGET - vv['init_slack'],
            init_puck_high=INIT_PUCK_TARGET + vv['init_slack'],
            puck_goal_low=PUCK_GOAL_TARGET - vv['goal_slack'],
            puck_goal_high=PUCK_GOAL_TARGET + vv['goal_slack'],
        )))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=vv['hidden_sizes'],
                               hidden_nonlinearity=vv['hidden_nonlinearity'],
                               adaptive_std=vv['adaptive_policy_std'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=vv['batch_size'],
                max_path_length=vv['path_length'],
                n_itr=vv['n_itr'],
                discount=vv['discount'],
                step_size=vv["step_size"],
                force_batch_sampler=True)
    algo.train()
def run_train_task(vv):
    env = TfEnv(
        normalize(
            CassieEnv(fixed_gains=vv['fixed_gains'],
                      stability_cost_coef=vv['stability_cost_coef'],
                      ctrl_cost_coef=vv['ctrl_cost_coef'],
                      alive_bonus=vv['alive_bonus'])))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=vv['hidden_sizes'],
                               hidden_nonlinearity=vv['hidden_nonlinearity'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=vv['batch_size'],
                max_path_length=vv['path_length'],
                n_itr=vv['n_itr'],
                discount=vv['discount'],
                step_size=vv["step_size"],
                force_batch_sampler=True)
    algo.train()
def run_task(args, *_):
    env = TfEnv(normalize(
        dnc_envs.create_stochastic('pick')))  # Cannot be solved easily by TRPO

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
def test_multiagent_ngsim_env(self):
    basedir = os.path.expanduser('~/.julia/packages/NGSIM/9OYUa/data')
    filename = 'trajdata_i101_trajectories-0750am-0805am.txt'
    filepaths = [os.path.join(basedir, filename)]
    n_veh = 5
    env = JuliaEnv(env_id='MultiagentNGSIMEnv',
                   env_params=dict(n_veh=n_veh,
                                   trajectory_filepaths=filepaths,
                                   H=200,
                                   primesteps=50),
                   using='AutoEnvs')
    low, high = env.action_space.low, env.action_space.high
    env = TfEnv(env)

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32),
                               std_hidden_sizes=(32, 32),
                               adaptive_std=True,
                               output_nonlinearity=None,
                               learn_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=1,
                batch_size=1000,
                sampler_args=dict(n_envs=n_veh))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            algo.train(sess=sess)
        except Exception as e:
            self.fail('exception incorrectly raised: {}'.format(e))
def main(exp_name, ent_wt=0.1, visible_gpus='0', discount=0.99):
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    with tf.Session(config=tf_config) as sess:
        algo = TRPO(
            env=env,
            policy=policy,
            n_itr=3000,
            batch_size=20000,
            max_path_length=1000,
            discount=discount,
            store_paths=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            step_size=0.01,
            entropy_weight=ent_wt,
            sess=sess,
            exp_name=exp_name,
        )
        with rllab_logdir(algo=algo, dirname='data/swimmer'):
            algo.train(sess)
def test_maml_sampling(self):
    # get trajectories from data
    env = PointEnv()
    paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
    dynamics_model = MLPDynamicsEnsemble("dyn_model3", env, hidden_sizes=(16, 16), num_models=4)

    obs = np.concatenate([path['observations'] for path in paths], axis=0)
    obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
    act = np.concatenate([path['actions'] for path in paths], axis=0)

    env = TfEnv(normalize(PointEnv()))

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy3",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        grad_step_size=0.1,
        hidden_nonlinearity=tf.nn.tanh,
        trainable_step_size=False,
        bias_transform=False
    )

    from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # fit dynamics model
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=1)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=20000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
        )
        algo.meta_batch_size = dynamics_model.num_models
        algo.batch_size_dynamics_samples = algo.batch_size
        algo.dynamics_model = dynamics_model

        itr = 1

        model_sampler = MAMLModelVectorizedSampler(algo)
        model_sampler.start_worker()
        paths = model_sampler.obtain_samples(itr, return_dict=True)
        samples_data = model_sampler.process_samples(itr, paths[0])

        print(samples_data.keys())
def run_linear_ocm_exp(variant):
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
    import sandbox.rocky.tf.core.layers as L
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from railrl.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly,
    )
    from railrl.envs.memory.high_low import HighLow
    from railrl.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']

    set_seed(seed)
    onehot_dim = num_values + 1

    """
    Code for running the experiment.
    """
    # env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = HighLow(num_steps=H)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    env = FlattenedProductBox(env)

    policy = GaussianLSTMPolicy(
        name="policy",
        env_spec=env.spec,
        lstm_layer_cls=L.LSTMLayer,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    trpo_params = variant['trpo_params']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **trpo_params)

    algo.train()
def run_linear_ocm_exp(variant):
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from rlkit.envs.flattened_product_box import FlattenedProductBox
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from rlkit.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly,
    )
    from rlkit.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']

    set_seed(seed)
    onehot_dim = num_values + 1

    """
    Code for running the experiment.
    """
    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    env = FlattenedProductBox(env)

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    trpo_params = variant['trpo_params']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **trpo_params)

    algo.train()
def test_model_sampling_with_dummy_different_meta_batch_size(self):
    env = DummyEnv()
    dynamics_dummy = DummyDynamicsEnsemble("dyn_model4", env, num_models=4)
    env = TfEnv(normalize(DummyEnv()))

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy4",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        grad_step_size=0.1,
        hidden_nonlinearity=tf.nn.tanh,
        trainable_step_size=False,
        bias_transform=False
    )

    from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # fit dynamics model
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=20000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
        )
        algo.meta_batch_size = dynamics_dummy.num_models * 2
        algo.batch_size_dynamics_samples = algo.batch_size
        algo.dynamics_model = dynamics_dummy

        itr = 1

        model_sampler = MAMLModelVectorizedSampler(algo)
        model_sampler.start_worker()
        paths = model_sampler.obtain_samples(itr, return_dict=True)

        n_steps_per_model = np.array(
            [np.sum([path['observations'].shape[0] for path in model_paths])
             for model_paths in paths.values()])

        self.assertTrue(
            all(np.abs(n_steps_per_model - algo.batch_size // algo.meta_batch_size)
                <= algo.max_path_length))

        for i in range(dynamics_dummy.num_models):
            for path in paths[i]:
                self.assertTrue(
                    (np.logical_or(path['observations'] == 1.0,
                                   path['observations'] == i // 2 * 0.01)).all())
def test_model_sampling_with_given_traj_starting_obs(self):
    env = DummyEnv()
    dynamics_dummy = DummyDynamicsEnsemble("dyn_model4", env, num_models=4)
    env = TfEnv(normalize(DummyEnv()))

    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy4",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        grad_step_size=0.1,
        hidden_nonlinearity=tf.nn.tanh,
        trainable_step_size=False,
        bias_transform=False
    )

    from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # fit dynamics model
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=20000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
        )
        algo.meta_batch_size = dynamics_dummy.num_models * 2
        algo.batch_size_dynamics_samples = algo.batch_size
        algo.dynamics_model = dynamics_dummy

        itr = 1

        model_sampler = MAMLModelVectorizedSampler(algo)
        model_sampler.start_worker()
        traj_starting_obs = np.array([[-1, -1], [-0.5, -0.5]])
        paths = model_sampler.obtain_samples(itr, return_dict=True,
                                             traj_starting_obs=traj_starting_obs)

        for i in range(dynamics_dummy.num_models):
            for path in paths[i]:
                print(path['observations'][0])
                print(np.abs(np.mean(path['observations'][0]) + 1.0) < 0.001)
                self.assertTrue(
                    np.abs(np.mean(path['observations'][0]) + 1.0) < 0.001 or
                    np.abs(np.mean(path['observations'][0]) + 0.5) < 0.001)
def test_random_sampling(self):
    # get from data
    env = PointEnv()
    paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
    dynamics_model = MLPDynamicsModel("dyn_model2", env, hidden_sizes=(16, 16))

    obs = np.concatenate([path['observations'] for path in paths], axis=0)
    obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
    act = np.concatenate([path['actions'] for path in paths], axis=0)

    env = TfEnv(normalize(PointEnv()))

    policy = GaussianMLPPolicy(
        name="policy2",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # fit dynamics model
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        dynamics_model.fit(obs, act, obs_next, epochs=5)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=20000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
        )
        algo.dynamics_model = dynamics_model

        itr = 1

        random_sampler = RandomVectorizedSampler(algo)
        random_sampler.start_worker()
        paths = random_sampler.obtain_samples(itr)
        samples_data = random_sampler.process_samples(itr, paths)

        self.assertTrue(
            set(samples_data.keys()) >= set(
                ['actions_dynamics', 'next_observations_dynamics', 'observations_dynamics']))
def get_algo(env_name, use_eval, init_path, horizon, batch_size, n_itr,
             discount, step_size, gae):
    env = get_env(env_name)
    policy = GaussianMLPPolicy(
        name='policy',
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        # output_nonlinearity=tf.nn.tanh
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    kwargs = dict(env=env,
                  policy=policy,
                  baseline=baseline,
                  batch_size=batch_size,
                  max_path_length=horizon,
                  n_itr=n_itr,
                  discount=discount,
                  step_size=step_size,
                  gae_lambda=gae)

    if use_eval:
        kwargs["reset_init_path"] = os.path.join(config.PROJECT_PATH,
                                                 get_eval_data_path[env_name])
        kwargs["horizon"] = horizon
    if init_path is not None:
        kwargs["initialized_path"] = init_path

    return TRPO(**kwargs)
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = TRPO(
        env=env,
        policy=policy,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )
    with rllab_logdir(algo=algo, dirname='data/pendulum'):
        algo.train()
def main():
    env = TfEnv(CustomGymEnv('PointMazeRight-v0'))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = TRPO(
        env=env,
        policy=policy,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )
    with rllab_logdir(algo=algo, dirname='data/point_trpo'):
        algo.train()
def main(env_name, n_itr, batch_size, max_path_length):
    env_id = env_names_to_ids[env_name]
    env = TfEnv(GymEnv(env_id, record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = TRPO(env=env,
                policy=policy,
                n_itr=n_itr,
                batch_size=batch_size,
                max_path_length=max_path_length,
                discount=0.99,
                store_paths=True,
                baseline=LinearFeatureBaseline(env_spec=env.spec))
    with rllab_logdir(algo=algo, dirname=DATA_DIR + '/' + env_name):
        algo.train()
def run_experiment(params):
    params_base = copy.copy(DEFAULTS)
    params_base.update(params)
    params = params_base

    policy = RecurrentCategoricalPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_dims=params["policy_hidden_dims"],
        feature_network=MLPNetworkWithEmbeddings(
            "embeddings", len(VOCAB), params["feature_dim"],
            params["feature_hidden_dims"], tf.tanh, tf.tanh, len(VOCAB),
            params["embedding_dim"], has_other_input=False),
        state_include_action=False,
    )

    baseline = LinearFeatureBaseline(env.spec)

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        base_eps=1e-5))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=LENGTH,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="autoenc_unnorm_reward",
        variant=params,
    )
def opt_trpo(env, baseline, policy, **kwargs):
    return TRPO(env=env,
                policy=policy,
                baseline=baseline,
                discount=0.99,
                step_size=0.001,
                batch_size=4000,
                n_itr=int(1e9),
                **kwargs)
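# --- Usage sketch for the opt_trpo factory above (not part of the original
# scripts). A minimal example, assuming the standard rllab / sandbox.rocky.tf
# imports used elsewhere in this collection; the CartpoleEnv choice, the
# (32, 32) policy sizes, and the max_path_length override are illustrative
# assumptions, not values from the source.
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize

env = TfEnv(normalize(CartpoleEnv()))
policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
baseline = LinearFeatureBaseline(env_spec=env.spec)

# Extra TRPO keywords (e.g. max_path_length) pass straight through **kwargs.
algo = opt_trpo(env, baseline, policy, max_path_length=100)
algo.train()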
def run_task(v):
    if local:
        # xml_filepath = DOCKER_CODE_DIR + 'vendor/local_mujoco_models/pusher' + str(v['seed']) + '.xml'
        xml_filepath = DOCKER_CODE_DIR + 'vendor/local_mujoco_models/ensure_woodtable_distractor_pusher' + str(
            v['seed']) + '.xml'
    else:
        xml_filepath = DOCKER_CODE_DIR + 'vendor/mujoco_models/ensure_woodtable_distractor_pusher' + str(
            v['seed']) + '.xml'
    exp_log_info = {'xml': xml_filepath}

    gym_env = PusherEnv(xml_file=xml_filepath)  # **{'xml_file': xml_filepath})  # , 'distractors': True})
    # gym_env = GymEnv('Pusher-v0', force_reset=True, record_video=False)
    # TODO - this is hacky...
    # mujoco_env.MujocoEnv.__init__(gym_env.env.env.env, xml_filepath, 5)
    env = TfEnv(normalize(gym_env))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(128, 128))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        # load_policy='/home/kevin/rllab_copy/data/local/rllab-fixed-push-experts/pretraining_policy3/itr_300.pkl',
        # load_policy='vendor/pretraining_policy3/itr_300.pkl',
        baseline=baseline,
        batch_size=100 * 500,
        max_path_length=100,
        n_itr=301,
        discount=0.99,
        step_size=0.01,
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        exp_log_info=exp_log_info,
    )
    algo.train()
def run_task(args, *_):
    # env = TfEnv(normalize(dnc_envs.create_stochastic('pick')))  # Cannot be solved easily by TRPO
    metaworld_env = ML1.get_train_tasks(
        'pick-place-v1')  # Create an environment with task `pick_place`
    tasks = metaworld_env.sample_tasks(
        1)  # Sample a task (in this case, a goal variation)
    metaworld_env.set_task(tasks[0])  # Set task
    # print(metaworld_env.id)
    # print("HERE")
    # import pdb; pdb.set_trace()
    metaworld_env = GymEnv2(metaworld_env)
    # metaworld_env.observation_space = convert_gym_space(metaworld_env.observation_space)
    # metaworld_env.action_space = convert_gym_space(metaworld_env.action_space)
    # env = metaworld_env
    env = TfEnv(normalize(metaworld_env))  # Cannot be solved easily by TRPO

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
else:
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,
    n_itr=10000,
    discount=0.99,
    step_size=0.01,
    force_batch_sampler=True,  # for TF
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    plot=True,
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name, help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--enable_obsnorm', action='store_true', default=False)
    parser.add_argument('--chunked', action='store_true', default=False)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true', default=False)
    parser.add_argument('--anneal_step_size', type=int, default=0)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--radius', type=float, default=0.015)
    parser.add_argument('--n_evaders', type=int, default=10)
    parser.add_argument('--n_pursuers', type=int, default=8)
    parser.add_argument('--n_poison', type=int, default=10)
    parser.add_argument('--n_coop', type=int, default=4)
    parser.add_argument('--n_sensors', type=int, default=30)
    parser.add_argument('--sensor_range', type=str, default='0.2')
    parser.add_argument('--food_reward', type=float, default=5)
    parser.add_argument('--poison_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.05)
    parser.add_argument('--reward_mech', type=str, default='local')
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str, help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False,
                        help='Whether to only print the tabular log information (in a horizontal format)')

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    centralized = True if args.control == 'centralized' else False

    sensor_range = np.array(map(float, args.sensor_range.split(',')))
    if len(sensor_range) == 1:
        sensor_range = sensor_range[0]
    else:
        assert sensor_range.shape == (args.n_pursuers,)

    env = MAWaterWorld(args.n_pursuers, args.n_evaders, args.n_coop, args.n_poison,
                       radius=args.radius, n_sensors=args.n_sensors,
                       food_reward=args.food_reward,
                       poison_reward=args.poison_reward,
                       encounter_reward=args.encounter_reward,
                       reward_mech=args.reward_mech,
                       sensor_range=sensor_range, obstacle_loc=None)

    env = TfEnv(
        RLLabEnv(
            StandardizedEnv(env, scale_reward=args.reward_scale,
                            enable_obsnorm=args.enable_obsnorm),
            mode=args.control))

    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        feature_network = MLP(
            name='feature_net',
            input_shape=(env.spec.observation_space.flat_dim + env.spec.action_space.flat_dim,),
            output_dim=16,
            hidden_sizes=(128, 64, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None)
        if args.recurrent == 'gru':
            policy = GaussianGRUPolicy(env_spec=env.spec,
                                       feature_network=feature_network,
                                       hidden_dim=int(args.policy_hidden_sizes),
                                       name='policy')
        elif args.recurrent == 'lstm':
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        feature_network=feature_network,
                                        hidden_dim=int(args.policy_hidden_sizes),
                                        name='policy')
    else:
        policy = GaussianMLPPolicy(
            name='policy',
            env_spec=env.spec,
            hidden_sizes=tuple(map(int, args.policy_hidden_sizes.split(','))),
            min_std=10e-5)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif args.baseline_type == 'mlp':
        raise NotImplementedError()
        # baseline = GaussianMLPBaseline(
        #     env_spec=env.spec,
        #     hidden_sizes=tuple(map(int, args.baseline_hidden_sizes.split(','))))
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        # max_path_length_limit=args.max_path_length_limit,
        update_max_path_length=args.update_curriculum,
        anneal_step_size=args.anneal_step_size,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) if args.recurrent else None,
        mode=args.control if not args.chunked else 'chunk_{}'.format(args.control),
    )

    algo.train()
    )

baseline = LinearFeatureBaseline(env_spec=env.spec)

with tf.Session() as sess:
    for env_name, env in envs:
        logger.log("Training Policy on %s" % env_name)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=args.batch_size,
            max_path_length=env.horizon,
            n_itr=args.num_epochs,
            discount=0.99,
            step_size=args.step_size,
            optimizer=ConjugateGradientOptimizer(
                reg_coeff=args.reg_coeff,
                hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff))
        )
        custom_train(algo, sess=sess)
        rollouts = algo.obtain_samples(args.num_epochs + 1)
        logger.log("Average reward for training rollouts on (%s): %f +- %f " %
                   (env_name,
                    np.mean([np.sum(p['rewards']) for p in rollouts]),
                    np.std([np.sum(p['rewards']) for p in rollouts])))

    # Final evaluation on all environments using the learned policy
    total_rollouts = []
from sandbox.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy
from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
from sandbox.rocky.tf.envs.base import TfEnv
import sandbox.rocky.tf.core.layers as L
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab.misc.instrument import stub, run_experiment_lite

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianLSTMPolicy(
    name="policy",
    env_spec=env.spec,
    lstm_layer_cls=L.TfBasicLSTMLayer,
    # gru_layer_cls=L.GRULayer,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=10,
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)
algo.train()