def test_param_space_noise(self):
    env = TfEnv(normalize(PointEnvMAML()))
    obs = env.reset()
    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy33",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
        param_noise_std=0.0,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    import rllab.misc.logger as logger
    logger.set_snapshot_dir('/tmp/')
    logger.set_snapshot_mode('last')

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=3,
        max_path_length=10,
        meta_batch_size=4,
        num_grad_updates=1,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()

    tf.reset_default_graph()
    pkl_file = os.path.join('/tmp/', 'params.pkl')
    with tf.Session() as sess:
        data = joblib.load(pkl_file)
        policy = data['policy']

        # With zero parameter noise, repeated calls are deterministic.
        action_1 = policy.get_action(obs)[1]['mean']
        action_2 = policy.get_action(obs)[1]['mean']
        diff = np.sum((action_1 - action_2) ** 2)
        self.assertAlmostEqual(diff, 0.0)

        # A per-call noise std should perturb the mean action.
        action_1 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
        action_2 = policy.get_action(obs, param_noise_std=1.0)[1]['mean']
        diff = np.sum((action_1 - action_2) ** 2)
        self.assertGreaterEqual(diff, 0.1)

        # Setting the attribute should have the same effect as the per-call argument.
        policy.param_noise_std = 1.0
        action_1 = policy.get_action(obs)[1]['mean']
        action_2 = policy.get_action(obs)[1]['mean']
        diff = np.sum((action_1 - action_2) ** 2)
        self.assertGreaterEqual(diff, 0.1)
def test_serialization(self):
    env = TfEnv(normalize(PointEnvMAML()))
    obs = env.reset()
    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy56",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    import rllab.misc.logger as logger
    logger.set_snapshot_dir('/tmp/')
    logger.set_snapshot_mode('last')

    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2,
        max_path_length=10,
        meta_batch_size=4,
        num_grad_updates=1,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()

    tf.reset_default_graph()
    pkl_file = os.path.join('/tmp/', 'params.pkl')
    with tf.Session() as sess:
        data = joblib.load(pkl_file)
        policy = data['policy']
        action_before = policy.get_action(obs)[1]['mean']
        dump_string = pickle.dumps(policy)

    tf.reset_default_graph()
    with tf.Session() as sess:
        # The unpickled policy must reproduce the same mean action.
        policy_loaded = pickle.loads(dump_string)
        action_after = policy_loaded.get_action(obs)[1]['mean']
        diff = np.sum(np.abs(action_before - action_after))
        self.assertAlmostEqual(diff, 0.0, places=3)
def test_get_mean_stepsize(self):
    env = TfEnv(normalize(PointEnvMAML()))
    obs = env.reset()
    policy = MAMLImprovedGaussianMLPPolicy(
        name="policy2",
        env_spec=env.spec,
        hidden_sizes=(16, 16),
        hidden_nonlinearity=tf.nn.tanh,
        trainable_step_size=True,
        grad_step_size=0.7,
    )
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Before any training, the mean trainable step size equals grad_step_size.
        mean_stepsize_1 = policy.get_mean_step_size()
        self.assertAlmostEqual(mean_stepsize_1, 0.7, places=5)
exp_names = [gen_name + name for name in names]
all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []
    for goal in goals:
        goal = list(goal)

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(PointEnvRandGoalOracle(goal=goal))
            n_itr = 1
        else:
            env = normalize(PointEnvRandGoal(goal=goal))
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # randomly initialized policy
            name='policy',
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )
        if initial_params_file is not None:
            # When fine-tuning from a snapshot, the policy is loaded via
            # load_policy below instead of being constructed here.
            policy = None

        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = VPG(
            env=env,
            policy=policy,
            load_policy=initial_params_file,
            baseline=baseline,
from sandbox_maml.rocky.tf.algos.trpo import TRPO
from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab_maml.envs.box2d.cartpole_env import CartpoleEnv
from rllab_maml.envs.normalized_env import normalize
from sandbox_maml.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy
from sandbox_maml.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
from sandbox_maml.rocky.tf.envs.base import TfEnv
import sandbox_maml.rocky.tf.core.layers as L
from sandbox_maml.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab_maml.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianLSTMPolicy(
    name="policy",
    env_spec=env.spec,
    lstm_layer_cls=L.TfBasicLSTMLayer,
    # gru_layer_cls=L.GRULayer,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=10,
# TODO: also implement the alternative KL variant.
variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    direc = v['direc']
    learning_rate = v['meta_step_size']

    if direc:
        env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
    else:
        env = TfEnv(normalize(HalfCheetahEnvRand()))
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        return [2]

# TODO: also implement the alternative KL variant.
variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']

    if task_var == 0:
        env = TfEnv(normalize(AntEnvRandDirec()))
        task_var = 'direc'
    elif task_var == 1:
        env = TfEnv(normalize(AntEnvRand()))
        task_var = 'vel'
    elif task_var == 2:
        env = TfEnv(normalize(AntEnvRandGoal()))
        task_var = 'pos'
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )
baselines = ['linear']
fast_batch_size = 20  # 10 works for [0.1, 0.2]; 20 doesn't improve much for [0, 0.2]
meta_batch_size = 40  # 10 also works, but much less stable; 20 is fairly stable; 40 is more stable
max_path_length = 100
num_grad_updates = 1
meta_step_size = 0.01

use_maml = True
interpreter_path = sys.executable

for fast_learning_rate in fast_learning_rates:
    for learning_rate in learning_rates:
        for bas in baselines:
            stub(globals())

            env = TfEnv(normalize(PointEnvRandGoal()))
            policy = MAMLGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif 'linear' in bas:
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            algo = MAMLTRPO(
                env=env,
fast_learning_rates = [0.1]
baselines = ['linear']
fast_batch_size = 20
meta_batch_size = 60
max_path_length = 10
num_grad_updates = 1
meta_step_size = 0.01

use_maml = True

for fast_learning_rate in fast_learning_rates:
    for bas in baselines:
        stub(globals())

        env = TfEnv(normalize(GridWorldEnvRand('four-state')))
        policy = MAMLCategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=fast_learning_rate,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )
        if bas == 'zero':
            baseline = ZeroBaseline(env_spec=env.spec)
        elif 'linear' in bas:
            baseline = LinearFeatureBaseline(env_spec=env.spec)
        else:
            baseline = GaussianMLPBaseline(env_spec=env.spec)
        algo = MAMLTRPO(
            env=env,
variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']
    oracle = v['oracle']

    if task_var == 0:
        task_var = 'direc'
        exp_prefix = 'bugfix_trpo_maml_antdirec' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvDirecOracle()))
        else:
            env = TfEnv(normalize(AntEnvRandDirec()))
    elif task_var == 1:
        task_var = 'vel'
        exp_prefix = 'posticml_trpo_maml_ant' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvOracle()))
        else:
            env = TfEnv(normalize(AntEnvRand()))
    elif task_var == 2:
        task_var = 'pos'
        exp_prefix = 'posticml_trpo_maml_antpos_' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvRandGoalOracle()))
        else: