def test_gaussian_mlp_policy(self):
    gaussian_mlp_policy = GaussianMLPPolicy(env_spec=self.env,
                                            hidden_sizes=(1, ))
    self.sess.run(tf.compat.v1.global_variables_initializer())

    obs = self.env.observation_space.high
    assert gaussian_mlp_policy.get_action(obs)
def test_get_action(self, mock_normal, obs_dim, action_dim):
    mock_normal.return_value = 0.5
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'gaussian_mlp_policy.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicy(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)

    action, prob = policy.get_action(obs)

    expected_action = np.full(action_dim, 0.75)
    expected_mean = np.full(action_dim, 0.5)
    expected_log_std = np.full(action_dim, np.log(0.5))

    assert env.action_space.contains(action)
    assert np.array_equal(action, expected_action)
    assert np.array_equal(prob['mean'], expected_mean)
    assert np.array_equal(prob['log_std'], expected_log_std)

    actions, probs = policy.get_actions([obs, obs, obs])
    for action, mean, log_std in zip(actions, probs['mean'],
                                     probs['log_std']):
        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)
        # Check the per-sample distribution info returned by get_actions,
        # not the prob dict from the earlier single-action call.
        assert np.array_equal(mean, expected_mean)
        assert np.array_equal(log_std, expected_log_std)
def test_is_pickleable(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'gaussian_mlp_policy.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicy(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)
    obs_dim = env.spec.observation_space.flat_dim

    with tf.compat.v1.variable_scope('GaussianMLPPolicy/GaussianMLPModel',
                                     reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # assign it to all ones
    return_var.load(tf.ones_like(return_var).eval())
    output1 = self.sess.run(
        policy.model.outputs[:-1],
        feed_dict={policy.model.input: [obs.flatten()]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        output2 = sess.run(
            policy_pickled.model.outputs[:-1],
            feed_dict={policy_pickled.model.input: [obs.flatten()]})
        assert np.array_equal(output1, output2)
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        _ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
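# Example of launching the run_task defined above. This is a minimal sketch
# that assumes metarl exposes the same run_experiment helper as garage; the
# snapshot_mode and seed values are illustrative, not taken from the original
# script.
from metarl.experiment import run_experiment

run_experiment(
    run_task,            # experiment entry point defined above
    snapshot_mode='last',
    seed=1,
)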
def trpo_swimmer(ctxt=None, seed=1, batch_size=4000):
    """Train TRPO with Swimmer-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=batch_size)
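# Example invocation of the trpo_swimmer launcher above. This assumes, as in
# the garage-style examples, that the function is decorated with
# @wrap_experiment (not shown here) so that the ctxt argument is supplied
# automatically when the launcher is called.
trpo_swimmer(seed=1, batch_size=4000)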
def test_process_samples_continuous_non_recurrent(self):
    env = TfEnv(DummyBoxEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec)
    baseline = GaussianMLPBaseline(env_spec=env.spec)
    max_path_length = 100
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = BatchPolopt2(env_spec=env.spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=max_path_length,
                            flatten_input=True)
        runner.setup(algo, env, sampler_args=dict(n_envs=1))
        runner.train(n_epochs=1, batch_size=max_path_length)
        paths = runner.obtain_samples(0)
        samples = algo.process_samples(0, paths)
        # Since there is only 1 vec_env in the sampler and DummyBoxEnv
        # never terminates until it reaches max_path_length, the batch
        # size must be max_path_length, i.e. 100
        assert samples['observations'].shape == (
            max_path_length, env.observation_space.flat_dim)
        assert samples['actions'].shape == (max_path_length,
                                            env.action_space.flat_dim)
        assert samples['rewards'].shape == (max_path_length, )
        assert samples['baselines'].shape == (max_path_length, )
        assert samples['returns'].shape == (max_path_length, )
        # there is only 1 path
        assert samples['lengths'].shape == (1, )
        # non-recurrent policy has empty agent info
        assert samples['agent_infos'] == {}
        # DummyBoxEnv has env_info dummy
        assert samples['env_infos']['dummy'].shape == (max_path_length, )
        assert isinstance(samples['average_return'], float)
def test_meta_evaluator_with_tf():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        ctxt = SnapshotConfig(snapshot_dir=log_dir_name,
                              snapshot_mode='none',
                              snapshot_gap=1)
        with LocalTFRunner(ctxt) as runner:
            meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                      max_path_length=max_path_length,
                                      n_test_tasks=10,
                                      n_exploration_traj=n_traj)
            policy = GaussianMLPPolicy(env.spec)
            algo = MockTFAlgo(env, policy, max_path_length, n_traj,
                              meta_eval)
            runner.setup(algo, env)
            log_file = tempfile.NamedTemporaryFile()
            csv_output = CsvOutput(log_file.name)
            logger.add_output(csv_output)
            meta_eval.evaluate(algo)
            algo_pickle = cloudpickle.dumps(algo)
            tf.compat.v1.reset_default_graph()
            with LocalTFRunner(ctxt) as runner:
                algo2 = cloudpickle.loads(algo_pickle)
                runner.setup(algo2, env)
                runner.train(10, 0)
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env1 = MetaRLEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = MetaRLEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
def trpo_ml1(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        Ml1_reach_envs = get_ML1_envs_test(env_id)
        env = MTMetaWorldWrapper(Ml1_reach_envs)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(64, 64),
                                use_trust_region=False),
        )

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)

        timesteps = 6000000
        batch_size = 150 * env.num_tasks
        epochs = timesteps // batch_size

        print(f'epochs: {epochs}, batch_size: {batch_size}')

        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]

    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(DmControlEnv.from_suite(*task))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=5,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=10)

        env.close()
def run_task(snapshot_config, *_):
    """Run the job.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=200, batch_size=4000)
def vpgis_inverted_pendulum(ctxt=None, seed=1):
    """Train VPG with importance sampling on InvertedPendulum-v2.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=40, batch_size=4000)
def trpo_mt50(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MultiEnvWrapper(MT50_envs,
                              env_ids,
                              sample_strategy=round_robin_strategy)

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
            ),
        )

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=1500, batch_size=len(MT50_envs) * 10 * 150)
def test_rl2_sampler_less_envs_than_meta_batch(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                   hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=self.env.spec)

        algo = PPO(env_spec=self.env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=self.max_path_length,
                   discount=0.99)

        runner.setup(algo,
                     env=self.env,
                     sampler_cls=RL2Sampler,
                     sampler_args=dict(
                         meta_batch_size=self.meta_batch_size,
                         n_envs=self.meta_batch_size // 2))
        runner._start_worker()

        assert isinstance(runner._sampler, RL2Sampler)
        assert runner._sampler._envs_per_worker == 1
        all_indices = np.arange(self.meta_batch_size)
        for i in range(self.meta_batch_size // 2):
            assert all(runner._sampler._vec_envs_indices[i] ==
                       all_indices[i * 2:i * 2 + 2])

        paths = runner._sampler.obtain_samples(0)
        assert len(paths) == self.meta_batch_size
        assert len(paths[0]['observations']) == self.max_path_length

        paths = runner._sampler.obtain_samples(
            0, self.meta_batch_size * 10 * self.max_path_length)
        assert len(paths) == self.meta_batch_size * 10
        assert len(paths[0]['observations']) == self.max_path_length
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        # 'seed' is expected to be defined at module level by the script
        # that declares this run_task.
        runner.setup(algo,
                     env,
                     sampler_cls=RaySampler,
                     sampler_args={'seed': seed})
        runner.train(n_epochs=40, batch_size=4000)
def gaussian_mlp_policy(ctxt, env_id, seed):
    """Benchmark the Gaussian MLP policy with TF PPO.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    learning_rate=1e-3,
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
def tf_ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=True,
            ),
        )

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn on stop_entropy_gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.002,
            center_adv=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=4096, plot=False)
def setup_method(self):
    super().setup_method()
    self.env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    self.baseline = GaussianMLPBaseline(
        env_spec=self.env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
def ppo_ml1(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        Ml1_reach_envs = get_ML1_envs_test(env_id)
        env = MTMetaWorldWrapper(Ml1_reach_envs)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
            hidden_w_init=tf.constant_initializer(np.sqrt(2)),
            hidden_b_init=tf.constant_initializer(np.sqrt(2)),
        )

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                hidden_w_init=tf.constant_initializer(np.sqrt(2)),
                hidden_b_init=tf.constant_initializer(np.sqrt(2)),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=150,
            discount=0.99,
            gae_lambda=0.97,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=30,
                max_epochs=4,
                tf_optimizer_args=dict(learning_rate=3e-4),
            ),
        )

        timesteps = 6000000
        batch_size = 150 * env.num_tasks
        epochs = timesteps // batch_size

        print(f'epochs: {epochs}, batch_size: {batch_size}')

        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
def test_dist_info_sym(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'gaussian_mlp_policy.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicy(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)

    obs_dim = env.spec.observation_space.flat_dim
    obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim))

    dist1_sym = policy.dist_info_sym(obs_ph, name='p1_sym')

    # flatten output
    expected_mean = [np.full(np.prod(action_dim), 0.5)]
    expected_log_std = [np.full(np.prod(action_dim), np.log(0.5))]

    prob = self.sess.run(dist1_sym, feed_dict={obs_ph: [obs.flatten()]})
    assert np.array_equal(prob['mean'], expected_mean)
    assert np.array_equal(prob['log_std'], expected_log_std)
def test_is_pickleable(self, obs_dim, action_dim):
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_var = tf.compat.v1.placeholder(
        tf.float32,
        shape=[None, None, env.observation_space.flat_dim],
        name='obs')
    policy = GaussianMLPPolicy(env_spec=env.spec)

    policy.build(obs_var)
    obs = env.reset()

    with tf.compat.v1.variable_scope('GaussianMLPPolicy/GaussianMLPModel',
                                     reuse=True):
        bias = tf.compat.v1.get_variable(
            'dist_params/mean_network/hidden_0/bias')
    # assign it to all ones
    bias.load(tf.ones_like(bias).eval())

    output1 = self.sess.run(
        [policy.distribution.loc,
         policy.distribution.stddev()],
        feed_dict={policy.model.input: [[obs.flatten()]]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, env.observation_space.flat_dim],
            name='obs')
        policy_pickled = pickle.loads(p)
        policy_pickled.build(obs_var)
        output2 = sess.run(
            [
                policy_pickled.distribution.loc,
                policy_pickled.distribution.stddev()
            ],
            feed_dict={policy_pickled.model.input: [[obs.flatten()]]})
        assert np.array_equal(output1, output2)
def run_task(self, snapshot_config, *_):
    # Use tf.compat.v1 for the session configuration so this runs under
    # TF 2.x, consistent with the rest of these snippets.
    config = tf.compat.v1.ConfigProto(device_count={'GPU': 0},
                                      allow_soft_placement=True,
                                      intra_op_parallelism_threads=12,
                                      inter_op_parallelism_threads=12)
    sess = tf.compat.v1.Session(config=config)
    with LocalTFRunner(snapshot_config=snapshot_config,
                       sess=sess) as runner:
        env = gym.make(self._env)
        env = TfEnv(normalize(env))
        env.reset()

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
def ppo_mt10_sampling(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MultiEnvSamplingWrapper(MT10_envs,
                                      env_ids,
                                      len(env_ids) - skip_size,
                                      sample_strategy=round_robin_strategy)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=150,
            discount=0.99,
            gae_lambda=0.97,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=3e-4),
            ),
        )

        batch_size = (len(env_ids) - skip_size) * 10 * 150
        epochs = (total_steps // batch_size) + 10
        print('epochs:', epochs, 'batch_size:', batch_size)

        runner.setup(algo, env)
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=4000)
def trpo_swimmer_ray_sampler(ctxt=None, seed=1):
    """Train TRPO on Swimmer-v2 using RaySampler.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    # Since this is an example, we are running ray in a reduced state.
    # One can comment this call out in order to run ray at full capacity.
    ray.init(memory=52428800,
             object_store_memory=78643200,
             ignore_reinit_error=True,
             log_to_driver=False,
             include_webui=False)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        set_seed(seed)
        env = MetaRLEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=RaySampler,
                     sampler_args={'seed': seed})
        runner.train(n_epochs=40, batch_size=4000)
def test_rl2_sampler_invalid_num_of_env_again(self):
    with pytest.raises(
            ValueError,
            match='n_envs must be a multiple of meta_batch_size'):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                       hidden_sizes=[32, 32])

            baseline = LinearFeatureBaseline(env_spec=self.env.spec)

            algo = PPO(env_spec=self.env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=self.max_path_length,
                       discount=0.99)

            runner.setup(algo,
                         env=self.env,
                         sampler_cls=RL2Sampler,
                         sampler_args=dict(
                             meta_batch_size=self.meta_batch_size,
                             n_envs=self.meta_batch_size + 1))
            runner._start_worker()
            runner._sampler.obtain_samples(0)
def trpo_metarl_tf(ctxt, env_id, seed):
    """Create metarl TensorFlow TRPO model and training.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=hyper_parameters['max_path_length'],
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])
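# Hypothetical hyper_parameters dictionary assumed by trpo_metarl_tf above.
# The keys match the lookups in the launcher; the values are illustrative
# placeholders, not the benchmark's actual settings.
hyper_parameters = {
    'hidden_sizes': (32, 32),
    'max_path_length': 100,
    'discount': 0.99,
    'gae_lambda': 0.98,
    'max_kl': 0.01,
    'n_epochs': 100,
    'batch_size': 1024,
}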
def test_tnpg_inverted_pendulum(self):
    """Test TNPG with InvertedPendulum-v2 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TNPG(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    optimizer_args=dict(reg_coeff=5e-1))

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 15

        env.close()
def test_ppo_pendulum_continuous_baseline(self):
    """Test PPO with Pendulum environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 100

        env.close()
def test_get_action(self, obs_dim, action_dim):
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_var = tf.compat.v1.placeholder(
        tf.float32,
        shape=[None, None, env.observation_space.flat_dim],
        name='obs')
    policy = GaussianMLPPolicy(env_spec=env.spec)

    policy.build(obs_var)
    env.reset()
    obs, _, _, _ = env.step(1)

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions(
        [obs.flatten(), obs.flatten(), obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)