def test_cem_cartpole(self):
    """Test CEM with CartPole-v1 environment."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        n_samples = 10

        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.1,
                   max_path_length=100,
                   n_samples=n_samples)

        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        rtn = runner.train(n_epochs=10, batch_size=2048)
        assert rtn > 40

        env.close()
def run_task(snapshot_config, *_):
    """Run the job.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=40, batch_size=4000)
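# A minimal launch sketch, not part of the original file: it assumes the
# classic `run_experiment` entry point is importable from metarl.experiment
# (its presence is implied by "since we are not using run_experiment"
# comments elsewhere in this codebase), and that `run_task` above is in
# scope. The snapshot mode and seed values are illustrative only.
from metarl.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',  # keep only the most recent snapshot
    seed=1,  # hypothetical seed for reproducibility
)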
def reps_gym_cartpole(ctxt=None, seed=1):
    """Train REPS with CartPole-v0 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MetaRLEnv(gym.make('CartPole-v0'))

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000, plot=False)
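# A minimal invocation sketch, not part of the original file: it assumes
# `wrap_experiment` is importable from the top-level metarl package (the
# `ctxt` argument and ExperimentContext docstring above imply this pattern)
# and that it injects `ctxt` when the wrapped function is called.
from metarl import wrap_experiment

reps_gym_cartpole = wrap_experiment(reps_gym_cartpole)
reps_gym_cartpole(seed=1)  # ctxt is supplied by the decorator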
def fixture_exp(snapshot_config, sess):
    """Dummy fixture experiment function.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        sess (tf.Session): An optional TensorFlow session.
            A new session will be created immediately if not provided.

    Returns:
        np.ndarray: Values of the parameters evaluated in the current session

    """
    with LocalTFRunner(snapshot_config=snapshot_config, sess=sess) as runner:
        env = MetaRLEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01))

        runner.setup(algo, env)
        runner.train(n_epochs=5, batch_size=100)

        return policy.get_param_values()
def trpois_inverted_pendulum(ctxt=None, seed=1):
    """Train TRPO on InvertedPendulum-v2 with importance sampling.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=200, batch_size=4000)
def test_rl2_sampler_less_envs_than_meta_batch(self):
    """Test RL2Sampler when n_envs is less than meta_batch_size."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                   hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=self.env.spec)

        algo = PPO(env_spec=self.env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=self.max_path_length,
                   discount=0.99)

        runner.setup(algo,
                     env=self.env,
                     sampler_cls=RL2Sampler,
                     sampler_args=dict(
                         meta_batch_size=self.meta_batch_size,
                         n_envs=self.meta_batch_size // 2))
        runner._start_worker()

        assert isinstance(runner._sampler, RL2Sampler)
        assert runner._sampler._envs_per_worker == 1
        all_indices = np.arange(self.meta_batch_size)
        for i in range(self.meta_batch_size // 2):
            assert all(runner._sampler._vec_envs_indices[i] ==
                       all_indices[i * 2:i * 2 + 2])

        paths = runner._sampler.obtain_samples(0)
        assert len(paths) == self.meta_batch_size
        assert len(paths[0]['observations']) == self.max_path_length

        paths = runner._sampler.obtain_samples(
            0, self.meta_batch_size * 10 * self.max_path_length)
        assert len(paths) == self.meta_batch_size * 10
        assert len(paths[0]['observations']) == self.max_path_length
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    seed = 1  # seed forwarded to the Ray sampler workers
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo,
                     env,
                     sampler_cls=RaySampler,
                     sampler_args={'seed': seed})
        runner.train(n_epochs=40, batch_size=4000)
def rl2_ppo_metaworld_ml1_push(ctxt, seed, max_path_length, meta_batch_size,
                               n_epochs, episode_per_task):
    """Train PPO with ML1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
            env=mwb.ML1.get_train_tasks('push-v1')))

        env_spec = RL2Env(env=mwb.ML1.get_train_tasks('push-v1')).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=max_path_length * episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
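# A hypothetical click-based launcher for the experiment above, not part of
# the original file: it assumes `rl2_ppo_metaworld_ml1_push` is decorated
# with metarl's `wrap_experiment` (which injects `ctxt`), as the
# ExperimentContext docstring suggests. Option names mirror the function's
# keyword arguments; the defaults are illustrative only.
import click


@click.command()
@click.option('--seed', default=1)
@click.option('--max_path_length', default=100)
@click.option('--meta_batch_size', default=10)
@click.option('--n_epochs', default=10)
@click.option('--episode_per_task', default=4)
def main(**kwargs):
    """Parse CLI options and forward them to the wrapped experiment."""
    rl2_ppo_metaworld_ml1_push(**kwargs)


if __name__ == '__main__':
    main()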
def maml_trpo_metaworld_ml45(ctxt, seed, epochs, rollouts_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = MetaRLEnv(
        normalize(mwb.ML45.get_train_tasks(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = LinearFeatureBaseline(env_spec=env.spec)

    max_path_length = 100

    test_task_names = mwb.ML45.get_test_tasks().all_task_names
    test_tasks = [
        MetaRLEnv(
            normalize(mwb.ML45.from_task(task), expected_action_scale=10.))
        for task in test_task_names
    ]
    test_sampler = EnvPoolSampler(test_tasks)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=len(test_task_names))

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_path_length=max_path_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
def setup_method(self):
    """Setup method which is called before every test."""
    self.env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    self.baseline = LinearFeatureBaseline(env_spec=self.env.spec)
def setup_method(self):
    """Setup method which is called before every test."""
    self.env = MetaRLEnv(
        normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    self.baseline = LinearFeatureBaseline(env_spec=self.env.spec)
def run_metarl(env, seed, log_dir):
    """Create metarl PyTorch MAML model and training.

    Args:
        env (MetaRLEnv): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hyper_parameters['hidden_sizes'],
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = MAMLTRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=hyper_parameters['max_path_length'],
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    meta_batch_size=hyper_parameters['meta_batch_size'],
                    inner_lr=hyper_parameters['inner_lr'],
                    max_kl_step=hyper_parameters['max_kl'],
                    num_grad_updates=hyper_parameters['num_grad_update'])

    # Set up logger since we are not using run_experiment
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='all',
                                     snapshot_gap=1)

    runner = LocalRunner(snapshot_config=snapshot_config)
    runner.setup(algo, env, sampler_args=dict(n_envs=5))
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=(hyper_parameters['fast_batch_size'] *
                             hyper_parameters['max_path_length']))

    dowel_logger.remove_all()

    return tabular_log_file
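# A hypothetical driver for the benchmark helper above, not part of the
# original file: it reuses the HalfCheetahDirEnv wrapping seen in the test
# fixtures in this codebase; the seed and log directory are illustrative.
env = MetaRLEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
csv_path = run_metarl(env, seed=1, log_dir='/tmp/maml_trpo_trial_1')
print('progress written to', csv_path)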
def setup_method(self):
    super().setup_method()
    self.max_path_length = 100
    self.meta_batch_size = 10
    self.episode_per_task = 4
    self.tasks = task_sampler.SetTaskSampler(
        lambda: RL2Env(env=normalize(HalfCheetahDirEnv())))
    self.env_spec = RL2Env(env=normalize(HalfCheetahDirEnv())).spec
    self.policy = GaussianGRUPolicy(env_spec=self.env_spec,
                                    hidden_dim=64,
                                    state_include_action=False)
    self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)
def run_metarl(env, seed, log_dir):
    '''Create metarl model and run training.

    Replace PPO with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the output csv file.
    '''
    deterministic.set_seed(seed)

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=488, batch_size=2048)

        dowel_logger.remove_all()

        return tabular_log_file
def rl2_trpo_halfcheetah(ctxt, seed, max_path_length, meta_batch_size,
                         n_epochs, episode_per_task):
    """Train TRPO with HalfCheetah environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=HalfCheetahVelEnv()))

        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2TRPO(rl2_max_path_length=max_path_length,
                       meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_path_length * episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                           base_eps=1e-5)))

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
def run_metarl_tf(env, seed, log_dir):
    """Create metarl TensorFlow VPG model and training.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TF_VPG(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      max_path_length=hyper_parameters['max_path_length'],
                      discount=hyper_parameters['discount'],
                      center_adv=hyper_parameters['center_adv'],
                      optimizer_args=dict(
                          tf_optimizer_args=dict(
                              learning_rate=hyper_parameters['learning_rate']),
                          verbose=True))  # yapf: disable

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file
def run_metarl_pytorch(env, seed, log_dir):
    """Create metarl PyTorch PPO model and training.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    env = TfEnv(normalize(env))

    deterministic.set_seed(seed)

    runner = LocalRunner(snapshot_config)

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=hyper_parameters['hidden_sizes'],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = PyTorch_PPO(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       optimizer=torch.optim.Adam,
                       policy_lr=hyper_parameters['learning_rate'],
                       max_path_length=hyper_parameters['max_path_length'],
                       discount=hyper_parameters['discount'],
                       gae_lambda=hyper_parameters['gae_lambda'],
                       center_adv=hyper_parameters['center_adv'],
                       lr_clip_range=hyper_parameters['lr_clip_range'])

    # Set up logger since we are not using run_experiment
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])

    dowel_logger.remove_all()

    return tabular_log_file
def setup_method(self):
    """Setup method which is called before every test."""
    self._env = MetaRLEnv(gym.make('InvertedDoublePendulum-v2'))
    self._runner = LocalRunner(snapshot_config)

    self._policy = GaussianMLPPolicy(env_spec=self._env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=torch.tanh,
                                     output_nonlinearity=None)

    self._params = {
        'env_spec': self._env.spec,
        'policy': self._policy,
        'baseline': LinearFeatureBaseline(env_spec=self._env.spec),
        'max_path_length': 100,
        'discount': 0.99,
    }
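# A hypothetical test body, not part of the original file, showing how the
# fixture above is typically consumed inside the same test class: the
# algorithm class (VPG) and the one-epoch training budget are illustrative.
def test_algo_trains_from_fixture(self):
    """Smoke-test an algorithm built from the shared fixture params."""
    algo = VPG(**self._params)  # unpack the shared constructor arguments
    self._runner.setup(algo, self._env)
    self._runner.train(n_epochs=1, batch_size=100)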
def run_task(*_):
    env = normalize(gym.make('Pendulum-v0'))

    policy = DummyPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = InstrumentedNOP(env=env,
                           policy=policy,
                           baseline=baseline,
                           batch_size=4000,
                           max_path_length=100,
                           n_itr=4,
                           discount=0.99,
                           step_size=0.01,
                           plot=True)
    algo.train()

    env.close()
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('CartPole-v0'))

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000, plot=False)
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=4000)
def test_train(self):
    """Test that LocalTFRunner.train runs end to end with VPG."""
    with LocalTFRunner(snapshot_config) as runner:
        env = MetaRLEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01))

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=100)
def categorical_gru_policy(ctxt, env_id, seed):
    """Create Categorical GRU Policy on TF-PPO.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))

        policy = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=488, batch_size=2048)
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
def setup_method(self):
    """Setup method which is called before every test."""
    self.env = MetaRLEnv(
        normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    self.baseline = LinearFeatureBaseline(env_spec=self.env.spec)
    self.algo = MAMLPPO(env=self.env,
                        policy=self.policy,
                        baseline=self.baseline,
                        max_path_length=100,
                        meta_batch_size=5,
                        discount=0.99,
                        gae_lambda=1.,
                        inner_lr=0.1,
                        num_grad_updates=1)
def test_make_sampler_ray_sampler(self):
    """Test that runner.setup constructs a RaySampler."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01)))

        runner.setup(algo, env, sampler_cls=RaySampler)
        assert isinstance(runner._sampler, RaySampler)
        runner.train(n_epochs=1, batch_size=10)
def multi_env_ppo(ctxt=None, seed=1):
    """Train PPO on two Atari environments simultaneously.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env1 = MetaRLEnv(normalize(gym.make('Adventure-ram-v4')))
        env2 = MetaRLEnv(normalize(gym.make('Alien-ram-v4')))
        env = MultiEnvWrapper([env1, env2])

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       learning_rate=1e-3,
                   ))

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=2048, plot=False)
def test_reps_cartpole(self):
    """Test REPS with gym CartPole environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(gym.make('CartPole-v0'))

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5

        env.close()
def fixture_exp(snapshot_config, sess):
    """Dummy fixture experiment function.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        sess (tf.Session): An optional TensorFlow session.

    Returns:
        np.ndarray: Values of the parameters evaluated in the current session

    """
    with LocalTFRunner(snapshot_config=snapshot_config, sess=sess) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01)))

        runner.setup(algo, env)
        runner.train(n_epochs=5, batch_size=100)

        return policy.get_param_values()
def trpo_cartpole_batch_sampler(ctxt=None,
                                seed=1,
                                batch_size=4000,
                                max_path_length=100):
    """Train TRPO with CartPole-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.
        max_path_length (int): Number of timesteps to truncate paths to.

    """
    set_seed(seed)
    n_envs = batch_size // max_path_length
    with LocalTFRunner(ctxt, max_cpus=n_envs) as runner:
        env = MetaRLEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': n_envs})

        runner.train(n_epochs=100, batch_size=batch_size, plot=False)