def run_task(snapshot_config, *_):
    """Run task."""
    global l_pre_init
    params.l_pre_init = l_pre_init

    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        # Alternative: normalize only the reward.
        # env = TfEnv(normalize(MassSpringEnv_OptL_HwAsAction(params),
        #                       normalize_action=False,
        #                       normalize_obs=False,
        #                       normalize_reward=True,
        #                       reward_alpha=0.1))
        env = TfEnv(MassSpringEnv_OptL_HwAsAction(params))
        # zip_project(log_dir=runner._snapshotter._snapshot_dir)

        comp_policy_model = MLPModel(
            output_dim=1,
            hidden_sizes=params.comp_policy_network_size,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=tf.nn.tanh,
        )

        mech_policy_model = MechPolicyModel_OptL_FixedHW(params)

        # Reused policy class from the HwAsAction variant.
        policy = CompMechPolicy_OptL_HwAsAction(
            name='comp_mech_policy',
            env_spec=env.spec,
            comp_policy_model=comp_policy_model,
            mech_policy_model=mech_policy_model)

        # baseline = GaussianMLPBaseline(
        #     env_spec=env.spec,
        #     regressor_args=dict(
        #         hidden_sizes=params.baseline_network_size,
        #         hidden_nonlinearity=tf.nn.tanh,
        #         use_trust_region=True,
        #     ),
        # )
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   **params.ppo_algo_kwargs)

        runner.setup(algo, env)
        runner.train(**params.ppo_inner_train_kwargs)

    tf.compat.v1.reset_default_graph()
def run_garage_pytorch(env, seed, log_dir):
    """Create garage PyTorch VPG model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    env = TfEnv(normalize(env))

    deterministic.set_seed(seed)

    runner = LocalRunner(snapshot_config)

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=hyper_parameters['hidden_sizes'],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = PyTorch_VPG(env_spec=env.spec,
                       policy=policy,
                       optimizer=torch.optim.Adam,
                       policy_lr=hyper_parameters['learning_rate'],
                       baseline=baseline,
                       max_path_length=hyper_parameters['max_path_length'],
                       discount=hyper_parameters['discount'],
                       center_adv=hyper_parameters['center_adv'])

    # Set up logger since we are not using run_experiment
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])

    dowel_logger.remove_all()

    return tabular_log_file
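# A minimal sketch of how the CSV path returned above might be consumed,
# e.g. to pull one logged metric per epoch. The column name
# 'Evaluation/AverageReturn' is an assumption about the dowel CSV schema,
# not something fixed by the code above.
import csv


def read_progress_column(csv_path, column='Evaluation/AverageReturn'):
    """Return one column of a dowel progress.csv as a list of floats."""
    with open(csv_path) as f:
        return [float(row[column])
                for row in csv.DictReader(f) if row.get(column)]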
def run_garage(env, seed, log_dir):
    """Create garage TensorFlow TRPO model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=hyper_parameters['max_path_length'],
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file
def run_task(*_):
    env = normalize(gym.make('Pendulum-v0'))

    policy = DummyPolicy(env_spec=env)

    baseline = LinearFeatureBaseline(env_spec=env)

    algo = InstrumentedNOP(env=env,
                           policy=policy,
                           baseline=baseline,
                           batch_size=4000,
                           max_path_length=100,
                           n_itr=4,
                           discount=0.99,
                           step_size=0.01,
                           plot=True)
    algo.train()
    env.close()
def setup_method(self):
    self._env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
    self._runner = LocalRunner(snapshot_config)

    policy = GaussianMLPPolicy(env_spec=self._env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    self._params = {
        'env_spec': self._env.spec,
        'policy': policy,
        'optimizer': torch.optim.Adam,
        'baseline': LinearFeatureBaseline(env_spec=self._env.spec),
        'max_path_length': 100,
        'discount': 0.99,
        'policy_lr': 1e-2
    }
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    step_size=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=4000)
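# A minimal launch sketch for the legacy run_task style above, assuming the
# run_experiment entry point from older garage releases; the snapshot_mode
# and seed values here are illustrative, not from the original.
from garage.experiment import run_experiment

run_experiment(run_task, snapshot_mode='last', seed=1)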
def categorical_gru_policy(ctxt, env_id, seed):
    """Create Categorical GRU Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=488, batch_size=2048)
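# A minimal launch sketch, assuming categorical_gru_policy is wrapped with
# garage's wrap_experiment so that the ExperimentContext (ctxt) is injected
# automatically; the env_id and seed values are illustrative.
from garage import wrap_experiment

categorical_gru_policy = wrap_experiment(categorical_gru_policy)
categorical_gru_policy(env_id='CartPole-v1', seed=1)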
def setup_method(self):
    super().setup_method()
    self.meta_batch_size = 10
    self.episode_per_task = 4
    self.max_episode_length = 100
    self.inner_max_episode_length = (self.max_episode_length *
                                     self.episode_per_task)
    self.tasks = task_sampler.SetTaskSampler(
        lambda: RL2Env(normalize(GymEnv(HalfCheetahDirEnv()))))
    self.env_spec = RL2Env(
        normalize(
            GymEnv(HalfCheetahDirEnv(),
                   max_episode_length=self.inner_max_episode_length))).spec
    self.policy = GaussianGRUPolicy(env_spec=self.env_spec,
                                    hidden_dim=64,
                                    state_include_action=False)
    self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(gym.make('CartPole-v0'))

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000, plot=False)
def test_train(self):
    with LocalTFRunner(snapshot_config) as runner:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01))

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=100)
def test_rl2_ppo_ml10(self):
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import ML10
    ML_train_envs = [
        RL2Env(ML10.from_task(task_name))
        for task_name in ML10.get_train_tasks().all_task_names
    ]
    tasks = task_sampler.EnvPoolSampler(ML_train_envs)
    tasks.grow_pool(self.meta_batch_size)

    env_spec = ML_train_envs[0].spec
    policy = GaussianGRUPolicy(env_spec=env_spec,
                               hidden_dim=64,
                               state_include_action=False,
                               name='policy')
    baseline = LinearFeatureBaseline(env_spec=env_spec)

    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = RL2PPO(rl2_max_path_length=self.max_path_length,
                      meta_batch_size=self.meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=self.max_path_length *
                      self.episode_per_task)

        runner.setup(algo,
                     self.tasks.sample(self.meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=self.meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(
                         n_paths_per_trial=self.episode_per_task))

        runner.train(n_epochs=1,
                     batch_size=self.episode_per_task *
                     self.max_path_length * self.meta_batch_size)
def multi_env_ppo(ctxt=None, seed=1):
    """Train PPO on two Atari environments simultaneously.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env1 = normalize(GymEnv('Adventure-ram-v4'))
        env2 = normalize(GymEnv('Alien-ram-v4'))
        env = MultiEnvWrapper([env1, env2])

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=120, batch_size=2048, plot=False)
def test_make_sampler_local_sampler(self):
    with TFTrainer(snapshot_config) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01))

        trainer.setup(algo, env, sampler_cls=LocalSampler)
        assert isinstance(trainer._sampler, LocalSampler)
        trainer.train(n_epochs=1, batch_size=10)
def run_task(snapshot_config, *_): """Run task.""" with LocalTFRunner(snapshot_config=snapshot_config) as runner: env = gym.make('FetchPush-v1') env = TfEnv(wp.FlattenDictWrapper(env, dict_keys=["observation", "desired_goal"])) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=500, discount=0.99, max_kl_step=0.01) runner.setup(algo, env) runner.train(n_epochs=40, batch_size=4000)
def run_task(snapshot_config, *_):
    with LocalRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01)))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=10000)
def maml_trpo(ctxt, seed, epochs, rollouts_per_task, meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = GarageEnv(
        normalize(ML10.get_train_tasks(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    max_path_length = 100

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
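# A minimal launch sketch for maml_trpo, again assuming a wrap_experiment
# wrapper supplies ctxt; the hyperparameter values are illustrative only.
from garage import wrap_experiment

maml_trpo = wrap_experiment(maml_trpo)
maml_trpo(seed=1, epochs=300, rollouts_per_task=10, meta_batch_size=20)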
def run_task(v):
    env = normalize(PointEnv())

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v['step_size'],
        # plot=True,
    )
    algo.train()
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
def run_task(snapshot_config, *_):
    with LocalRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = ERWR(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)

        runner.setup(algo=algo, env=env)
        runner.train(n_epochs=100, batch_size=10000, plot=True)
def run_task(*_):
    env = TfEnv(normalize(gym.make('MountainCar-v0')))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.max_episode_steps,
        n_itr=150,
        discount=0.99,
        step_size=0.1,
        plot=True,
    )
    algo.train()
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(gym.make('MontezumaRevenge-ram-v0'))

        policy = GoExplorePolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = GoExplore(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=200,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=120, batch_size=4000)
def vpg_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow VPG model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TF_VPG(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      sampler=sampler,
                      discount=hyper_parameters['discount'],
                      center_adv=hyper_parameters['center_adv'],
                      optimizer_args=dict(
                          learning_rate=hyper_parameters['learning_rate']))

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['batch_size'])
def categorical_lstm_policy(ctxt, env_id, seed):
    """Create Categorical LSTM Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = CategoricalLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=488, batch_size=2048)
def setup_method(self): """Setup method which is called before every test.""" self.env = GarageEnv( normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = LinearFeatureBaseline(env_spec=self.env.spec) self.algo = MAMLPPO(env=self.env, policy=self.policy, value_function=self.value_function, max_path_length=100, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1)
def run_task(snapshot_config, *_): """Run the job.""" with LocalRunner(snapshot_config=snapshot_config) as runner: env = TfEnv(normalize(gym.make('InvertedPendulum-v2'))) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, max_kl_step=0.01) runner.setup( algo, env, sampler_cls=ISSampler, sampler_args=dict(n_backtrack=1)) runner.train(n_epochs=200, batch_size=4000)
def run_task(snapshot_config, *_): """Run task.""" with LocalTFRunner(snapshot_config=snapshot_config) as runner: env = TfEnv(env_name='CartPole-v1') # MiniGrid-DoorKey-16x16-v0 policy = CategoricalMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, max_kl_step=0.01) runner.setup(algo, env) runner.train(n_epochs=100, batch_size=4000)
def trpo_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow TRPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['batch_size'])
def test_set_plot(self):
    with LocalTFRunner(snapshot_config) as runner:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01))

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=100, plot=True)

        assert isinstance(runner._plotter, Plotter), (
            'self.plotter in LocalTFRunner should be set to Plotter.')
def test_make_sampler_ray_sampler(self, ray_session_fixture):
    del ray_session_fixture
    assert ray.is_initialized()
    with LocalTFRunner(snapshot_config) as runner:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01))

        runner.setup(algo, env, sampler_cls=RaySampler)
        assert isinstance(runner._sampler, RaySampler)
        runner.train(n_epochs=1, batch_size=10)
def run_task(snapshot_config, *_): """Run task.""" with LocalTFRunner(snapshot_config=snapshot_config) as runner: env = TfEnv(env_name='Pusher3DOF-v1') policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32), init_std=10) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, max_kl_step=0.01) runner.setup(algo, env) runner.train(n_epochs=200, batch_size=50*250)