def test_make_sampler_ray_sampler(self, ray_session_fixture): del ray_session_fixture assert ray.is_initialized() with TFTrainer(snapshot_config) as trainer: env = GymEnv('CartPole-v1') policy = CategoricalMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(8, 8)) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler( agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = VPG(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, optimizer_args=dict(learning_rate=0.01, )) trainer.setup(algo, env) assert isinstance(trainer._sampler, RaySampler) trainer.train(n_epochs=1, batch_size=10)
def maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task, meta_batch_size): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. epochs (int): Number of training epochs. rollouts_per_task (int): Number of rollouts per epoch per task for training. meta_batch_size (int): Number of tasks sampled per batch. """ set_seed(seed) ml1 = metaworld.ML1('push-v1') tasks = MetaWorldTaskSampler(ml1, 'train') env = tasks.sample(1)[0]() test_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=MetaWorldSetTaskEnv(ml1, 'test')) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(100, 100), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.tanh, output_nonlinearity=None) meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler, n_test_tasks=1, n_exploration_eps=rollouts_per_task) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, n_workers=meta_batch_size) trainer = Trainer(ctxt) algo = MAMLTRPO(env=env, policy=policy, sampler=sampler, task_sampler=tasks, value_function=value_function, meta_batch_size=meta_batch_size, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1, meta_evaluator=meta_evaluator) trainer.setup(algo, env) trainer.train(n_epochs=epochs, batch_size=rollouts_per_task * env.spec.max_episode_length)
def trpo_swimmer(ctxt=None, seed=1, batch_size=4000): """Train TRPO with Swimmer-v2 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. batch_size (int): Number of timesteps to use in each training step. """ set_seed(seed) with TFTrainer(ctxt) as trainer: env = GymEnv('Swimmer-v2') policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, max_kl_step=0.01) trainer.setup(algo, env) trainer.train(n_epochs=40, batch_size=batch_size)
def test_ray_batch_sampler(self): sampler1 = RaySampler(self.algo, self.env, seed=100, num_processors=1, sampler_worker_cls=SamplerWorker) sampler1.start_worker() sampler2 = OnPolicyVectorizedSampler(self.algo, self.env) sampler2.start_worker() trajs1 = sampler1.obtain_samples(0, 16) trajs2 = sampler2.obtain_samples(0, 1) assert (trajs1[0]['observations'].shape == np.array( trajs2[0]['observations']).shape == (6, 16)) traj2_action_shape = np.array(trajs2[0]['actions']).shape assert (trajs1[0]['actions'].shape == traj2_action_shape == (6, 4)) assert (sum(trajs1[0]['rewards']) == sum(trajs2[0]['rewards']) == 1) true_obs = np.array( [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]) true_actions = np.array([[0., 0., 1., 0.], [0., 0., 1., 0.], [0., 1., 0., 0.], [0., 1., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.]]) true_rewards = np.array([0, 0, 0, 0, 0, 1]) for trajectory in trajs1: assert (np.array_equal(trajectory['observations'], true_obs)) assert (np.array_equal(trajectory['actions'], true_actions)) assert (np.array_equal(trajectory['rewards'], true_rewards)) sampler1.shutdown_worker() sampler2.shutdown_worker()
def test_ray_batch_sampler(self): workers = WorkerFactory(seed=100, max_path_length=self.algo.max_path_length) sampler1 = RaySampler(workers, self.policy, self.env, num_processors=1) sampler1.start_worker() sampler2 = OnPolicyVectorizedSampler(self.algo, self.env) sampler2.start_worker() trajs1 = sampler1.obtain_samples( 0, 1000, tuple(self.algo.policy.get_param_values())) trajs2 = sampler2.obtain_samples(0, 1000) # pylint: disable=superfluous-parens assert (trajs1[0]['observations'].shape == np.array( trajs2[0]['observations']).shape == (6, )) traj2_action_shape = np.array(trajs2[0]['actions']).shape assert trajs1[0]['actions'].shape == traj2_action_shape == (6, ) assert sum(trajs1[0]['rewards']) == sum(trajs2[0]['rewards']) == 1 true_obs = np.array([0, 1, 2, 6, 10, 14]) true_actions = np.array([2, 2, 1, 1, 1, 2]) true_rewards = np.array([0, 0, 0, 0, 0, 1]) for trajectory in trajs1: assert np.array_equal(trajectory['observations'], true_obs) assert np.array_equal(trajectory['actions'], true_actions) assert np.array_equal(trajectory['rewards'], true_rewards) sampler1.shutdown_worker() sampler2.shutdown_worker()
def reps_gym_cartpole(ctxt=None, seed=1): """Train REPS with CartPole-v0 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with TFTrainer(snapshot_config=ctxt) as trainer: env = GymEnv('CartPole-v0') policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=[32, 32]) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = REPS(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99) trainer.setup(algo, env) trainer.train(n_epochs=100, batch_size=4000, plot=False)
def vpg_cartpole(ctxt=None, seed=1): """Train VPG with CartPole-v1 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with TFTrainer(snapshot_config=ctxt) as trainer: env = GymEnv('CartPole-v1') policy = CategoricalMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = VPG(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, optimizer_args=dict(learning_rate=0.01, )) trainer.setup(algo, env) trainer.train(n_epochs=100, batch_size=10000)
def test_ray_batch_sampler(self): workers = WorkerFactory(seed=100, max_path_length=self.algo.max_path_length) sampler1 = RaySampler(workers, self.policy, self.env) sampler1.start_worker() sampler2 = OnPolicyVectorizedSampler(self.algo, self.env) sampler2.start_worker() trajs1 = sampler1.obtain_samples( 0, 1000, tuple(self.algo.policy.get_param_values())) trajs2 = sampler2.obtain_samples(0, 1000) # pylint: disable=superfluous-parens assert trajs1.observations.shape[0] >= 1000 assert trajs1.actions.shape[0] >= 1000 assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum( trajs2[0]['rewards']) == 1) true_obs = np.array([0, 1, 2, 6, 10, 14]) true_actions = np.array([2, 2, 1, 1, 1, 2]) true_rewards = np.array([0, 0, 0, 0, 0, 1]) start = 0 for length in trajs1.lengths: observations = trajs1.observations[start:start + length] actions = trajs1.actions[start:start + length] rewards = trajs1.rewards[start:start + length] assert np.array_equal(observations, true_obs) assert np.array_equal(actions, true_actions) assert np.array_equal(rewards, true_rewards) start += length sampler1.shutdown_worker() sampler2.shutdown_worker()
def bc_point(ctxt=None, loss='log_prob'): """Run Behavioral Cloning on garage.envs.PointEnv. Args: ctxt (ExperimentContext): Provided by wrap_experiment. loss (str): Either 'log_prob' or 'mse' """ trainer = Trainer(ctxt) goal = np.array([1., 1.]) env = PointEnv(goal=goal, max_episode_length=200) expert = OptimalPolicy(env.spec, goal=goal) policy = GaussianMLPPolicy(env.spec, [8, 8]) batch_size = 1000 sampler = RaySampler(agents=expert, envs=env, max_episode_length=env.spec.max_episode_length) algo = BC(env.spec, policy, batch_size=batch_size, source=expert, sampler=sampler, policy_lr=1e-2, loss=loss) trainer.setup(algo, env) trainer.train(100, batch_size=batch_size)
def test_ray_batch_sampler(self, ray_local_session_fixture): del ray_local_session_fixture assert ray.is_initialized() workers = WorkerFactory( seed=100, max_episode_length=self.algo.max_episode_length) sampler1 = RaySampler(workers, self.policy, self.env) sampler1.start_worker() sampler1.shutdown_worker()
def gaussian_lstm_policy(ctxt, env_id, seed): """Create Gaussian LSTM Policy on TF-PPO. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = GaussianLSTMPolicy( env_spec=env.spec, hidden_dim=32, hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = GaussianMLPBaseline( env_spec=env.spec, hidden_sizes=(64, 64), use_trust_region=False, optimizer=FirstOrderOptimizer, optimizer_args=dict( batch_size=32, max_optimization_epochs=10, learning_rate=1e-3, ), ) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = PPO( env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_optimization_epochs=10, learning_rate=1e-3, ), ) trainer.setup(algo, env) trainer.train(n_epochs=5, batch_size=2048)
def ppo_pendulum(ctxt=None, seed=1): """Train PPO with InvertedDoublePendulum-v2 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with TFTrainer(snapshot_config=ctxt) as trainer: env = normalize(GymEnv('InvertedDoublePendulum-v2')) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = GaussianMLPBaseline( env_spec=env.spec, hidden_sizes=(32, 32), use_trust_region=True, ) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) # NOTE: make sure when setting entropy_method to 'max', set # center_adv to False and turn off policy gradient. See # tf.algos.NPO for detailed documentation. algo = PPO( env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, optimizer_args=dict( batch_size=32, max_optimization_epochs=10, ), stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.02, center_adv=False, ) trainer.setup(algo, env) trainer.train(n_epochs=120, batch_size=2048, plot=False)
def mtppo_metaworld_mt50(ctxt, seed, epochs, batch_size, n_workers, n_tasks): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. epochs (int): Number of training epochs. batch_size (int): Number of environment steps in one batch. n_workers (int): The number of workers the sampler should use. n_tasks (int): Number of tasks to use. Should be a multiple of 50. """ set_seed(seed) mt10 = metaworld.MT10() train_task_sampler = MetaWorldTaskSampler(mt10, 'train', lambda env, _: normalize(env), add_env_onehot=True) assert n_tasks % 50 == 0 assert n_tasks <= 2500 envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)] env = MultiEnvWrapper(envs, sample_strategy=round_robin_strategy, mode='vanilla') policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, n_workers=n_workers) algo = PPO(env_spec=env.spec, policy=policy, value_function=value_function, sampler=sampler, discount=0.99, gae_lambda=0.95, center_adv=True, lr_clip_range=0.2) trainer = Trainer(ctxt) trainer.setup(algo, env) trainer.train(n_epochs=epochs, batch_size=batch_size)
def ppo_garage_tf(ctxt, env_id, seed): """Create garage TensorFlow PPO model and training. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = TF_GMP( env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = TF_GMB( env_spec=env.spec, hidden_sizes=(32, 32), use_trust_region=False, optimizer=FirstOrderOptimizer, optimizer_args=dict( batch_size=32, max_optimization_epochs=10, learning_rate=3e-4, ), ) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TF_PPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, gae_lambda=0.95, center_adv=True, lr_clip_range=0.2, optimizer_args=dict(batch_size=32, max_optimization_epochs=10, learning_rate=3e-4, verbose=True)) trainer.setup(algo, env) trainer.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['batch_size'])
def ppo_garage_pytorch(ctxt, env_id, seed): """Create garage PyTorch PPO model and training. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) trainer = Trainer(ctxt) env = normalize(GymEnv(env_id)) policy = PyTorch_GMP(env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), policy, max_optimization_epochs=10, minibatch_size=64) vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, minibatch_size=64) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length) algo = PyTorch_PPO(env_spec=env.spec, policy=policy, value_function=value_function, sampler=sampler, policy_optimizer=policy_optimizer, vf_optimizer=vf_optimizer, discount=0.99, gae_lambda=0.95, center_adv=True, lr_clip_range=0.2) trainer.setup(algo, env) trainer.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['batch_size'])
def gaussian_cnn_baseline(ctxt, env_id, seed): """Create Gaussian CNN Baseline on TF-PPO. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = CategoricalCNNPolicy(env_spec=env.spec, filters=params['conv_filters'], strides=params['conv_strides'], padding=params['conv_pad'], hidden_sizes=params['hidden_sizes']) baseline = GaussianCNNBaseline( env_spec=env.spec, filters=params['conv_filters'], strides=params['conv_strides'], padding=params['conv_pad'], hidden_sizes=params['hidden_sizes'], use_trust_region=params['use_trust_region']) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = PPO( env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_optimization_epochs=10, learning_rate=1e-3, ), ) trainer.setup(algo, env) trainer.train(n_epochs=params['n_epochs'], batch_size=params['batch_size'])
def test_init_without_worker_factory(ray_local_session_fixture): del ray_local_session_fixture assert ray.is_initialized() max_episode_length = 16 env = PointEnv() policy = FixedPolicy(env.spec, scripted_actions=[ env.action_space.sample() for _ in range(max_episode_length) ]) sampler = RaySampler(agents=policy, envs=env, seed=100, max_episode_length=max_episode_length) worker_factory = WorkerFactory(seed=100, max_episode_length=max_episode_length) assert sampler._worker_factory._seed == worker_factory._seed assert (sampler._worker_factory._max_episode_length == worker_factory._max_episode_length) with pytest.raises(TypeError, match='Must construct a sampler from'): RaySampler(agents=policy, envs=env)
def ppo_car(ctxt=None, specs=None): mem_history = [] assert specs is not None set_seed(1) tf.keras.backend.clear_session() with TFTrainer(snapshot_config=ctxt) as trainer: #env = normalize(GymEnv("LunarLanderContinuous-v2")) env = normalize(CarEnv(specs), normalize_obs=True) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = GaussianMLPBaseline( env_spec=env.spec, hidden_sizes=(32, 32), use_trust_region=True, ) sampler = RaySampler(agents=policy, envs=env, max_episode_length=500, is_tf_worker=True) # NOTE: make sure when setting entropy_method to 'max', set # center_adv to False and turn off policy gradient. See # tf.algos.NPO for detailed documentation. algo = PPO( env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, gae_lambda=0.95, lr_clip_range=0.07, optimizer_args=dict( batch_size=128, max_optimization_epochs=10, ), stop_entropy_gradient=True, entropy_method='max', policy_ent_coeff=0.02, center_adv=False, ) trainer.setup(algo, env) trainer.train(n_epochs=300, batch_size=2048, plot=False) trainer.save()
def continuous_mlp_baseline(ctxt, env_id, seed): """Create Continuous MLP Baseline on TF-PPO. Args: ctxt (ExperimentContext): The experiment configuration used by :class:`~Trainer` to create the :class:`~Snapshotter`. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = GaussianLSTMPolicy( env_spec=env.spec, hidden_dim=hyper_params['policy_hidden_sizes'], hidden_nonlinearity=hyper_params['hidden_nonlinearity'], ) baseline = ContinuousMLPBaseline( env_spec=env.spec, hidden_sizes=(64, 64), ) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = PPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=hyper_params['discount'], gae_lambda=hyper_params['gae_lambda'], lr_clip_range=hyper_params['lr_clip_range'], entropy_method=hyper_params['entropy_method'], policy_ent_coeff=hyper_params['policy_ent_coeff'], optimizer_args=dict( batch_size=32, max_optimization_epochs=10, learning_rate=1e-3, ), center_adv=hyper_params['center_adv'], stop_entropy_gradient=True) trainer.setup(algo, env) trainer.train(n_epochs=hyper_params['n_epochs'], batch_size=hyper_params['n_exploration_steps'])
def mttrpo_metaworld_mt1_push(ctxt, seed, epochs, batch_size): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. epochs (int): Number of training epochs. batch_size (int): Number of environment steps in one batch. """ set_seed(seed) n_tasks = 50 mt1 = metaworld.MT1('push-v1') train_task_sampler = MetaWorldTaskSampler(mt1, 'train', lambda env, _: normalize(env)) envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)] env = MultiEnvWrapper(envs, sample_strategy=round_robin_strategy, mode='vanilla') policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length) algo = TRPO(env_spec=env.spec, policy=policy, value_function=value_function, sampler=sampler, discount=0.99, gae_lambda=0.95) trainer = Trainer(ctxt) trainer.setup(algo, env) trainer.train(n_epochs=epochs, batch_size=batch_size)
def trpo_cubecrash(ctxt=None, seed=1, max_episode_length=5, batch_size=4000): """Train TRPO with CubeCrash-v0 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. max_episode_length (int): Maximum length of a single episode. batch_size (int): Number of timesteps to use in each training step. """ set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize( GymEnv('CubeCrash-v0', max_episode_length=max_episode_length)) policy = CategoricalCNNPolicy(env_spec=env.spec, filters=((32, (8, 8)), (64, (4, 4))), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32)) baseline = GaussianCNNBaseline(env_spec=env.spec, filters=((32, (8, 8)), (64, (4, 4))), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32), use_trust_region=True) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0) trainer.setup(algo, env) trainer.train(n_epochs=100, batch_size=batch_size)
def categorical_mlp_policy(ctxt, env_id, seed): """Create Categorical MLP Policy on TF-PPO. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = CategoricalMLPPolicy( env_spec=env.spec, hidden_nonlinearity=tf.nn.tanh, ) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = PPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_optimization_epochs=10, learning_rate=1e-3, ), name='CategoricalMLPPolicyBenchmark') trainer.setup(algo, env) trainer.train(n_epochs=5, batch_size=2048)
def multi_env_ppo(ctxt=None, seed=1): """Train PPO on two Atari environments simultaneously. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with TFTrainer(ctxt) as trainer: env1 = normalize(GymEnv('Adventure-ram-v4')) env2 = normalize(GymEnv('Alien-ram-v4')) env = MultiEnvWrapper([env1, env2]) policy = CategoricalMLPPolicy( env_spec=env.spec, hidden_nonlinearity=tf.nn.tanh, ) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = PPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_optimization_epochs=10, learning_rate=1e-3, )) trainer.setup(algo, env) trainer.train(n_epochs=120, batch_size=2048, plot=False)
def vpg_garage_tf(ctxt, env_id, seed): """Create garage TensorFlow VPG model and training. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = TF_GMP( env_spec=env.spec, hidden_sizes=hyper_parameters['hidden_sizes'], hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TF_VPG(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=hyper_parameters['discount'], center_adv=hyper_parameters['center_adv'], optimizer_args=dict( learning_rate=hyper_parameters['learning_rate'], )) trainer.setup(algo, env) trainer.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['batch_size'])
def trpo_garage_tf(ctxt, env_id, seed): """Create garage Tensorflow TROI model and training. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with TFTrainer(ctxt) as trainer: env = normalize(GymEnv(env_id)) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=hyper_parameters['hidden_sizes'], hidden_nonlinearity=tf.nn.tanh, output_nonlinearity=None, ) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=hyper_parameters['discount'], gae_lambda=hyper_parameters['gae_lambda'], max_kl_step=hyper_parameters['max_kl']) trainer.setup(algo, env) trainer.train(n_epochs=hyper_parameters['n_epochs'], batch_size=hyper_parameters['batch_size'])
def trpo_swimmer_ray_sampler(ctxt=None, seed=1): """tf_trpo_swimmer. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ # Since this is an example, we are running ray in a reduced state. # One can comment this line out in order to run ray at full capacity ray.init(_memory=52428800, object_store_memory=78643200, ignore_reinit_error=True, log_to_driver=False, include_dashboard=False) with TFTrainer(snapshot_config=ctxt) as trainer: set_seed(seed) env = GymEnv('Swimmer-v2') policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, max_kl_step=0.01) trainer.setup(algo, env) trainer.train(n_epochs=40, batch_size=4000)
def ppo_pendulum(ctxt=None, seed=1): """Train PPO with InvertedDoublePendulum-v2 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) env = GymEnv('InvertedDoublePendulum-v2') trainer = Trainer(ctxt) policy = GaussianMLPPolicy(env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=torch.tanh, output_nonlinearity=None) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length) algo = PPO(env_spec=env.spec, policy=policy, value_function=value_function, sampler=sampler, discount=0.99, center_adv=False) trainer.setup(algo, env) trainer.train(n_epochs=100, batch_size=10000)
def trpo_cartpole_recurrent(ctxt, seed, n_epochs, batch_size, plot): """Train TRPO with a recurrent policy on CartPole. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. n_epochs (int): Number of epochs for training. seed (int): Used to seed the random number generator to produce determinism. batch_size (int): Batch size used for training. plot (bool): Whether to plot or not. """ set_seed(seed) with TFTrainer(snapshot_config=ctxt) as trainer: env = GymEnv('CartPole-v1', max_episode_length=100) policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, max_kl_step=0.01, optimizer=ConjugateGradientOptimizer, optimizer_args=dict(hvp_approach=FiniteDifferenceHVP( base_eps=1e-5))) trainer.setup(algo, env) trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=plot)
def trpo_cartpole_bullet(ctxt=None, seed=1): """Train TRPO with Pybullet's CartPoleBulletEnv environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ set_seed(seed) with TFTrainer(ctxt) as trainer: env = BulletEnv( gym.make('CartPoleBulletEnv-v1', renders=False, discrete_actions=True)) policy = CategoricalMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) sampler = RaySampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, is_tf_worker=True) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, sampler=sampler, discount=0.99, max_kl_step=0.01) trainer.setup(algo, env) trainer.train(n_epochs=100, batch_size=4000)
def test_ray_batch_sampler(ray_local_session_fixture): del ray_local_session_fixture env = TfEnv(GridWorldEnv(desc='4x4')) policy = ScriptedPolicy( scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1]) algo = Mock(env_spec=env.spec, policy=policy, max_path_length=16) assert ray.is_initialized() workers = WorkerFactory(seed=100, max_path_length=algo.max_path_length) sampler1 = RaySampler(workers, policy, env) sampler1.start_worker() sampler2 = OnPolicyVectorizedSampler(algo, env) sampler2.start_worker() trajs1 = sampler1.obtain_samples(0, 1000, tuple(algo.policy.get_param_values())) trajs2 = sampler2.obtain_samples(0, 1000) # pylint: disable=superfluous-parens assert trajs1.observations.shape[0] >= 1000 assert trajs1.actions.shape[0] >= 1000 assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum( trajs2[0]['rewards']) == 1) true_obs = np.array([0, 1, 2, 6, 10, 14]) true_actions = np.array([2, 2, 1, 1, 1, 2]) true_rewards = np.array([0, 0, 0, 0, 0, 1]) start = 0 for length in trajs1.lengths: observations = trajs1.observations[start:start + length] actions = trajs1.actions[start:start + length] rewards = trajs1.rewards[start:start + length] assert np.array_equal(observations, true_obs) assert np.array_equal(actions, true_actions) assert np.array_equal(rewards, true_rewards) start += length sampler1.shutdown_worker() sampler2.shutdown_worker() env.close()