def run_task(*_):
    """Wrap PPO training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=488,
        discount=0.99,
        step_size=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False)
    algo.train()
def test_ppo_pendulum_with_model(self):
    """Test PPO with model, with Pendulum environment."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicyWithModel(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        lr_clip_range=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 40
    env.close()
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    snapshot = joblib.load(latent_policy_pkl)
    latent_policy = snapshot["policy"]

    inner_env = SimpleReacherEnv(
        goal_position=(0.65, 0.3, 0.3),
        control_method="position_control",
        completion_bonus=30)
    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        init_std=20,
        # std_share_network=False,
        # adaptive_std=True
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=100,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
    )
    algo.train(sess=sess)
def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    logger._tensorboard = TensorBoardOutput()
    env = TfEnv(normalize(gym.make("Pendulum-v0")))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > -1000
def run_task(v): v = SimpleNamespace(**v) # Environment env = SimpleReacherEnv(goal_position=GOALS[0], control_method="position_control", completion_bonus=5) env = TfEnv(env) # Policy policy = GaussianMLPPolicy( name="policy", env_spec=env.spec, hidden_sizes=(64, 32), init_std=v.policy_init_std, ) baseline = GaussianMLPBaseline(env_spec=env.spec) algo = PPO( env=env, policy=policy, baseline=baseline, batch_size=v.batch_size, # 4096 max_path_length=v.max_path_length, n_itr=1000, discount=0.99, step_size=0.2, optimizer_args=dict(batch_size=32, max_epochs=10), plot=True, ) algo.train()
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    inner_env = PointEnv(goal=(1.4, 1.4), completion_bonus=100)
    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        init_std=20,
        std_share_network=False,
        adaptive_std=True)

    baseline = GaussianMLPBaseline(
        env_spec=env.spec, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=50,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
        use_mpc_es=True,
    )
    algo.train(sess=sess)
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimpleReacherEnv(
        goal_position=GOALS[0],
        control_method="position_control",
        # control_cost_coeff=1.0,
        action_scale=0.04,
        randomize_start_jpos=True,
        completion_bonus=0.1,
        # terminate_on_collision=True,
        collision_penalty=0.0,
    )
    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="Policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        std_share_network=True,
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(64, 64)),
    )
    # baseline = CollisionAwareBaseline(
    #     env_spec=env.spec,
    #     regressor_args=dict(hidden_sizes=(64, 64)),
    # )

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=10000,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=0.,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
def run_task(*_):
    with LocalRunner() as runner:
        env = PointEnv(goal=(3, 3), random_start=True)
        env = TfEnv(env)

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            init_std=20,
            std_share_network=False,
            adaptive_std=True)

        baseline = GaussianMLPBaseline(
            env_spec=env.spec, include_action_to_input=False)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,  # 4096
            max_path_length=50,
            n_itr=1500,
            discount=0.99,
            step_size=0.2,
            policy_ent_coeff=1e-6,
            use_mpc_es=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1500, batch_size=1024, plot=True)
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('MemorizeDigits-v0')))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64, 64),
                                      conv_filter_sizes=(5, 3, 2),
                                      conv_strides=(4, 2, 1),
                                      conv_pad='VALID',
                                      hidden_sizes=(256, ))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64, 64),
                                           filter_dims=(5, 3, 2),
                                           strides=(4, 2, 1),
                                           padding='VALID',
                                           hidden_sizes=(256, ),
                                           use_trust_region=True))

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   max_kl_step=0.01,
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=2048)
def test_ppo_pendulum_recurrent(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalRunner() as runner:
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 40
        env.close()
def test_ppo_pendulum_gru(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with TFTrainer(snapshot_config) as trainer:
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        gru_policy = GaussianGRUPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )
        sampler = LocalSampler(
            agents=gru_policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)
        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
def run_task(v):
    v = SimpleNamespace(**v)

    with LocalRunner() as runner:
        # Environment
        env = SimpleReacherEnv(
            goal_position=GOALS[0],
            control_method="position_control",
            completion_bonus=5)
        env = TfEnv(env)

        # Policy
        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            init_std=v.policy_init_std,
        )

        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=v.max_path_length,
            discount=0.99,
            lr_clip_range=0.2,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=v.batch_size, plot=False)
def test_ppo_pendulum_flatten_input(self):
    """Test PPO with CartPole to test observation flattening."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(
            normalize(ReshapeObservation(gym.make('CartPole-v1'), (2, 2))))
        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       tf_optimizer_args=dict(learning_rate=1e-3),
                   ))
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
def test_ppo_pendulum_with_model(self):
    """Test PPO with model, with Pendulum environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaselineWithModel(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 30
        env.close()
def test_rl2_sampler_less_envs_than_meta_batch(self):
    """Test RL2Sampler when it runs fewer envs than the meta batch size."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                   hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=self.env.spec)
        algo = PPO(env_spec=self.env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=self.max_path_length,
                   discount=0.99)
        runner.setup(algo,
                     env=self.env,
                     sampler_cls=RL2Sampler,
                     sampler_args=dict(
                         meta_batch_size=self.meta_batch_size,
                         n_envs=self.meta_batch_size // 2))
        runner._start_worker()
        assert isinstance(runner._sampler, RL2Sampler)
        assert runner._sampler._envs_per_worker == 1
        all_indices = np.arange(self.meta_batch_size)
        for i in range(self.meta_batch_size // 2):
            assert all(runner._sampler._vec_envs_indices[i] ==
                       all_indices[i * 2:i * 2 + 2])
        paths = runner._sampler.obtain_samples(0)
        assert len(paths) == self.meta_batch_size
        assert len(paths[0]['observations']) == self.max_path_length
        paths = runner._sampler.obtain_samples(
            0, self.meta_batch_size * 10 * self.max_path_length)
        assert len(paths) == self.meta_batch_size * 10
        assert len(paths[0]['observations']) == self.max_path_length
def test_ppo_pendulum_gru(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        gru_policy = GaussianGRUPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
def ppo_cmb(env, seed, log_dir):
    """Create test continuous mlp baseline on ppo.

    Args:
        env (gym_env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: training results in csv format.

    """
    deterministic.set_seed(seed)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=num_proc,
                                      inter_op_parallelism_threads=num_proc)
    sess = tf.compat.v1.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess,
                       max_cpus=num_proc) as runner:
        env = TfEnv(normalize(env))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=policy_params['policy_hidden_sizes'],
            hidden_nonlinearity=policy_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=baseline_params['regressor_args'],
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=algo_params['max_path_length'],
                   discount=algo_params['discount'],
                   gae_lambda=algo_params['gae_lambda'],
                   lr_clip_range=algo_params['lr_clip_range'],
                   entropy_method=algo_params['entropy_method'],
                   policy_ent_coeff=algo_params['policy_ent_coeff'],
                   optimizer_args=algo_params['optimizer_args'],
                   center_adv=algo_params['center_adv'],
                   stop_entropy_gradient=True)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env,
                     sampler_args=dict(n_envs=algo_params['n_envs']))
        runner.train(n_epochs=algo_params['n_epochs'],
                     batch_size=algo_params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
def test_ppo_pendulum_gru_with_model(self):
    """Test PPO with model, with Pendulum environment."""
    with LocalTFRunner(sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianGRUPolicyWithModel(env_spec=env.spec)
        baseline = GaussianMLPBaselineWithModel(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
        env.close()
def test_ppo_pendulum_recurrent_continuous_baseline(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = normalize(GymEnv('InvertedDoublePendulum-v2'))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_episode_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 100
        env.close()
def gaussian_lstm_policy(ctxt, env_id, seed):
    """Create Gaussian LSTM Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=5, batch_size=2048)
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000,
                        max_episode_length=100):
    """Train PPO on MemorizeDigits-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.
        max_episode_length (int): Max number of timesteps in an episode.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(
            GymEnv('MemorizeDigits-v0',
                   is_image=True,
                   max_episode_length=max_episode_length))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=(
                                          (32, (5, 5)),
                                          (64, (3, 3)),
                                          (64, (2, 2)),
                                      ),
                                      strides=(4, 2, 1),
                                      padding='VALID',
                                      hidden_sizes=(256, ))  # yapf: disable

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=(
                (32, (5, 5)),
                (64, (3, 3)),
                (64, (2, 2)),
            ),
            strides=(4, 2, 1),
            padding='VALID',
            hidden_sizes=(256, ),
            use_trust_region=True)  # yapf: disable

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=1000, batch_size=batch_size)
def ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = normalize(GymEnv('InvertedDoublePendulum-v2'))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=True,
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=120, batch_size=2048, plot=False)
def gaussian_lstm_policy(ctxt, env_id, seed):
    """Create Gaussian LSTM Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace the ppo with the algorithm you want to run.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=12,
                                      inter_op_parallelism_threads=12)
    sess = tf.compat.v1.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env, sampler_args=dict(n_envs=12))

        runner.train(n_epochs=488, batch_size=2048)

        dowel_logger.remove_all()

        return tabular_log_file
def categorical_cnn_policy(ctxt, env_id, seed):
    """Create Categorical CNN Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=hyper_params['conv_filters'],
            conv_filter_sizes=hyper_params['conv_filter_sizes'],
            conv_strides=hyper_params['conv_strides'],
            conv_pad=hyper_params['conv_pad'],
            hidden_sizes=hyper_params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                num_filters=hyper_params['conv_filters'],
                filter_dims=hyper_params['conv_filter_sizes'],
                strides=hyper_params['conv_strides'],
                padding=hyper_params['conv_pad'],
                hidden_sizes=hyper_params['hidden_sizes'],
                use_trust_region=hyper_params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
            flatten_input=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['batch_size'])
def gaussian_cnn_baseline(ctxt, env_id, seed):
    """Create Gaussian CNN Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=params['conv_filters'],
                                      strides=params['conv_strides'],
                                      padding=params['conv_pad'],
                                      hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=params['conv_filters'],
            strides=params['conv_strides'],
            padding=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'],
            use_trust_region=params['use_trust_region'])

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=params['n_epochs'],
                      batch_size=params['batch_size'])
def tf_ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with the Reacher3DOF-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(normalize(gym.make('Reacher3DOF-v1')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(32, 32),
                use_trust_region=True,
            ),
        )

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=60, batch_size=2048, plot=False)
def ppo_car(ctxt=None, specs=None):
    """Train PPO on a CarEnv built from the given specs."""
    mem_history = []
    assert specs is not None
    set_seed(1)
    tf.keras.backend.clear_session()
    with TFTrainer(snapshot_config=ctxt) as trainer:
        # env = normalize(GymEnv("LunarLanderContinuous-v2"))
        env = normalize(CarEnv(specs), normalize_obs=True)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=True,
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=500,
                             is_tf_worker=True)

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.07,
            optimizer_args=dict(
                batch_size=128,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=300, batch_size=2048, plot=False)
        trainer.save()
def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = PPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   max_path_length=100,
                   discount=0.99,
                   lr_clip_range=0.01,
                   optimizer_args=dict(batch_size=32, max_epochs=10))
        runner.setup(algo, self.env, sampler_cls=LocalSampler)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 35
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Create Continuous MLP Baseline on TF-PPO.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ),
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_params['n_epochs'],
                      batch_size=hyper_params['n_exploration_steps'])