def test_trpo_pendulum(self):
    """Test TRPO with Pendulum environment."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    algo = TRPO(env_spec=self.env.spec,
                policy=self.policy,
                value_function=self.value_function,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.98)
    runner.setup(algo, self.env)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 0
def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace DDPG with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the tabular log file (progress.csv).
    """
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))

        # Set up params for DDPG.
        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment.
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        tensorboard_log_dir = osp.join(log_dir)
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

        runner.setup(ddpg, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
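# A minimal sketch of how run_garage above might be driven from a benchmark
# script. The environment id, seed list, and log-directory layout here are
# illustrative assumptions, not part of the original benchmark code.
def run_ddpg_benchmark_example():
    """Run run_garage over a few seeds (illustrative only)."""
    env = gym.make('InvertedDoublePendulum-v2')
    for seed in (1, 2, 3):
        log_dir = osp.join('data', 'local', 'ddpg_benchmark', str(seed))
        tabular_log_file = run_garage(env, seed, log_dir)
        print('garage results written to', tabular_log_file)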
def setup_method(self):
    """Setup method which is called before every test."""
    self._env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)
    self._runner = LocalRunner(snapshot_config)
    self._policy = GaussianMLPPolicy(env_spec=self._env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=torch.tanh,
                                     output_nonlinearity=None)
    self._params = {
        'env_spec': self._env.spec,
        'policy': self._policy,
        'value_function': GaussianMLPValueFunction(env_spec=self._env.spec),
        'discount': 0.99,
    }
def test_resume(self):
    """Test resuming training from a saved snapshot."""
    with LocalRunner(self.snapshot_config, self.sess) as runner:
        args = runner.restore(self.temp_dir.name)
        assert np.equal(runner.policy.get_param_values(),
                        self.policy_params).all(), (
                            'Policy parameters should persist')
        assert args.n_epochs == 5, (
            'Snapshot should save training parameters')
        assert args.start_epoch == 5, (
            'Last experiment should end at the 5th iteration')

        batch_size = runner.train_args.batch_size
        n_epoch_cycles = runner.train_args.n_epoch_cycles

        runner.resume(n_epochs=10,
                      plot=False,
                      store_paths=True,
                      pause_for_plot=False)

        assert runner.train_args.n_epochs == 10
        assert runner.train_args.batch_size == batch_size
        assert runner.train_args.n_epoch_cycles == n_epoch_cycles
        assert not runner.train_args.plot
        assert runner.train_args.store_paths
        assert not runner.train_args.pause_for_plot
def test_ppo_pendulum_recurrent(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalRunner() as runner:
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 40

        env.close()
def test_cem_cartpole(self):
    """Test CEM with CartPole-v1 environment."""
    with LocalRunner() as runner:
        env = TfEnv(env_name="CartPole-v1")

        policy = CategoricalMLPPolicy(name="policy",
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        runner.initialize_tf_vars()

        n_samples = 10

        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.1,
                   max_path_length=100,
                   n_samples=n_samples)

        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        rtn = runner.train(n_epochs=5,
                           batch_size=2000,
                           n_epoch_cycles=n_samples)
        assert rtn > 40

        env.close()
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(
            normalize(
                OneHotMultiTaskEnv(task_env_cls=PointEnv,
                                   task_args=TASK_ARGS,
                                   task_kwargs=TASK_KWARGS)))

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
def run_task(v):
    v = SimpleNamespace(**v)

    with LocalRunner() as runner:
        # Environment
        env = SimpleReacherEnv(goal_position=GOALS[0],
                               control_method="position_control",
                               completion_bonus=5)
        env = TfEnv(env)

        # Policy
        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            init_std=v.policy_init_std,
        )

        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=v.max_path_length,
            discount=0.99,
            lr_clip_range=0.2,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=v.batch_size, plot=False)
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    with LocalRunner(sess=sess) as runner:
        inner_env = PointEnv(goal=(1.4, 1.4), completion_bonus=100)
        env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

        policy = GaussianMLPPolicy(name="composer",
                                   env_spec=env.spec,
                                   hidden_sizes=(64, 64),
                                   init_std=20,
                                   std_share_network=False,
                                   adaptive_std=True)

        # The baseline expects the environment spec, not the environment.
        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,  # 4096
            max_path_length=50,
            n_itr=1500,
            discount=0.99,
            step_size=0.2,
            policy_ent_coeff=1e-6,
            plot=True,
            use_mpc_es=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=600, plot=False, batch_size=1024)
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(normalize(PointEnv(goal=(-1, 0))))

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        batch_size = 4000
        max_path_length = 100
        # Run one environment per path so a full batch fits in one cycle.
        n_envs = batch_size // max_path_length

        runner.setup(algo, env, sampler_args={'n_envs': n_envs})
        runner.train(n_epochs=100, batch_size=batch_size, plot=False)
def run_task(*_): """Run the job.""" with LocalRunner() as runner: env = TfEnv(normalize(gym.make('InvertedPendulum-v2'))) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) optimizer_args = dict( # debug_nan=True, # reg_coeff=0.1, # cg_iters=2 ) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, max_kl_step=0.01, optimizer_args=optimizer_args) runner.setup(algo, env, sampler_cls=ISSampler, sampler_args=dict(n_backtrack=1)) runner.train(n_epochs=200, batch_size=4000)
def test_ddpg_pendulum(self):
    """Test DDPG with Pendulum environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
        )

        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10,
                                    n_epoch_cycles=20,
                                    batch_size=100)
        assert last_avg_ret > 60

        env.close()
def test_batch_sampler(self):
    """Test BatchSampler with VPG on CartPole-v1."""
    max_cpus = 8
    with LocalRunner(max_cpus=max_cpus) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name="policy",
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env=env,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=1,
                   whole_paths=True,
                   discount=0.99)

        runner.setup(algo,
                     env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': max_cpus})

        try:
            runner.initialize_tf_vars()
        except BaseException:
            raise self.failureException(
                "LocalRunner should be able to initialize tf variables.")

        runner.start_worker()

        paths = runner.sampler.obtain_samples(0, 8)
        self.assertGreaterEqual(
            len(paths), max_cpus,
            "BatchSampler should sample at least "
            "max_cpus=%d trajectories" % max_cpus)
def run_task(*_):
    with LocalRunner() as runner:
        env = PointEnv(goal=(3, 3), random_start=True)
        env = TfEnv(env)

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(64, 64),
                                   init_std=20,
                                   std_share_network=False,
                                   adaptive_std=True)

        # The baseline expects the environment spec, not the environment.
        baseline = GaussianMLPBaseline(env_spec=env.spec,
                                       include_action_to_input=False)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,  # 4096
            max_path_length=50,
            n_itr=1500,
            discount=0.99,
            step_size=0.2,
            policy_ent_coeff=1e-6,
            use_mpc_es=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1500, batch_size=1024, plot=True)
def test_ppo_pendulum_with_model(self):
    """Test PPO with model, with Pendulum environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaselineWithModel(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 30

        env.close()
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalLSTMPolicy(
            name='policy',
            env_spec=env.spec,
            lstm_layer_cls=L.TfBasicLSTMLayer,
            # gru_layer_cls=L.GRULayer,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer=ConjugateGradientOptimizer,
                    optimizer_args=dict(
                        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
def test_cma_es_cartpole(self):
    """Test CMA-ES with CartPole-v1 environment."""
    with LocalRunner() as runner:
        env = TfEnv(env_name="CartPole-v1")

        policy = CategoricalMLPPolicy(name="policy",
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        runner.initialize_tf_vars()

        n_samples = 20

        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     max_path_length=100,
                     n_samples=n_samples)

        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=1, batch_size=1000, n_epoch_cycles=n_samples)
        # No assertion on return because CMA-ES is not stable.

        env.close()
def setup_method(self): """Setup method which is called before every test.""" self._env = GarageEnv(gym.make('InvertedDoublePendulum-v2')) self._runner = LocalRunner(snapshot_config) self._policy = GaussianMLPPolicy(env_spec=self._env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=torch.tanh, output_nonlinearity=None) self._params = { 'env_spec': self._env.spec, 'policy': self._policy, 'baseline': LinearFeatureBaseline(env_spec=self._env.spec), 'max_path_length': 100, 'discount': 0.99, }
def run_task(*_): """Train CEM with Cartpole-v1 environment.""" with LocalRunner() as runner: env = TfEnv(env_name='CartPole-v1') policy = CategoricalMLPPolicy( name='policy', env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) runner.initialize_tf_vars() n_samples = 20 algo = CEM( env_spec=env.spec, policy=policy, baseline=baseline, best_frac=0.05, max_path_length=100, n_samples=n_samples) runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler) # NOTE: make sure that n_epoch_cycles == n_samples ! runner.train(n_epochs=100, batch_size=1000, n_epoch_cycles=n_samples)
def test_reps_cartpole(self):
    """Test REPS with gym Cartpole environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(gym.make('CartPole-v0'))

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    batch_size=4000,
                    max_path_length=100,
                    n_itr=10,
                    discount=0.99,
                    max_kl_step=1e6)

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5

        env.close()
def test_npo_pendulum(self):
    """Test NPO with Pendulum environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = NPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.98,
                   policy_ent_coeff=0.0)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 20

        env.close()
def test_categorical_policies(self, policy_cls):
    """Test TRPO with the given categorical policy on CartPole-v0."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make("CartPole-v0")))

        policy = policy_cls(name="policy", env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            step_size=0.01,
            plot=True,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=4000)

        env.close()
def run_task(snapshot_config, v):
    """Wrap the main training loop for run_experiment.

    We wrap the main training loop in the run_task function so that
    run_experiment can easily execute variants of the experiment on
    different machines.
    """
    with LocalRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            # The neural network policy should have two hidden layers,
            # each with 32 hidden units.
            hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=v['step_size'],
        )

        runner.setup(algo=algo, env=env)
        runner.train(
            n_epochs=40,
            batch_size=4000,
            # Uncomment to enable plotting
            # plot=True
        )
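# A minimal launcher sketch (an assumption, not part of the original snippet):
# run_task above is normally handed to garage's run_experiment, which supplies
# snapshot_config and the variant dict. The import path and keyword arguments
# below depend on the garage version in use; treat them as illustrative.
from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',
    seed=1,
    variant={'step_size': 0.01},
)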
def test_vpg_cartpole(self):
    """Test VPG with CartPole-v1 environment."""
    with LocalRunner(sess=self.sess) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01)))

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 90

        env.close()
def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    algo = PPO(env_spec=self.env.spec,
               policy=self.policy,
               value_function=self.value_function,
               max_episode_length=100,
               discount=0.99,
               gae_lambda=0.97,
               lr_clip_range=2e-1)
    runner.setup(algo, self.env, sampler_cls=LocalSampler)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 0
def test_ppo_pendulum_gru_with_model(self):
    """Test PPO with Pendulum environment and GRU policy."""
    with LocalRunner(sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianGRUPolicyWithModel(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80

        env.close()
def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]

    with LocalRunner(self.sess) as runner:
        env = TfEnv(DmControlEnv.from_suite(*task))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=5,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=10)

        env.close()
def run_task(vv):
    with LocalRunner() as runner:
        env = TfEnv(normalize(gym.make('HalfCheetah-v1')))

        policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=(32, 32),
                                   name="policy")

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            step_size=vv["step_size"],
        )

        runner.setup(algo=algo, env=env)
        runner.train(
            n_epochs=40,
            batch_size=4000,
            # Uncomment to enable plotting
            # plot=True
        )
def test_sac_inverted_pendulum():
    """Test SAC performance on inverted pendulum."""
    # pylint: disable=unexpected-keyword-arg
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=1)

    runner = LocalRunner(snapshot_config=snapshot_config)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=100,
              max_path_length=100,
              use_automatic_entropy_tuning=True,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)

    runner.setup(sac, env, sampler_cls=LocalSampler)
    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    sac.to()
    ret = runner.train(n_epochs=12, batch_size=200, plot=False)
    assert ret > 85
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalRunner(snapshot_config=snapshot_config) as runner:
        n_epochs = 100
        n_epoch_cycles = 20
        sampler_batch_size = 500
        num_timesteps = n_epochs * n_epoch_cycles * sampler_batch_size

        env = gym.make('PongNoFrameskip-v4')
        env = Noop(env, noop_max=30)
        env = MaxAndSkip(env, skip=4)
        env = EpisodicLife(env)
        if 'FIRE' in env.unwrapped.get_action_meanings():
            env = FireReset(env)
        env = Grayscale(env)
        env = Resize(env, 84, 84)
        env = ClipReward(env)
        env = StackFrames(env, 4)

        env = TfEnv(env)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(5e4),
                                           time_horizon=1)

        qf = DiscreteCNNQFunction(env_spec=env.spec,
                                  filter_dims=(8, 4, 3),
                                  num_filters=(32, 64, 64),
                                  strides=(4, 2, 1),
                                  dueling=False)

        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)

        epsilon_greedy_strategy = EpsilonGreedyStrategy(
            env_spec=env.spec,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)

        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_strategy=epsilon_greedy_strategy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=0.99,
                   min_buffer_size=int(1e4),
                   double_q=False,
                   n_train_steps=500,
                   n_epoch_cycles=n_epoch_cycles,
                   target_network_update_freq=2,
                   buffer_batch_size=32)

        runner.setup(algo, env)
        runner.train(n_epochs=n_epochs,
                     n_epoch_cycles=n_epoch_cycles,
                     batch_size=sampler_batch_size)