def test_categorical_mlp_policy(self):
    categorical_mlp_policy = CategoricalMLPPolicy(env_spec=self.env,
                                                  hidden_sizes=(1, ))
    self.sess.run(tf.global_variables_initializer())

    obs = self.env.observation_space.high
    assert categorical_mlp_policy.get_action(obs)

def fixture_exp(snapshot_config, sess):
    """Dummy fixture experiment function.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        sess (tf.Session): An optional TensorFlow session.
            A new session will be created immediately if not provided.

    Returns:
        np.ndarray: Values of the parameters evaluated in the current session.

    """
    with LocalTFRunner(snapshot_config=snapshot_config, sess=sess) as runner:
        env = GarageEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_episode_length=100,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))

        runner.setup(algo, env, sampler_cls=LocalSampler)
        runner.train(n_epochs=5, batch_size=100)
        return policy.get_param_values()

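# A minimal sketch of how a fixture like `fixture_exp` is typically consumed
# in a snapshot/resume test. `SnapshotConfig`, `LocalTFRunner`,
# `runner.restore` and `runner.resume` are part of garage's experiment API;
# the temporary directory and epoch count below are assumptions for
# illustration only.
import tensorflow as tf
from garage.experiment import LocalTFRunner, SnapshotConfig


def resume_fixture_exp_sketch():
    snapshot_config = SnapshotConfig(snapshot_dir='/tmp/fixture_exp',
                                     snapshot_mode='last',
                                     snapshot_gap=1)
    # Run the fixture experiment once and keep its final parameters.
    with tf.compat.v1.Session() as sess:
        old_params = fixture_exp(snapshot_config, sess)
    # Restore from the saved snapshot and continue training.
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        runner.restore('/tmp/fixture_exp')
        runner.resume(n_epochs=10)
    return old_params
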
def test_get_regularizable_vars(self, obs_dim, action_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    reg_vars = policy.get_regularizable_vars()
    assert len(reg_vars) == 2
    for var in reg_vars:
        assert ('bias' not in var.name) and ('output' not in var.name)

def integrate_new_skill(self, new_skill_id, new_skill_subpath):
    skill_integration_method = \
        CategoricalMLPSkillIntegrator.Method.SUBPATH_SKILLS_AVG

    ## Hierarchized environment
    hrl_env = HierarchizedEnv(
        # base env that was wrapped in HierarchizedEnv
        # (not fully unwrapped - may be normalized!)
        env=self.env.env.env,
        num_orig_skills=self._hrl_policy.num_skills
    )
    tf_hrl_env = TfEnv(hrl_env)

    ## Top policy
    # 1) Get old policy from saved data
    old_top_policy = self._hrl_policy.get_top_policy()

    # 2) Get weights of old top policy
    otp_weights = unflatten_tensors(
        old_top_policy.get_param_values(),
        old_top_policy.get_param_shapes()
    )

    # 3) Create weights for new top policy
    skill_integrator = CategoricalMLPSkillIntegrator()
    ntp_weight_values = skill_integrator.integrate_skill(
        old_policy_weights=otp_weights,
        method=skill_integration_method,
        # Specific parameters for START_OBSS_SKILLS_AVG
        subpath_start_obss=new_skill_subpath['start_observations'],
        top_policy=old_top_policy,
        # Specific parameters for SUBPATH_SKILLS_AVG,
        # SUBPATH_SKILLS_SMOOTH_AVG and SUBPATH_FIRST_SKILL
        subpath_actions=new_skill_subpath['actions']
    )

    # 4) Create new policy and randomly initialize its weights
    new_top_policy = CategoricalMLPPolicy(
        # This env accounts for the new skill (action space = n + 1)
        env_spec=tf_hrl_env.spec,
        # As was in asa_test.py
        hidden_sizes=(32, 32),
        name='CategoricalMLPPolicyWithSkill{}'.format(new_skill_id)
    )
    ntp_init_op = tf.variables_initializer(new_top_policy.get_params())
    ntp_init_op.run()

    # 5) Fill new policy with adjusted weights
    new_top_policy.set_param_values(
        flattened_params=flatten_tensors(ntp_weight_values)
    )

    ## Adjust HRL policy and training algorithms
    self._hrl_policy.top_policy = new_top_policy
    hrl_env.set_hrl_policy(self._hrl_policy)
    self.env = tf_hrl_env
    self.policy = self._hrl_policy.get_top_policy()
    self._top_algo = self._top_algo_cls(
        env=tf_hrl_env,
        policy=self._hrl_policy.get_top_policy(),
        baseline=self.baseline,
        **self._top_algo_kwargs
    )
    self.sampler = self._top_algo.sampler
    self.start_worker(self._tf_sess)

def test_is_pickleable(self, obs_dim, action_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    obs = env.reset()

    with tf.compat.v1.variable_scope('CategoricalMLPPolicy', reuse=True):
        bias = tf.compat.v1.get_variable('mlp/hidden_0/bias')
    # assign the bias to all ones
    bias.load(tf.ones_like(bias).eval())

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist
    output1 = self.sess.run([dist_sym.probs],
                            feed_dict={state_input: [[obs.flatten()]]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      policy.input_dim))
        dist_sym = policy_pickled.build(state_input, name='dist_sym').dist
        output2 = sess.run([dist_sym.probs],
                           feed_dict={state_input: [[obs.flatten()]]})
    assert np.array_equal(output1, output2)

def test_is_pickleable(self, obs_dim, action_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_var = tf.compat.v1.placeholder(
        tf.float32,
        shape=[None, None, env.observation_space.flat_dim],
        name='obs')
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    policy.build(obs_var)
    obs = env.reset()

    with tf.compat.v1.variable_scope(
            'CategoricalMLPPolicy/CategoricalMLPModel', reuse=True):
        bias = tf.compat.v1.get_variable('mlp/hidden_0/bias')
    # assign the bias to all ones
    bias.load(tf.ones_like(bias).eval())

    output1 = self.sess.run(
        [policy.distribution.probs],
        feed_dict={policy.model.input: [[obs.flatten()]]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, env.observation_space.flat_dim],
            name='obs')
        policy_pickled.build(obs_var)
        output2 = sess.run(
            [policy_pickled.distribution.probs],
            feed_dict={policy_pickled.model.input: [[obs.flatten()]]})
    assert np.array_equal(output1, output2)

def test_clone(self):
    env = GarageEnv(DummyDiscreteEnv(obs_dim=(10, ), action_dim=4))
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    policy_clone = policy.clone('CategoricalMLPPolicyClone')
    assert policy.env_spec == policy_clone.env_spec
    for cloned_param, param in zip(policy_clone.parameters.values(),
                                   policy.parameters.values()):
        assert np.array_equal(cloned_param, param)

def test_get_regularizable_vars(self, obs_dim, action_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_var = tf.compat.v1.placeholder(
        tf.float32,
        shape=[None, None, env.observation_space.flat_dim],
        name='obs')
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    policy.build(obs_var)
    reg_vars = policy.get_regularizable_vars()
    assert len(reg_vars) == 2
    for var in reg_vars:
        assert ('bias' not in var.name) and ('output' not in var.name)

def test_get_action(self, obs_dim, action_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    obs = env.reset()

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions(
        [obs.flatten(), obs.flatten(), obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)

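# The `obs_dim`/`action_dim` arguments above suggest these test methods are
# driven by pytest parametrization. A plausible sketch of the decorator such
# a test would carry; the exact dimension tuples are assumptions for
# illustration:
import pytest


@pytest.mark.parametrize('obs_dim, action_dim', [
    ((1, ), 1),
    ((2, ), 2),
    ((1, 1), 1),
    ((2, 2), 2),
])
def test_get_action_parametrized(obs_dim, action_dim):
    ...  # body as in test_get_action above
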
def test_dist_info(self, obs_dim, action_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'categorical_mlp_policy.MLPModel'),
                    new=SimpleMLPModel):
        policy = CategoricalMLPPolicy(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)

    expected_prob = np.full(action_dim, 0.5)
    policy_probs = policy.dist_info([obs.flatten()])
    assert np.array_equal(policy_probs['prob'][0], expected_prob)

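# `SimpleMLPModel` is a test double patched in place of the real MLPModel.
# A minimal sketch of such a stub, assuming garage's `Model` base class: it
# ignores the input and emits a constant 0.5 for every action, which is why
# `expected_prob` above is uniform. The exact fixture in the test suite may
# differ.
import tensorflow as tf

from garage.tf.models import Model


class SimpleMLPModel(Model):
    """Stub model returning a constant 0.5 for each action."""

    def __init__(self, output_dim, name=None, *args, **kwargs):
        super().__init__(name)
        self.output_dim = output_dim

    def _build(self, obs_input, name=None):
        # Fill an (N, output_dim) tensor with the constant 0.5.
        return_var = tf.compat.v1.get_variable(
            'return_var', (), initializer=tf.constant_initializer(0.5))
        return tf.fill((tf.shape(obs_input)[0], self.output_dim), return_var)
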
def test_build(self, obs_dim, action_dim):
    env = GymEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    obs = env.reset()[0]

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist
    dist_sym2 = policy.build(state_input, name='dist_sym2').dist
    output1 = self.sess.run([dist_sym.probs],
                            feed_dict={state_input: [[obs.flatten()]]})
    output2 = self.sess.run([dist_sym2.probs],
                            feed_dict={state_input: [[obs.flatten()]]})
    assert np.array_equal(output1, output2)

def test_reps_cartpole(self):
    """Test REPS with gym CartPole environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(gym.make('CartPole-v0'))

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    batch_size=4000,
                    max_path_length=100,
                    n_itr=10,
                    discount=0.99,
                    max_kl_step=1e6)

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5

        env.close()

def vpg_cartpole(ctxt=None, seed=1):
    """Train VPG with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=10000)

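# A minimal sketch of how a launcher like `vpg_cartpole` is typically
# invoked: `wrap_experiment` supplies the `ctxt` (ExperimentContext)
# argument and sets up logging and snapshotting. The import path matches
# recent garage releases; treat it as an assumption if your version differs.
from garage import wrap_experiment

vpg_cartpole = wrap_experiment(vpg_cartpole)
vpg_cartpole(seed=1)
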
def test_reps_cartpole(self):
    """Test REPS with gym CartPole environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = GymEnv('CartPole-v0')

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99)

        trainer.setup(algo, env)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5

        env.close()

def test_vpg_cartpole(self):
    """Test VPG with CartPole-v1 environment."""
    with LocalRunner(sess=self.sess) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01, )))

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 90

        env.close()

def cem_cartpole(ctxt=None, seed=1):
    """Train CEM with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = GarageEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        n_samples = 20

        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.05,
                   max_path_length=100,
                   n_samples=n_samples)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=1000)

def test_batch_sampler(self):
    max_cpus = 8
    with LocalRunner(max_cpus=max_cpus) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env=env,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=1,
                   whole_paths=True,
                   discount=0.99)

        runner.setup(algo,
                     env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': max_cpus})

        try:
            runner.initialize_tf_vars()
        except BaseException:
            raise self.failureException(
                'LocalRunner should be able to initialize tf variables.')

        runner.start_worker()

        paths = runner.sampler.obtain_samples(0, 8)
        self.assertGreaterEqual(
            len(paths), max_cpus, 'BatchSampler should sample more than '
            'max_cpus=%d trajectories' % max_cpus)

def trpo_gym_tf_cartpole(ctxt=None, seed=1):
    """Train TRPO with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            discount=0.99,
            max_kl_step=0.01,
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=10, batch_size=10000, plot=False)

def run_task(*_):
    env = TfEnv(env_name='CartPole-v1')

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )

    algo = InstrumentedTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,
        max_path_length=100,
        n_itr=4,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=True,
    )
    algo.train()
    env.close()

def reps_gym_cartpole(ctxt=None, seed=1):
    """Train REPS with CartPole-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v0')

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000, plot=False)

def erwr_cartpole(ctxt=None, seed=1):
    """Train ERWR on CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = ERWR(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_episode_length=100,
                    discount=0.99)

        runner.setup(algo=algo, env=env)
        runner.train(n_epochs=100, batch_size=10000, plot=False)

def test_ppo_pendulum_flatten_input(self):
    """Test PPO with CartPole to test observation flattening."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(
            normalize(ReshapeObservation(gym.make('CartPole-v1'), (2, 2))))

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       tf_optimizer_args=dict(learning_rate=1e-3),
                   ))

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80

def trpo_cartpole(ctxt=None, seed=1):
    """Train TRPO with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)

def test_process_samples_discrete_non_recurrent(self):
    env = TfEnv(DummyDiscreteEnv())
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    max_path_length = 100
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = BatchPolopt2(env_spec=env.spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=max_path_length,
                            flatten_input=True)

        runner.setup(algo, env, sampler_args=dict(n_envs=1))
        runner.train(n_epochs=1, batch_size=max_path_length)

        paths = runner.obtain_samples(0)
        samples = algo.process_samples(0, paths)
        # Since there is only one vec_env in the sampler and
        # DummyDiscreteEnv always terminates, the number of paths must be
        # max_path_length, and the batch size must be max_path_length as
        # well, i.e. 100.
        assert samples['observations'].shape == (
            max_path_length, env.observation_space.flat_dim)
        assert samples['actions'].shape == (max_path_length,
                                            env.action_space.n)
        assert samples['rewards'].shape == (max_path_length, )
        assert samples['baselines'].shape == (max_path_length, )
        assert samples['returns'].shape == (max_path_length, )
        # There are 100 paths.
        assert samples['lengths'].shape == (max_path_length, )
        # A non-recurrent policy has empty agent info.
        assert samples['agent_infos'] == {}
        assert isinstance(samples['average_return'], float)

def run_task(snapshot_config, *_):
    """Train CEM with CartPole-v1 environment.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        n_samples = 20

        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.05,
                   max_path_length=100,
                   n_samples=n_samples)

        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=100, batch_size=1000)

def test_erwr_cartpole(self):
    """Test ERWR with CartPole-v1 environment."""
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        deterministic.set_seed(1)

        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = ERWR(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99)

        trainer.setup(algo, env)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 60

        env.close()

def reps_gym_cartpole(ctxt=None, seed=1):
    """Train REPS with CartPole-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = GarageEnv(gym.make('CartPole-v0'))

        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000, plot=False)

def test_cma_es_cartpole(self):
    """Test CMA-ES with CartPole-v1 environment."""
    with LocalTFRunner() as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        runner.initialize_tf_vars()

        n_samples = 20

        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     max_path_length=100,
                     n_samples=n_samples)

        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=1, batch_size=1000, n_epoch_cycles=n_samples)
        # No assertion on return because CMA-ES is not stable.

        env.close()

def run_task(*_):
    """Train CEM with CartPole-v1 environment."""
    with LocalRunner() as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        runner.initialize_tf_vars()

        n_samples = 20

        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.05,
                   max_path_length=100,
                   n_samples=n_samples)

        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)

        # NOTE: make sure that n_epoch_cycles == n_samples!
        runner.train(n_epochs=100, batch_size=1000, n_epoch_cycles=n_samples)

def run_task(snapshot_config, v):
    """We wrap the main training loop in the run_task function so that
    run_experiment can easily execute variants of the experiment on
    different machines.
    """
    with LocalRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=v['step_size'],
        )

        runner.setup(algo=algo, env=env)
        runner.train(
            n_epochs=40,
            batch_size=4000,
            # Uncomment to enable plotting
            # plot=True
        )

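# A minimal sketch of how `run_task(snapshot_config, v)` launchers like the
# one above were started in older garage: `run_experiment` builds the
# snapshot config and passes the variant dict as the second argument. The
# `variant` keyword and the step size value here are assumptions for
# illustration; check your garage version's run_experiment signature.
from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',
    seed=1,
    variant={'step_size': 0.01},
)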