def test_get_params_internal(self, obs_dim): box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim)) with mock.patch(('metarl.tf.baselines.' 'gaussian_cnn_baseline.' 'GaussianCNNRegressor'), new=SimpleGaussianCNNRegressor): gcb = GaussianCNNBaseline(env_spec=box_env.spec, regressor_args=dict()) params_interal = gcb.get_params_internal() trainable_params = tf.compat.v1.trainable_variables( scope='GaussianCNNBaseline') assert np.array_equal(params_interal, trainable_params)
def test_trpo_cnn_cubecrash(self): with LocalTFRunner(snapshot_config, sess=self.sess) as runner: env = MetaRLEnv(normalize(gym.make('CubeCrash-v0'))) policy = CategoricalCNNPolicy(env_spec=env.spec, filters=((32, (8, 8)), (64, (4, 4))), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32)) baseline = GaussianCNNBaseline( env_spec=env.spec, regressor_args=dict(filters=((32, (8, 8)), (64, (4, 4))), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32), use_trust_region=True)) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, gae_lambda=0.98, max_kl_step=0.01, policy_ent_coeff=0.0, flatten_input=False) runner.setup(algo, env) last_avg_ret = runner.train(n_epochs=10, batch_size=2048) assert last_avg_ret > -1.5 env.close()
def test_fit(self, obs_dim): box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim)) with mock.patch(('metarl.tf.baselines.' 'gaussian_cnn_baseline.' 'GaussianCNNRegressor'), new=SimpleGaussianCNNRegressor): gcb = GaussianCNNBaseline(env_spec=box_env.spec) paths = [{ 'observations': [np.full(obs_dim, 1)], 'returns': [1] }, { 'observations': [np.full(obs_dim, 2)], 'returns': [2] }] gcb.fit(paths) obs = {'observations': [np.full(obs_dim, 1), np.full(obs_dim, 2)]} prediction = gcb.predict(obs) assert np.array_equal(prediction, [1, 2])
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000): """Train PPO on MemorizeDigits-v0 environment. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. batch_size (int): Number of timesteps to use in each training step. """ set_seed(seed) with LocalTFRunner(ctxt) as runner: env = MetaRLEnv(normalize(gym.make('MemorizeDigits-v0')), is_image=True) policy = CategoricalCNNPolicy(env_spec=env.spec, filters=( (32, (5, 5)), (64, (3, 3)), (64, (2, 2)), ), strides=(4, 2, 1), padding='VALID', hidden_sizes=(256, )) # yapf: disable baseline = GaussianCNNBaseline( env_spec=env.spec, regressor_args=dict(filters=( (32, (5, 5)), (64, (3, 3)), (64, (2, 2)), ), strides=(4, 2, 1), padding='VALID', hidden_sizes=(256, ), use_trust_region=True)) # yapf: disable algo = PPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_epochs=10, learning_rate=1e-3, ), flatten_input=False) runner.setup(algo, env) runner.train(n_epochs=1000, batch_size=batch_size)
def test_obs_not_image(self): env = MetaRLEnv(DummyDiscretePixelEnv(), is_image=False) with mock.patch(('metarl.tf.baselines.' 'gaussian_cnn_baseline.' 'GaussianCNNRegressor'), new=SimpleGaussianCNNRegressor): with mock.patch( 'metarl.tf.baselines.' 'gaussian_cnn_baseline.' 'normalize_pixel_batch', side_effect=normalize_pixel_batch) as npb: gcb = GaussianCNNBaseline(env_spec=env.spec) obs_dim = env.spec.observation_space.shape paths = [{ 'observations': [np.full(obs_dim, 1)], 'returns': [1] }, { 'observations': [np.full(obs_dim, 2)], 'returns': [2] }] gcb.fit(paths) obs = { 'observations': [np.full(obs_dim, 1), np.full(obs_dim, 2)] } gcb.predict(obs) assert not npb.called
def categorical_cnn_policy(ctxt, env_id, seed): """Create Categorical CNN Policy on TF-PPO. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. env_id (str): Environment id of the task. seed (int): Random positive integer for the trial. """ deterministic.set_seed(seed) with LocalTFRunner(ctxt, max_cpus=12) as runner: env = MetaRLEnv(normalize(gym.make(env_id))) policy = CategoricalCNNPolicy( env_spec=env.spec, conv_filters=hyper_params['conv_filters'], conv_strides=hyper_params['conv_strides'], conv_pad=hyper_params['conv_pad'], hidden_sizes=hyper_params['hidden_sizes']) baseline = GaussianCNNBaseline( env_spec=env.spec, regressor_args=dict( filters=hyper_params['conv_filters'], strides=hyper_params['conv_strides'], padding=hyper_params['conv_pad'], hidden_sizes=hyper_params['hidden_sizes'], use_trust_region=hyper_params['use_trust_region'])) algo = PPO( env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_epochs=10, learning_rate=1e-3, ), flatten_input=False, ) runner.setup(algo, env) runner.train(n_epochs=hyper_params['n_epochs'], batch_size=hyper_params['batch_size'])
def test_is_pickleable(self): box_env = MetaRLEnv(DummyBoxEnv(obs_dim=(1, 1))) with mock.patch(('metarl.tf.baselines.' 'gaussian_cnn_baseline.' 'GaussianCNNRegressor'), new=SimpleGaussianCNNRegressor): gcb = GaussianCNNBaseline(env_spec=box_env.spec) obs = {'observations': [np.full((1, 1), 1), np.full((1, 1), 1)]} with tf.compat.v1.variable_scope('GaussianCNNBaseline', reuse=True): return_var = tf.compat.v1.get_variable( 'SimpleGaussianCNNModel/return_var') return_var.load(1.0) prediction = gcb.predict(obs) h = pickle.dumps(gcb) with tf.compat.v1.Session(graph=tf.Graph()): gcb_pickled = pickle.loads(h) prediction2 = gcb_pickled.predict(obs) assert np.array_equal(prediction, prediction2)
def test_param_values(self, obs_dim): box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim)) with mock.patch(('metarl.tf.baselines.' 'gaussian_cnn_baseline.' 'GaussianCNNRegressor'), new=SimpleGaussianCNNRegressor): gcb = GaussianCNNBaseline(env_spec=box_env.spec) new_gcb = GaussianCNNBaseline(env_spec=box_env.spec, name='GaussianCNNBaseline2') # Manual change the parameter of GaussianCNNBaseline with tf.compat.v1.variable_scope('GaussianCNNBaseline', reuse=True): return_var = tf.compat.v1.get_variable( 'SimpleGaussianCNNModel/return_var') return_var.load(1.0) old_param_values = gcb.get_param_values() new_param_values = new_gcb.get_param_values() assert not np.array_equal(old_param_values, new_param_values) new_gcb.set_param_values(old_param_values) new_param_values = new_gcb.get_param_values() assert np.array_equal(old_param_values, new_param_values)
def run_task(snapshot_config, variant_data, *_): """Run task. Args: snapshot_config (metarl.experiment.SnapshotConfig): The snapshot configuration used by LocalRunner to create the snapshotter. variant_data (dict): Custom arguments for the task. *_ (object): Ignored by this function. """ with LocalTFRunner(snapshot_config=snapshot_config) as runner: env = TfEnv(normalize(gym.make('CubeCrash-v0'))) policy = CategoricalCNNPolicy(env_spec=env.spec, conv_filters=(32, 64), conv_filter_sizes=(8, 4), conv_strides=(4, 2), conv_pad='VALID', hidden_sizes=(32, 32)) baseline = GaussianCNNBaseline(env_spec=env.spec, regressor_args=dict( num_filters=(32, 64), filter_dims=(8, 4), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32), use_trust_region=True)) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, max_kl_step=0.01, flatten_input=False) runner.setup(algo, env) runner.train(n_epochs=100, batch_size=variant_data['batch_size'])
def trpo_cubecrash(ctxt=None, seed=1, batch_size=4000): """Train TRPO with CubeCrash-v0 environment. Args: ctxt (metarl.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. batch_size (int): Number of timesteps to use in each training step. """ set_seed(seed) with LocalTFRunner(ctxt) as runner: env = MetaRLEnv(normalize(gym.make('CubeCrash-v0'))) policy = CategoricalCNNPolicy(env_spec=env.spec, filters=((32, (8, 8)), (64, (4, 4))), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32)) baseline = GaussianCNNBaseline( env_spec=env.spec, regressor_args=dict(filters=((32, (8, 8)), (64, (4, 4))), strides=(4, 2), padding='VALID', hidden_sizes=(32, 32), use_trust_region=True)) algo = TRPO(env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, flatten_input=False) runner.setup(algo, env) runner.train(n_epochs=100, batch_size=batch_size)
def test_obs_is_image(self): env = MetaRLEnv(DummyDiscretePixelEnv(), is_image=True) with mock.patch(('metarl.tf.baselines.' 'gaussian_cnn_baseline.' 'GaussianCNNRegressor'), new=SimpleGaussianCNNRegressor): with mock.patch( 'metarl.tf.baselines.' 'gaussian_cnn_baseline.' 'normalize_pixel_batch', side_effect=normalize_pixel_batch) as npb: gcb = GaussianCNNBaseline(env_spec=env.spec) obs_dim = env.spec.observation_space.shape paths = [{ 'observations': [np.full(obs_dim, 1)], 'returns': [1] }, { 'observations': [np.full(obs_dim, 2)], 'returns': [2] }] gcb.fit(paths) observations = np.concatenate( [p['observations'] for p in paths]) assert npb.call_count == 1, ( "Expected '%s' to have been called once. Called %s times." % (npb._mock_name or 'mock', npb.call_count)) assert (npb.call_args_list[0][0][0] == observations).all() obs = { 'observations': [np.full(obs_dim, 1), np.full(obs_dim, 2)] } observations = obs['observations'] gcb.predict(obs) assert npb.call_args_list[1][0][0] == observations
def run_metarl(env, seed, log_dir): ''' Create metarl model and training. Replace the ppo with the algorithm you want to run. :param env: Environment of the task. :param seed: Random seed for the trial. :param log_dir: Log dir path. :return: ''' deterministic.set_seed(seed) config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=12, inter_op_parallelism_threads=12) sess = tf.Session(config=config) with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner: env = TfEnv(normalize(env)) policy = CategoricalCNNPolicy( env_spec=env.spec, conv_filters=params['conv_filters'], conv_filter_sizes=params['conv_filter_sizes'], conv_strides=params['conv_strides'], conv_pad=params['conv_pad'], hidden_sizes=params['hidden_sizes']) baseline = GaussianCNNBaseline( env_spec=env.spec, regressor_args=dict(num_filters=params['conv_filters'], filter_dims=params['conv_filter_sizes'], strides=params['conv_strides'], padding=params['conv_pad'], hidden_sizes=params['hidden_sizes'], use_trust_region=params['use_trust_region'])) algo = PPO( env_spec=env.spec, policy=policy, baseline=baseline, max_path_length=100, discount=0.99, gae_lambda=0.95, lr_clip_range=0.2, policy_ent_coeff=0.0, optimizer_args=dict( batch_size=32, max_epochs=10, tf_optimizer_args=dict(learning_rate=1e-3), ), flatten_input=False, ) # Set up logger since we are not using run_experiment tabular_log_file = osp.join(log_dir, 'progress.csv') dowel_logger.add_output(dowel.StdOutput()) dowel_logger.add_output(dowel.CsvOutput(tabular_log_file)) dowel_logger.add_output(dowel.TensorBoardOutput(log_dir)) runner.setup(algo, env) runner.train(n_epochs=params['n_epochs'], batch_size=params['batch_size']) dowel_logger.remove_all() return tabular_log_file
def test_invalid_obs_shape(self, obs_dim): box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim)) with pytest.raises(ValueError): GaussianCNNBaseline(env_spec=box_env.spec)