def test_get_params_internal(self, obs_dim):
     """Compare get_params_internal() with the baseline scope's variables.

     Args:
         obs_dim: Observation shape forwarded to DummyBoxEnv.
     """
     regressor_target = ('metarl.tf.baselines.'
                         'gaussian_cnn_baseline.'
                         'GaussianCNNRegressor')
     env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
     # The simple regressor stands in only while the baseline is constructed.
     with mock.patch(regressor_target, new=SimpleGaussianCNNRegressor):
         baseline = GaussianCNNBaseline(env_spec=env.spec, regressor_args={})
     actual = baseline.get_params_internal()
     expected = tf.compat.v1.trainable_variables(scope='GaussianCNNBaseline')
     assert np.array_equal(actual, expected)
    def test_trpo_cnn_cubecrash(self):
        """Train TRPO with a CNN policy and CNN baseline on CubeCrash-v0.

        Trains for 10 epochs and asserts that the value returned by
        runner.train() is greater than -1.5.
        """
        # CNN layout used by both the policy and the baseline regressor.
        cnn_kwargs = dict(filters=((32, (8, 8)), (64, (4, 4))),
                          strides=(4, 2),
                          padding='VALID',
                          hidden_sizes=(32, 32))

        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            cubecrash = gym.make('CubeCrash-v0')
            env = MetaRLEnv(normalize(cubecrash))

            policy = CategoricalCNNPolicy(env_spec=env.spec, **cnn_kwargs)
            baseline = GaussianCNNBaseline(
                env_spec=env.spec,
                regressor_args=dict(cnn_kwargs, use_trust_region=True))

            trpo_kwargs = dict(max_path_length=100,
                               discount=0.99,
                               gae_lambda=0.98,
                               max_kl_step=0.01,
                               policy_ent_coeff=0.0,
                               flatten_input=False)
            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        **trpo_kwargs)

            runner.setup(algo, env)
            final_return = runner.train(n_epochs=10, batch_size=2048)
            assert final_return > -1.5

            env.close()
    def test_fit(self, obs_dim):
        """Fit the baseline on two one-step paths and check its predictions.

        Args:
            obs_dim: Observation shape used for the env and the fixtures.
        """
        regressor_target = ('metarl.tf.baselines.'
                            'gaussian_cnn_baseline.'
                            'GaussianCNNRegressor')
        env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
        with mock.patch(regressor_target, new=SimpleGaussianCNNRegressor):
            baseline = GaussianCNNBaseline(env_spec=env.spec)

        # One path per value: constant observation `v` paired with return `v`.
        values = (1, 2)
        paths = [{
            'observations': [np.full(obs_dim, v)],
            'returns': [v]
        } for v in values]
        baseline.fit(paths)

        query = {'observations': [np.full(obs_dim, v) for v in values]}
        predicted = baseline.predict(query)
        assert np.array_equal(predicted, [1, 2])
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000):
    """Run PPO with a CNN policy and CNN baseline on MemorizeDigits-v0.

    Args:
        ctxt (metarl.experiment.ExperimentContext): Experiment configuration
            handed to LocalTFRunner.
        seed (int): Value passed to set_seed() before anything is built.
        batch_size (int): Number of timesteps to use in each training step;
            forwarded to runner.train().

    """
    set_seed(seed)

    # Convolution/MLP layout shared by the policy and the baseline regressor.
    cnn_kwargs = dict(
        filters=((32, (5, 5)), (64, (3, 3)), (64, (2, 2))),
        strides=(4, 2, 1),
        padding='VALID',
        hidden_sizes=(256, ),
    )
    optimizer_args = dict(batch_size=32, max_epochs=10, learning_rate=1e-3)

    with LocalTFRunner(ctxt) as runner:
        normalized = normalize(gym.make('MemorizeDigits-v0'))
        env = MetaRLEnv(normalized, is_image=True)

        policy = CategoricalCNNPolicy(env_spec=env.spec, **cnn_kwargs)
        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(cnn_kwargs, use_trust_region=True))

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=optimizer_args,
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=batch_size)
    def test_obs_not_image(self):
        """Assert normalize_pixel_batch is not called when is_image=False.

        The real normalize_pixel_batch is installed as the mock's side
        effect, so any call would still execute it while being recorded.
        """
        module = 'metarl.tf.baselines.gaussian_cnn_baseline.'
        env = MetaRLEnv(DummyDiscretePixelEnv(), is_image=False)

        swap_regressor = mock.patch(module + 'GaussianCNNRegressor',
                                    new=SimpleGaussianCNNRegressor)
        spy_normalize = mock.patch(module + 'normalize_pixel_batch',
                                   side_effect=normalize_pixel_batch)
        with swap_regressor, spy_normalize as npb:
            baseline = GaussianCNNBaseline(env_spec=env.spec)

            shape = env.spec.observation_space.shape
            values = (1, 2)
            # One path per value: constant observation `v`, return `v`.
            paths = [{
                'observations': [np.full(shape, v)],
                'returns': [v]
            } for v in values]
            baseline.fit(paths)

            query = {'observations': [np.full(shape, v) for v in values]}
            baseline.predict(query)
            assert not npb.called
# Example #6
# 0
def categorical_cnn_policy(ctxt, env_id, seed):
    """Train TF-PPO with a Categorical CNN policy on the given environment.

    Network and training settings are read from the ``hyper_params``
    mapping, which is expected to be defined elsewhere in this file.

    Args:
        ctxt (metarl.experiment.ExperimentContext): Experiment configuration
            handed to LocalTFRunner.
        env_id (str): Environment id passed to gym.make().
        seed (int): Random positive integer for the trial; passed to
            deterministic.set_seed().

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))
        hp = hyper_params

        # NOTE(review): other call sites in this file pass
        # filters/strides/padding to CategoricalCNNPolicy; confirm these
        # conv_* keyword names match the installed version.
        policy_kwargs = dict(env_spec=env.spec,
                             conv_filters=hp['conv_filters'],
                             conv_strides=hp['conv_strides'],
                             conv_pad=hp['conv_pad'],
                             hidden_sizes=hp['hidden_sizes'])
        policy = CategoricalCNNPolicy(**policy_kwargs)

        regressor_args = dict(filters=hp['conv_filters'],
                              strides=hp['conv_strides'],
                              padding=hp['conv_pad'],
                              hidden_sizes=hp['hidden_sizes'],
                              use_trust_region=hp['use_trust_region'])
        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=regressor_args)

        optimizer_args = dict(batch_size=32, max_epochs=10, learning_rate=1e-3)
        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=optimizer_args,
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=hp['n_epochs'], batch_size=hp['batch_size'])
    def test_is_pickleable(self):
        """Check predictions are unchanged after a pickle round trip.

        The baseline is unpickled inside a session on a fresh graph and its
        prediction is compared with the one made before pickling.
        """
        regressor_target = ('metarl.tf.baselines.'
                            'gaussian_cnn_baseline.'
                            'GaussianCNNRegressor')
        env = MetaRLEnv(DummyBoxEnv(obs_dim=(1, 1)))
        with mock.patch(regressor_target, new=SimpleGaussianCNNRegressor):
            baseline = GaussianCNNBaseline(env_spec=env.spec)
        query = {'observations': [np.full((1, 1), 1) for _ in range(2)]}

        # Load 1.0 into the model's return variable before predicting.
        with tf.compat.v1.variable_scope('GaussianCNNBaseline', reuse=True):
            return_var = tf.compat.v1.get_variable(
                'SimpleGaussianCNNModel/return_var')
        return_var.load(1.0)

        before = baseline.predict(query)
        serialized = pickle.dumps(baseline)

        with tf.compat.v1.Session(graph=tf.Graph()):
            restored = pickle.loads(serialized)
            after = restored.predict(query)

            assert np.array_equal(before, after)
    def test_param_values(self, obs_dim):
        """Check set_param_values() copies parameters between two baselines.

        Args:
            obs_dim: Observation shape forwarded to DummyBoxEnv.
        """
        regressor_target = ('metarl.tf.baselines.'
                            'gaussian_cnn_baseline.'
                            'GaussianCNNRegressor')
        env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
        with mock.patch(regressor_target, new=SimpleGaussianCNNRegressor):
            source = GaussianCNNBaseline(env_spec=env.spec)
            target = GaussianCNNBaseline(env_spec=env.spec,
                                         name='GaussianCNNBaseline2')

        # Manually change a parameter in the 'GaussianCNNBaseline' scope so
        # the two baselines start out with different values.
        with tf.compat.v1.variable_scope('GaussianCNNBaseline', reuse=True):
            return_var = tf.compat.v1.get_variable(
                'SimpleGaussianCNNModel/return_var')
        return_var.load(1.0)

        source_values = source.get_param_values()
        assert not np.array_equal(source_values, target.get_param_values())

        target.set_param_values(source_values)
        assert np.array_equal(source_values, target.get_param_values())
# Example #9
# 0
def run_task(snapshot_config, variant_data, *_):
    """Train TRPO with a CNN policy and CNN baseline on CubeCrash-v0.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration handed to LocalTFRunner.

        variant_data (dict): Custom arguments for the task; its
            'batch_size' entry is forwarded to runner.train().

        *_ (object): Ignored by this function.

    """
    # Layer layout shared by the policy and the baseline regressor.
    num_filters = (32, 64)
    filter_sizes = (8, 4)
    strides = (4, 2)
    pad = 'VALID'
    hidden_sizes = (32, 32)

    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        cubecrash = gym.make('CubeCrash-v0')
        env = TfEnv(normalize(cubecrash))

        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=num_filters,
                                      conv_filter_sizes=filter_sizes,
                                      conv_strides=strides,
                                      conv_pad=pad,
                                      hidden_sizes=hidden_sizes)

        regressor_args = dict(num_filters=num_filters,
                              filter_dims=filter_sizes,
                              strides=strides,
                              padding=pad,
                              hidden_sizes=hidden_sizes,
                              use_trust_region=True)
        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=regressor_args)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=variant_data['batch_size'])
def trpo_cubecrash(ctxt=None, seed=1, batch_size=4000):
    """Run TRPO with a CNN policy and CNN baseline on CubeCrash-v0.

    Args:
        ctxt (metarl.experiment.ExperimentContext): Experiment configuration
            handed to LocalTFRunner.
        seed (int): Value passed to set_seed() before anything is built.
        batch_size (int): Number of timesteps to use in each training step;
            forwarded to runner.train().

    """
    set_seed(seed)

    # CNN layout used by both the policy and the baseline regressor.
    cnn_kwargs = dict(filters=((32, (8, 8)), (64, (4, 4))),
                      strides=(4, 2),
                      padding='VALID',
                      hidden_sizes=(32, 32))

    with LocalTFRunner(ctxt) as runner:
        cubecrash = gym.make('CubeCrash-v0')
        env = MetaRLEnv(normalize(cubecrash))

        policy = CategoricalCNNPolicy(env_spec=env.spec, **cnn_kwargs)
        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(cnn_kwargs, use_trust_region=True))

        trpo_kwargs = dict(max_path_length=100,
                           discount=0.99,
                           gae_lambda=0.95,
                           lr_clip_range=0.2,
                           policy_ent_coeff=0.0,
                           flatten_input=False)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    **trpo_kwargs)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
    def test_obs_is_image(self):
        """Check normalize_pixel_batch receives the observations for images.

        With is_image=True, fit() must call normalize_pixel_batch exactly
        once with the concatenated path observations, and the following
        predict() call must pass it the observations being predicted.
        The real normalize_pixel_batch is kept as the mock's side effect.
        """
        env = MetaRLEnv(DummyDiscretePixelEnv(), is_image=True)
        with mock.patch(('metarl.tf.baselines.'
                         'gaussian_cnn_baseline.'
                         'GaussianCNNRegressor'),
                        new=SimpleGaussianCNNRegressor):
            with mock.patch(
                    'metarl.tf.baselines.'
                    'gaussian_cnn_baseline.'
                    'normalize_pixel_batch',
                    side_effect=normalize_pixel_batch) as npb:

                gcb = GaussianCNNBaseline(env_spec=env.spec)

                obs_dim = env.spec.observation_space.shape
                paths = [{
                    'observations': [np.full(obs_dim, 1)],
                    'returns': [1]
                }, {
                    'observations': [np.full(obs_dim, 2)],
                    'returns': [2]
                }]

                gcb.fit(paths)
                observations = np.concatenate(
                    [p['observations'] for p in paths])
                assert npb.call_count == 1, (
                    "Expected '%s' to have been called once. Called %s times."
                    % (npb._mock_name or 'mock', npb.call_count))
                # np.array_equal yields a single bool whether the recorded
                # argument is a list or an ndarray, and returns False on a
                # shape mismatch instead of failing inside the comparison.
                assert np.array_equal(npb.call_args_list[0][0][0],
                                      observations)

                obs = {
                    'observations': [np.full(obs_dim, 1),
                                     np.full(obs_dim, 2)]
                }
                observations = obs['observations']
                gcb.predict(obs)
                # A bare `==` on array data is ambiguous inside `assert`;
                # compare element-wise and reduce to one bool.
                assert np.array_equal(npb.call_args_list[1][0][0],
                                      observations)
def run_metarl(env, seed, log_dir):
    """Create the metarl model and train it, logging progress with dowel.

    Replace PPO with the algorithm you want to run.

    Hyperparameters are read from the ``params`` mapping and the snapshot
    settings from ``snapshot_config``; both are expected to be defined
    elsewhere in this file.

    Args:
        env: Environment of the task; it is wrapped with normalize() and
            TfEnv before training.
        seed: Random seed for the trial.
        log_dir: Log dir path; receives ``progress.csv`` and the
            TensorBoard output.

    Returns:
        Path of the ``progress.csv`` tabular log file inside ``log_dir``.

    """
    deterministic.set_seed(seed)
    # Use the tf.compat.v1 entry points, like the other TensorFlow calls in
    # this file, so the session setup does not depend on the TF1-only
    # top-level aliases.
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=12,
                                      inter_op_parallelism_threads=12)
    sess = tf.compat.v1.Session(config=config)

    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=params['conv_filters'],
            conv_filter_sizes=params['conv_filter_sizes'],
            conv_strides=params['conv_strides'],
            conv_pad=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(num_filters=params['conv_filters'],
                                filter_dims=params['conv_filter_sizes'],
                                strides=params['conv_strides'],
                                padding=params['conv_pad'],
                                hidden_sizes=params['hidden_sizes'],
                                use_trust_region=params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
            flatten_input=False,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        try:
            dowel_logger.add_output(dowel.StdOutput())
            dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
            dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

            runner.setup(algo, env)
            runner.train(n_epochs=params['n_epochs'],
                         batch_size=params['batch_size'])
        finally:
            # Detach the outputs even when setup or training raises, so a
            # failed trial does not leave them attached to the logger.
            dowel_logger.remove_all()

        return tabular_log_file
 def test_invalid_obs_shape(self, obs_dim):
     """Expect GaussianCNNBaseline construction to raise ValueError.

     Args:
         obs_dim: Observation shape forwarded to DummyBoxEnv.
     """
     env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
     expect_value_error = pytest.raises(ValueError)
     with expect_value_error:
         GaussianCNNBaseline(env_spec=env.spec)