Example #1
    def test_maml_sampling(self):
        # generate training data via random rollouts in the point environment
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
        dynamics_model = MLPDynamicsEnsemble("dyn_model3", env, hidden_sizes=(16,16), num_models=4)

        # flatten the paths into (obs, act, next_obs) arrays for model fitting
        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        env = TfEnv(normalize(PointEnv()))

        policy = MAMLImprovedGaussianMLPPolicy(
            name="policy3",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            grad_step_size=0.1,
            hidden_nonlinearity=tf.nn.tanh,
            trainable_step_size=False,
            bias_transform=False
        )

        from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        # fit dynamics model
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=1)

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
            )
            # sample entirely from the learned dynamics: one meta-task per ensemble model
            algo.meta_batch_size = dynamics_model.num_models
            algo.batch_size_dynamics_samples = algo.batch_size
            algo.dynamics_model = dynamics_model

            itr = 1

            model_sampler = MAMLModelVectorizedSampler(algo)
            model_sampler.start_worker()
            paths = model_sampler.obtain_samples(itr, return_dict=True)
            # paths is keyed by meta-task; process the first task's rollouts
            samples_data = model_sampler.process_samples(itr, paths[0])

            print(samples_data.keys())
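The helper `sample_random_trajectories_point_env` is not shown in this listing. Below is a minimal sketch of what it presumably does, assuming a gym-style `reset`/`step` interface and the rllab path-dict layout the test consumes; the repository's actual implementation and signature may differ:

import numpy as np

def sample_random_trajectories_point_env(env, num_paths=100, horizon=100):
    # roll out uniformly random actions and collect rllab-style path dicts
    paths = []
    for _ in range(num_paths):
        obs = env.reset()
        observations, actions, next_observations, rewards = [], [], [], []
        for _ in range(horizon):
            act = env.action_space.sample()
            next_obs, reward, done, _ = env.step(act)
            observations.append(obs)
            actions.append(act)
            next_observations.append(next_obs)
            rewards.append(reward)
            obs = next_obs
            if done:
                break
        paths.append(dict(observations=np.asarray(observations),
                          actions=np.asarray(actions),
                          next_observations=np.asarray(next_observations),
                          rewards=np.asarray(rewards)))
    return paths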
Example #2
    def test_model_sampling_with_dummy_different_meta_batch_size(self):
        env = DummyEnv()
        dynamics_dummy = DummyDynamicsEnsemble("dyn_model4", env, num_models=4)
        env = TfEnv(normalize(DummyEnv()))

        policy = MAMLImprovedGaussianMLPPolicy(
            name="policy4",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            grad_step_size=0.1,
            hidden_nonlinearity=tf.nn.tanh,
            trainable_step_size=False,
            bias_transform=False
        )

        from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        # no fitting needed; the dummy dynamics ensemble is used as-is
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
            )
            # deliberately use more meta-tasks than models: two tasks share each dummy model
            algo.meta_batch_size = dynamics_dummy.num_models * 2
            algo.batch_size_dynamics_samples = algo.batch_size
            algo.dynamics_model = dynamics_dummy

            itr = 1

            model_sampler = MAMLModelVectorizedSampler(algo)
            model_sampler.start_worker()
            paths = model_sampler.obtain_samples(itr, return_dict=True)

            # each meta-task should receive roughly batch_size / meta_batch_size
            # steps, with at most one path length of slack
            n_steps_per_model = np.array(
                [np.sum([path['observations'].shape[0] for path in model_paths]) for model_paths in paths.values()])

            self.assertTrue(
                all(np.abs(n_steps_per_model - algo.batch_size // algo.meta_batch_size) <= algo.max_path_length))

            # meta-task i is presumably served by model i // 2, whose dummy
            # prediction is the constant (i // 2) * 0.01; 1.0 is the reset observation
            for i in range(dynamics_dummy.num_models):
                for path in paths[i]:
                    self.assertTrue(
                        (np.logical_or(path['observations'] == 1.0, path['observations'] == i//2 * 0.01)).all())
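That assertion only makes sense if each dummy ensemble member predicts a recognizable constant. Here is a minimal stand-in illustrating the assumed behavior; the class name, constructor, and `predict` signature are hypothetical, not the repository's actual `DummyDynamicsEnsemble` API:

import numpy as np

class ConstantDynamicsEnsemble:
    # Hypothetical stand-in: ensemble member i always predicts the constant
    # next observation i * 0.01, so every observation along a rollout is
    # either the env's reset value (assumed to be 1.0) or that constant.
    def __init__(self, num_models=4):
        self.num_models = num_models

    def predict(self, obs, act, model_idx):
        return np.full_like(obs, model_idx * 0.01)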
Example #3
    def test_model_sampling_with_given_traj_starting_obs(self):
        env = DummyEnv()
        dynamics_dummy = DummyDynamicsEnsemble("dyn_model4", env, num_models=4)
        env = TfEnv(normalize(DummyEnv()))

        policy = MAMLImprovedGaussianMLPPolicy(
            name="policy4",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            grad_step_size=0.1,
            hidden_nonlinearity=tf.nn.tanh,
            trainable_step_size=False,
            bias_transform=False
        )

        from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        # no fitting needed; the dummy dynamics ensemble is used as-is
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
            )
            # two meta-tasks share each dummy model
            algo.meta_batch_size = dynamics_dummy.num_models * 2
            algo.batch_size_dynamics_samples = algo.batch_size
            algo.dynamics_model = dynamics_dummy

            itr = 1

            model_sampler = MAMLModelVectorizedSampler(algo)
            model_sampler.start_worker()

            # request that every rollout starts from one of these two observations
            traj_starting_obs = np.array([[-1, -1], [-0.5, -0.5]])
            paths = model_sampler.obtain_samples(itr, return_dict=True, traj_starting_obs=traj_starting_obs)

            # every sampled trajectory must start from one of the provided
            # starting observations, i.e. (-1, -1) or (-0.5, -0.5)
            for i in range(dynamics_dummy.num_models):
                for path in paths[i]:
                    start_mean = np.mean(path['observations'][0])
                    self.assertTrue(
                        np.abs(start_mean + 1.0) < 0.001 or np.abs(start_mean + 0.5) < 0.001)
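The behavior under test is presumably that `obtain_samples` seeds rollouts from the supplied observations instead of calling the env's reset. A minimal sketch of that tiling logic; the function name and its placement inside the sampler are hypothetical:

import numpy as np

def pick_starting_obs(traj_starting_obs, n_trajectories):
    # cycle the supplied starting observations across all requested rollouts
    idx = np.arange(n_trajectories) % traj_starting_obs.shape[0]
    return traj_starting_obs[idx]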
Example #4
    def test_random_sampling(self):
        # generate training data via random rollouts in the point environment
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
        dynamics_model = MLPDynamicsModel("dyn_model2", env, hidden_sizes=(16,16))

        # flatten the paths into (obs, act, next_obs) arrays for model fitting
        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        env = TfEnv(normalize(PointEnv()))

        policy = GaussianMLPPolicy(
            name="policy2",
            env_spec=env.spec,
            hidden_sizes=(16, 16),
            hidden_nonlinearity=tf.nn.tanh
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        # fit dynamics model
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=5)

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
            )

            algo.dynamics_model = dynamics_model

            itr = 1

            random_sampler = RandomVectorizedSampler(algo)
            random_sampler.start_worker()
            paths = random_sampler.obtain_samples(itr)
            samples_data = random_sampler.process_samples(itr, paths)

            # the processed samples must expose the model-training arrays
            self.assertTrue(set(samples_data.keys()) >= {
                'actions_dynamics', 'next_observations_dynamics', 'observations_dynamics'})
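A hedged continuation, not part of the original test: since the `*_dynamics` keys hold arrays in the same layout that `fit` consumed earlier, the processed random samples could be fed straight back into the dynamics model:

            # refit the model on the freshly collected random samples
            dynamics_model.fit(samples_data['observations_dynamics'],
                               samples_data['actions_dynamics'],
                               samples_data['next_observations_dynamics'],
                               epochs=5)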