Example #1
    def test_maml_sampling(self):
        # get training data from randomly sampled trajectories
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
        dynamics_model = MLPDynamicsEnsemble("dyn_model3", env, hidden_sizes=(16,16), num_models=4)

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        env = TfEnv(normalize(PointEnv()))

        policy = MAMLImprovedGaussianMLPPolicy(
            name="policy3",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            grad_step_size=0.1,
            hidden_nonlinearity=tf.nn.tanh,
            trainable_step_size=False,
            bias_transform=False
        )

        from rllab_maml.baselines.linear_feature_baseline import LinearFeatureBaseline
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        # fit dynamics model
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=1)

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
            )
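            # presumably one meta-"task" per ensemble member, so the MAML sampler
            # can draw imaginary rollouts from each dynamics model separately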
            algo.meta_batch_size = dynamics_model.num_models

            algo.batch_size_dynamics_samples = algo.batch_size

            algo.dynamics_model = dynamics_model

            itr = 1

            model_sampler = MAMLModelVectorizedSampler(algo)
            model_sampler.start_worker()
            paths = model_sampler.obtain_samples(itr, return_dict=True)
            samples_data = model_sampler.process_samples(itr, paths[0])

            print(samples_data.keys())
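
Note: every example in this listing relies on the helper sample_random_trajectories_point_env, which is not shown here. Judging only from how it is called (num_paths and horizon arguments, a return value indexed by the keys 'observations', 'next_observations' and 'actions'), a minimal sketch could look like the following; the signature, the random action range, and the list-of-dicts return type are assumptions, not the original implementation.

import numpy as np

def sample_random_trajectories_point_env(env, num_paths=100, horizon=100):
    # Sketch only: roll out uniformly random actions and record (s, a, s') per path.
    paths = []
    for _ in range(num_paths):
        observations, next_observations, actions = [], [], []
        obs = env.reset()
        for _ in range(horizon):
            act = np.random.uniform(-0.1, 0.1, size=np.shape(obs))
            next_obs = env.step(act)[0]  # assumes a gym-style (obs, reward, done, info) return
            observations.append(obs)
            actions.append(act)
            next_observations.append(next_obs)
            obs = next_obs
        paths.append(dict(observations=np.asarray(observations),
                          next_observations=np.asarray(next_observations),
                          actions=np.asarray(actions)))
    return paths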
Example #2
    def test_policy_sampling(self):
        # get training data from randomly sampled trajectories
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=100, horizon=100)
        dynamics_model = MLPDynamicsEnsemble("dyn_model1", env, hidden_sizes=(16,16))

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        env = TfEnv(normalize(PointEnv()))

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(16, 16),
            hidden_nonlinearity=tf.nn.tanh
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        # fit dynamics model
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=5)

            algo = ModelMAMLTRPO(
                env=env,
                dynamics_model=dynamics_model,
                policy=policy,
                baseline=baseline,
                batch_size=20000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
            )

            algo.dynamics_model = dynamics_model

            itr = 1

            model_sampler = ModelVectorizedSampler(algo)
            model_sampler.start_worker()
            paths = model_sampler.obtain_samples(itr)
            samples_data = model_sampler.process_samples(itr, paths)

            print(samples_data.keys())
Example #3
    def test_training(self):
        env = TfEnv(normalize(PointEnv()))

        tf.set_random_seed(22)
        np.random.seed(22)

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(16, 16),
                                   hidden_nonlinearity=tf.nn.tanh)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        dynamics_model = MLPDynamicsModel("dyn_model",
                                          env,
                                          hidden_sizes=(16, 16))

        # the dynamics model is fit as part of algo.train()

        algo = ModelTRPO(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            baseline=baseline,
            batch_size_env_samples=5000,
            initial_random_samples=10000,
            batch_size_dynamics_samples=40000,
            max_path_length=100,
            dynamic_model_epochs=(30, 10),
            num_gradient_steps_per_iter=2,
            n_itr=20,
            discount=0.99,
            step_size=0.001,
        )
        algo.train()
Example #4
    def test_train_prediction2(self):
        # just checks if training and prediction runs without errors and prediction returns correct shapes

        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=500, horizon=100)
        dynamics_model = MLPDynamicsModel("dyn_model_2b", env, hidden_sizes=(32, 32), normalize_input=True)

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        obs_test = np.random.uniform(-2, 2, size=(20000, 2))
        act_test = np.random.uniform(-0.1, 0.1, size=(20000, 2))
        obs_next_test = obs_test + act_test

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=20, verbose=True)
            obs_next_pred = dynamics_model.predict(obs_test, act_test)

            mean_diff = np.mean(np.abs(obs_next_test - obs_next_pred))
            print("Mean Diff:", mean_diff)

            self.assertEqual(obs_next_pred.shape, obs_test.shape)
            self.assertLessEqual(mean_diff, 0.01)
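
Note: this test (and Example #11 below) builds its ground-truth targets as obs_next_test = obs_test + act_test, i.e. it relies on the PointEnv transition being simply s' = s + a. A minimal environment consistent with that assumption is sketched below; the reward (negative distance to the origin) and the termination rule are illustrative guesses, not taken from the source.

import numpy as np

class SimplePointEnv(object):
    # Sketch of a 2-D point environment whose dynamics match the tests: s' = s + a.
    # The reward and termination below are assumptions for illustration only.
    def reset(self):
        self._state = np.random.uniform(-2, 2, size=(2,))
        return self._state.copy()

    def step(self, action):
        self._state = self._state + np.asarray(action)
        reward = -np.linalg.norm(self._state)
        done = bool(np.linalg.norm(self._state) < 0.01)
        return self._state.copy(), reward, done, {}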
Example #5
    def test_train_prediction(self):
        # just checks if training and prediction runs without errors and prediction returns correct shapes
        env = PointEnv()
        np.random.seed(22)
        paths = sample_random_trajectories_point_env(env, num_paths=200, horizon=100)
        dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_2", env, hidden_sizes=(16, 16), num_models=5)

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        paths_test = sample_random_trajectories_point_env(env, num_paths=10, horizon=100)
        obs_test = np.concatenate([path['observations'] for path in paths_test], axis=0)
        obs_next_test = np.concatenate([path['next_observations'] for path in paths_test], axis=0)
        act_test = np.concatenate([path['actions'] for path in paths_test], axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=10)


            obs_pred1 = dynamics_model.predict(obs_test, act_test, pred_type='mean')
            diff1 = np.mean(np.abs(obs_pred1 - obs_next_test) ** 2)
            self.assertEqual(obs_pred1.shape, obs_test.shape)
            self.assertLess(diff1, 0.01)

            obs_pred2 = dynamics_model.predict(obs_test, act_test, pred_type='rand')
            diff2 = np.mean(np.abs(obs_pred2 - obs_next_test) ** 2)
            self.assertEqual(obs_pred2.shape, obs_test.shape)
            self.assertLess(diff2, 0.01)

            obs_pred3 = dynamics_model.predict(obs_test, act_test, pred_type='all')
            self.assertEqual(obs_pred3.shape, obs_test.shape + (5,))
Example #6
    def test_predict_model_batches3(self):
        np.random.seed(22)
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
        dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_6", env, hidden_sizes=(16, 16), num_models=2, output_bias_range=0.01, gaussian_noise_output_std=0.01)

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        obs_stacked = np.concatenate([obs, obs+0.2], axis=0)
        act_stacked = np.concatenate([act+0.1, act], axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            dynamics_model.fit(obs, act, obs_next, epochs=5)

            pred_obs = dynamics_model.predict_model_batches(obs_stacked, act_stacked)
            pred_obs_batches = np.split(pred_obs, 2, axis=0)
            for i in range(2):
                # reconstruct the inputs that went into batch i of the stacked arrays:
                # batch 0 used (obs, act + 0.1), batch 1 used (obs + 0.2, act)
                obs_i = obs + 0.2 * i
                act_i = act + 0.1 * (1 - i)
                pred_obs_single_batch = dynamics_model.predict(obs_i, act_i, pred_type='all')[:, :, i]
                diff = np.sum(np.abs(pred_obs_batches[i] - pred_obs_single_batch))
                print(diff)
                self.assertGreaterEqual(diff, 10.0)
Example #7
    def test_serialization(self):
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
        dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_1", env, hidden_sizes=(16, 16))

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=5)
            obs_pred = dynamics_model.predict(obs, act, pred_type='mean')

            dump_string = pickle.dumps(dynamics_model)

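        # The model is unpickled into a fresh default graph and a new session, so
        # the assertion below verifies that predictions are restored from the
        # pickled weights rather than from the original, still-live graph.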
        tf.reset_default_graph()
        with tf.Session() as sess:
            dynamics_model_loaded = pickle.loads(dump_string)
            # dynamics_model_loaded.fit(obs, act, obs_next, epochs=5)
            obs_pred_loaded = dynamics_model_loaded.predict(obs, act, pred_type='mean')

        diff = np.sum(np.abs(obs_pred_loaded - obs_pred))

        self.assertAlmostEqual(diff, 0, places=2)
Example #8
    def test_train_prediction_std(self):
        # just checks if std prediction returns correct shapes
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
        dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_3", env, hidden_sizes=(16, 16), num_models=5)

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=5)
            std = dynamics_model.predict_std(obs, act)
            self.assertEqual(std.shape, obs.shape)
Example #9
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='PointEnv')
    # Experiment meta-params
    parser.add_argument('--exp_name', type=str, default='mb_mpc')
    parser.add_argument('--seed', type=int, default=3)
    parser.add_argument('--render', action='store_true')
    # Training args
    parser.add_argument('--learning_rate', '-lr', type=float, default=1e-3)
    parser.add_argument('--onpol_iters', '-n', type=int, default=15)
    parser.add_argument('--dyn_iters', '-nd', type=int, default=60)
    parser.add_argument('--batch_size', '-b', type=int, default=512)
    # Data collection
    parser.add_argument('--random_paths', '-r', type=int, default=1000) #TODO change back to 10000
    parser.add_argument('--onpol_paths', '-d', type=int, default=10)
    parser.add_argument('--simulated_paths', '-sp', type=int, default=10)  #TODO change back to 1000
    parser.add_argument('--ep_len', '-ep', type=int, default=1000)
    # Neural network architecture args
    parser.add_argument('--n_layers', '-l', type=int, default=2)
    parser.add_argument('--size', '-s', type=int, default=500)
    # MPC Controller
    parser.add_argument('--mpc_horizon', '-m', type=int, default=15)
    args = parser.parse_args()

    # Set seed
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Make env
    if args.env_name == "PointEnv":
        env = PointEnv()
        reward_fn = reward_fn_point_env
    else:
        raise ValueError("Unknown env_name: %s" % args.env_name)

    train(env=env,
          reward_fn=reward_fn,
          render=args.render,
          learning_rate=args.learning_rate,
          onpol_iters=args.onpol_iters,
          dynamics_iters=args.dyn_iters,
          batch_size=args.batch_size,
          num_paths_random=args.random_paths,
          num_paths_onpol=args.onpol_paths,
          num_simulated_paths=args.simulated_paths,
          env_horizon=args.ep_len,
          mpc_horizon=args.mpc_horizon,
          )
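
Note: Example #9 passes reward_fn_point_env into train() without showing it. For a point environment whose goal is the origin, a batched reward of the following shape would be a plausible stand-in; both the signature and the reward itself are assumptions, not the function actually used by train().

import numpy as np

def reward_fn_point_env(observations, actions, next_observations):
    # Hypothetical batched reward: negative Euclidean distance of the next state
    # from the origin. The real reward function may differ.
    return -np.linalg.norm(next_observations, axis=-1)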
Example #10
    def test_predict_model_batches(self):
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
        dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_3", env, hidden_sizes=(16, 16), num_models=1)

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)


        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            dynamics_model.fit(obs, act, obs_next, epochs=5)

            pred_obs = dynamics_model.predict_model_batches(obs, act)
            pred_obs_single = dynamics_model.predict(obs, act, pred_type='all')[:, :, 0]
            diff = np.sum(np.abs(pred_obs - pred_obs_single))
            print(diff)
            self.assertAlmostEqual(diff, 0)
Example #11
    def test_train_prediction1(self):
        env = PointEnv()
        obs = np.random.uniform(-2, 2, size=(20000, 2))
        act = np.random.uniform(-0.1, 0.1, size=(20000, 2))
        next_obs = obs + act

        dynamics_model = MLPDynamicsModel("dyn_model_2a", env, hidden_sizes=(32, 32), normalize_input=False)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, next_obs, epochs=10, verbose=True)

            obs_test = np.random.uniform(-2, 2, size=(20000, 2))
            act_test = np.random.uniform(-0.1, 0.1, size=(20000, 2))
            obs_next_test = obs_test + act_test

            obs_next_pred = dynamics_model.predict(obs_test, act_test)
            mean_diff = np.mean(np.abs(obs_next_test - obs_next_pred))
            print("Mean Diff:", mean_diff)

            self.assertEqual(obs_next_pred.shape, obs_test.shape)
            self.assertLessEqual(mean_diff, 0.01)
Example #12
    def test_predict_model_batches2(self):
        np.random.seed(22)
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=10, horizon=10)
        dynamics_model = BadDynamicsEnsemble("bad_dyn_ensemble_5", env, hidden_sizes=(16, 16), num_models=2, output_bias_range=0.0, gaussian_noise_output_std=0.0)

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            dynamics_model.fit(obs, act, obs_next, epochs=5)

            pred_obs = dynamics_model.predict_model_batches(obs, act)
            pred_obs_batches = np.split(pred_obs, 2, axis=0)

            # 100 samples total (10 paths x horizon 10), split evenly across the 2 models
            batch_size_per_model = obs.shape[0] // 2
            for i in range(2):
                obs_batch = obs[i * batch_size_per_model:(i + 1) * batch_size_per_model]
                act_batch = act[i * batch_size_per_model:(i + 1) * batch_size_per_model]
                pred_obs_single_batch = dynamics_model.predict(obs_batch, act_batch, pred_type='all')[:, :, i]
                diff = np.sum(np.abs(pred_obs_batches[i] - pred_obs_single_batch))
                print(diff)
                self.assertAlmostEqual(diff, 0)
Example #13
    def test_train_prediction_performance(self):
        # just checks if training and prediction runs without errors and prediction returns correct shapes
        env = PointEnv()
        paths = sample_random_trajectories_point_env(env, num_paths=500, horizon=500)
        dynamics_model = MLPDynamicsModel("dyn_model_3", env, hidden_sizes=(16, 16))

        obs = np.concatenate([path['observations'] for path in paths], axis=0)
        obs_next = np.concatenate([path['next_observations'] for path in paths], axis=0)
        act = np.concatenate([path['actions'] for path in paths], axis=0)

        paths_test = sample_random_trajectories_point_env(env, num_paths=10, horizon=100)
        obs_test = np.concatenate([path['observations'] for path in paths_test], axis=0)
        obs_next_test = np.concatenate([path['next_observations'] for path in paths_test], axis=0)
        act_test = np.concatenate([path['actions'] for path in paths_test], axis=0)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            dynamics_model.fit(obs, act, obs_next, epochs=20)

            next_obs_pred = dynamics_model.predict(obs_test, act_test)
            diff = np.mean(np.abs(next_obs_pred-obs_next_test)**2)
            print("DIFF:", diff)
            self.assertLess(diff, 0.05)
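
Note: the snippets above are methods of unittest.TestCase subclasses (the class names are not shown), as the self.assertEqual / self.assertLess calls indicate. Assuming they are collected into a single test module, the standard runner is enough to execute them:

import unittest

# Runs all TestCase subclasses defined in this module.
if __name__ == '__main__':
    unittest.main()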