Example #1
def generate_trajectories(args):
    # The observations are not one-hot coded here, so the modifiers are not
    # wrapped with environments.one_hot_wrap_modifiers (not even for the
    # 'irl' expert type).
    env_modifiers = environments.env_mapping[args.env]

    utils.logger.configure()
    with utils.TfContext(ncpu=args.n_cpu):
        with utils.EnvironmentContext(env_name=args.env,
                                      n_envs=args.num_envs,
                                      seed=args.seed,
                                      **env_modifiers) as context:
            if args.expert_type == 'baselines_ppo':
                policy = policies.EnvPolicy.load(args.expert_path,
                                                 context.environments)
                model = policy.model
                envs = policy.envs
            elif args.expert_type == 'irl':
                policy_cfg = irl.policy_config(init_location=args.expert_path)

                policy_cfg['batching_config'] = training.make_batching_config(
                    nenvs=args.num_envs,
                    nsteps=128,
                    noptepochs=4,
                    nminibatches=4)
                irl_policy = irl.make_irl_policy(
                    policy_cfg,
                    wrapped_venv=irl.rllab_wrap_venv(context.environments),
                    baselines_venv=context.environments)
                sess = tf.get_default_session()
                sess.run(tf.local_variables_initializer())
                sess.run(tf.global_variables_initializer())
                irl_policy.restore_from_snapshot(
                    joblib.load(open(args.expert_path, 'rb'))['policy_params'])

                model = irl_policy.model
                envs = context.environments
            elif args.expert_type == 'clone':
                model = behavioral_cloning.Cloner.load(args.expert_path)
                envs = context.environments
            elif args.expert_type == 'random':
                envs = context.environments
                model = policies.RandomPolicy(envs.action_space)
            else:
                raise NotImplementedError

            ts = policies.sample_trajectories(
                model=model,
                environments=envs,
                n_trajectories=args.num_trajectories,
                one_hot_code=args.one_hot_code,
                render=args.render)

    p = os.path.join(args.expert_path, args.trajectories_file)
    with open(p, 'wb') as f:
        pickle.dump(ts, f)
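For completeness, a minimal sketch of reading the dumped trajectories back. The paths here are hypothetical stand-ins for args.expert_path and args.trajectories_file; the per-trajectory 'actions' field is the one the tests below index.

import os
import pickle

# Hypothetical paths mirroring args.expert_path and args.trajectories_file.
trajectory_path = os.path.join('experts/pong', 'trajectories.pkl')
with open(trajectory_path, 'rb') as f:
    trajectories = pickle.load(f)

# One dict per trajectory; the tests below read fields such as 'actions'.
print(len(trajectories), len(trajectories[0]['actions']))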
Example #2
def env_context_for_args(args):
    env_modifiers = environments.env_mapping[args.env]
    if args.one_hot_code:
        env_modifiers = environments.one_hot_wrap_modifiers(env_modifiers)

    return utils.EnvironmentContext(
        env_name=args.env,
        n_envs=args.num_envs,
        seed=args.seed,
        **env_modifiers
    )
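A minimal usage sketch for this helper. The args fields are hypothetical stand-ins for the script's real argument parser; context.environments is the vectorized environment that the samplers in these examples consume.

import argparse

# Hypothetical argument values; the real ones come from the CLI parser.
args = argparse.Namespace(env='PongNoFrameskip-v4', one_hot_code=True,
                          num_envs=8, seed=0)

with env_context_for_args(args) as context:
    # context.environments is the vectorized environment managed by the context.
    obs = context.environments.reset()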
Example #3
    def test_vectorized_sampler_processing_to_ppo_results(self):
        with utils.EnvironmentContext(env_name=self.env,
                                      n_envs=1,
                                      seed=0,
                                      **self.env_modifiers) as env_context:
            with irl.IRLContext(self.config,
                                env_config={
                                    'seed': 0,
                                    'env_name': 'PongNoFrameskip-v4',
                                    'one_hot_code': True
                                }):
                training_kwargs, _, _, _ = irl.get_training_kwargs(
                    venv=env_context.environments,
                    reward_model_cfg={
                        'expert_trajs':
                        pickle.load(
                            open('scripts/short_trajectories.pkl', 'rb')),
                    })
                training_kwargs['batch_size'] = 50
                print("Training arguments: ", training_kwargs)

                env_context.environments.reset()
                algo = irl.IRLRunner(**training_kwargs)

                algo.start_worker()
                vectorized_samples = algo.obtain_samples(0)

                # check some basic things about the vectorized samples
                # We should only have one path
                assert len(vectorized_samples) == 1
                assert_trajectory_formatted(vectorized_samples)
                # It shouldn't be super short
                assert len(vectorized_samples[0]['actions']) > 100

                sampler = sampling.PPOBatchSampler(
                    model=algo.policy.learner.model,
                    env=env_context.environments,
                    nsteps=128 * env_context.environments.num_envs)

                # These differ substantially because the policy is stochastic,
                # so this test only checks that the shapes match; verifying the
                # return calculation itself would require a more deterministic
                # setup.
                ppo_processed = sampler.process_trajectory(
                    vectorized_samples[0], gamma=0.99, lam=0.95).train_args()
                ppo_generated = sampler.process_to_ppo_batch(
                    sampler.run(), gamma=0.99, lam=0.95).train_args()

                assert len(ppo_processed) == len(ppo_generated)
                # These are the array-valued arguments that come before the
                # states and episode infos.
                for i in range(len(ppo_processed)):
                    assert ppo_processed[i][:128].shape == ppo_generated[
                        i].shape
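The gamma/lam pair above parameterizes the usual GAE(lambda) advantage and return computation performed by a baselines-style PPO sampler. Below is a compact reference version, a sketch under my own done-flag convention and naming rather than the repo's process_trajectory implementation.

import numpy as np

def gae_advantages_and_returns(rewards, values, dones, last_value,
                               gamma=0.99, lam=0.95):
    # Convention here: dones[t] == 1.0 means the transition at step t ended
    # the episode (baselines' runner indexes its done flags slightly
    # differently).
    nsteps = len(rewards)
    advantages = np.zeros(nsteps)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        next_value = last_value if t == nsteps - 1 else values[t + 1]
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        advantages[t] = lastgaelam
    # PPO trains on returns = advantages + value baseline.
    return advantages, advantages + values

adv, ret = gae_advantages_and_returns(rewards=np.array([1.0, 0.0, 1.0]),
                                      values=np.array([0.5, 0.4, 0.6]),
                                      dones=np.array([0.0, 0.0, 1.0]),
                                      last_value=0.0)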
Example #4
    def test_sample_shape(self):
        def check_base_policy_sampler(algo, env_context):
            print("Checking straightforward policy trajectory sampler")
            policy_samples = policies.sample_trajectories(
                model=algo.policy.learner.model,
                environments=env_context.environments,
                one_hot_code=True,
                n_trajectories=10,
                render=False)
            assert len(policy_samples) == 10
            assert_trajectory_formatted(policy_samples)

        def check_irl_discriminator_sampler(algo, env_context):
            print("Checking discriminator sampler")
            #env_context.environments.reset()
            algo.start_worker()
            irl_discriminator_samples = algo.obtain_samples(0)
            assert_trajectory_formatted(irl_discriminator_samples)

        with utils.EnvironmentContext(env_name=self.env,
                                      n_envs=8,
                                      seed=0,
                                      **self.env_modifiers) as env_context:
            with irl.IRLContext(self.config,
                                env_config={
                                    'seed': 0,
                                    'env_name': 'PongNoFrameskip-v4',
                                    'one_hot_code': True
                                }):
                training_kwargs, _, _, _ = irl.get_training_kwargs(
                    venv=env_context.environments,
                    reward_model_cfg={
                        'expert_trajs':
                        pickle.load(
                            open('scripts/short_trajectories.pkl', 'rb')),
                    })
                print("Training arguments: ", training_kwargs)
                algo = irl.IRLRunner(**training_kwargs)
                check_base_policy_sampler(algo, env_context)
                check_irl_discriminator_sampler(algo, env_context)
Example #5
    def test_ppo_sampling_raveling(self):
        with utils.EnvironmentContext(env_name=self.env,
                                      n_envs=8,
                                      seed=0,
                                      **self.env_modifiers) as env_context:
            with irl.IRLContext(self.config,
                                env_config={
                                    'seed': 0,
                                    'env_name': 'PongNoFrameskip-v4',
                                    'one_hot_code': True
                                }):
                training_kwargs, _, _, _ = irl.get_training_kwargs(
                    venv=env_context.environments,
                    reward_model_cfg={
                        'expert_trajs':
                        pickle.load(
                            open('scripts/short_trajectories.pkl', 'rb')),
                    })
                training_kwargs['batch_size'] = 50
                print("Training arguments: ", training_kwargs)

                env_context.environments.reset()
                algo = irl.IRLRunner(**training_kwargs)

                ppo_sample = algo.policy.learner.runner.sample()

                train_batch_raveled_obs = ppo_sample._ravel_time_env_batch_to_train_batch(
                    ppo_sample.obs)
                # Check that the chunk starting at index nsteps in the first
                # train batch matches the second environment's observations in
                # the PPO sample; this shows the environments were stacked in
                # the right order.
                assert np.isclose(
                    train_batch_raveled_obs[0][ppo_sample.obs.shape[0]:],
                    ppo_sample.obs[:, 1]).all()

                # Sanity check: raveling round-trips back to the original
                # (time, env) layout.
                assert np.isclose(
                    ppo_sample.obs,
                    ppo_sample._ravel_train_batch_to_time_env_batch(
                        train_batch_raveled_obs)).all()
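A toy numpy sketch of the layout that assertion relies on (my reading of the test, not the sampler's actual code): an env-major ravel puts each environment's full time series into one contiguous chunk of nsteps rows, so the chunk starting at index nsteps corresponds to environment 1, and the transformation round-trips.

import numpy as np

nsteps, nenvs = 4, 3  # toy sizes standing in for 128 and 8
obs = np.arange(nsteps * nenvs).reshape(nsteps, nenvs)  # obs[t, e]

# Env-major ravel: transpose to (env, time), then flatten.
raveled = obs.swapaxes(0, 1).reshape(nsteps * nenvs)

# The second nsteps-long chunk is exactly environment 1's time series.
assert (raveled[nsteps:2 * nsteps] == obs[:, 1]).all()

# The roundtrip recovers the original (time, env) layout.
assert (raveled.reshape(nenvs, nsteps).swapaxes(0, 1) == obs).all()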
Example #6
    def test_ppo_sampling_roundtrips(self):
        with utils.EnvironmentContext(env_name=self.env,
                                      n_envs=8,
                                      seed=0,
                                      **self.env_modifiers) as env_context:
            with irl.IRLContext(self.config,
                                env_config={
                                    'seed': 0,
                                    'env_name': 'PongNoFrameskip-v4',
                                    'one_hot_code': True
                                }):
                training_kwargs, _, _, _ = irl.get_training_kwargs(
                    venv=env_context.environments,
                    reward_model_cfg={
                        'expert_trajs':
                        pickle.load(
                            open('scripts/short_trajectories.pkl', 'rb')),
                    })
                training_kwargs['batch_size'] = 50
                print("Training arguments: ", training_kwargs)

                env_context.environments.reset()
                algo = irl.IRLRunner(**training_kwargs)

                ppo_sample = algo.policy.learner.runner.sample()
                trajectories = ppo_sample.to_trajectories()
                assert_trajectory_formatted(trajectories.trajectories)
                roundtrip_sample = trajectories.to_ppo_sample()

                assert (ppo_sample.obs == roundtrip_sample.obs).all()
                assert (ppo_sample.rewards == roundtrip_sample.rewards).all()
                assert (ppo_sample.actions == roundtrip_sample.actions).all()
                assert (ppo_sample.values == roundtrip_sample.values).all()
                assert (ppo_sample.dones == roundtrip_sample.dones).all()
                assert (ppo_sample.neglogpacs == roundtrip_sample.neglogpacs
                        ).all()
                assert ppo_sample.states == roundtrip_sample.states
                assert ppo_sample.epinfos == roundtrip_sample.epinfos
                assert ppo_sample.sampler == roundtrip_sample.sampler
Example #7
    def test_ppo_sampling_probs_calculation(self):
        with utils.EnvironmentContext(env_name=self.env,
                                      n_envs=8,
                                      seed=0,
                                      **self.env_modifiers) as env_context:
            with irl.IRLContext(self.config,
                                env_config={
                                    'seed': 0,
                                    'env_name': 'PongNoFrameskip-v4',
                                    'one_hot_code': True
                                }):
                training_kwargs, _, _, _ = irl.get_training_kwargs(
                    venv=env_context.environments,
                    reward_model_cfg={
                        'expert_trajs':
                        pickle.load(
                            open('scripts/short_trajectories.pkl', 'rb')),
                    })
                training_kwargs['batch_size'] = 50
                print("Training arguments: ", training_kwargs)

                env_context.environments.reset()
                algo = irl.IRLRunner(**training_kwargs)

                ppo_sample = algo.policy.learner.runner.sample()

                # Check that each per-step action distribution is a valid
                # probability distribution, i.e. that it sums to one.
                sums = ppo_sample.probabilities.sum(axis=2)
                assert np.isclose(sums, np.ones(sums.shape)).all()

                # Check that the probabilities are consistent with the
                # neglogpacs: one-hot the sampled actions (nsteps=128,
                # num_envs=8, 6 Pong actions), mask the probabilities, and
                # take the negative log.
                one_hot_actions = utils.one_hot(
                    ppo_sample.actions.reshape(128 * 8), 6).reshape(128, 8, 6)
                neglogpacs = -1 * np.log(
                    (ppo_sample.probabilities * one_hot_actions).sum(axis=2))
                assert np.isclose(neglogpacs, ppo_sample.neglogpacs).all()
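The final assertion is just the identity neglogpac = -log p(chosen action), recovered by masking the per-action probabilities with one-hot actions. A self-contained toy version of the same identity with shrunken shapes (everything here is a stand-in; only utils.one_hot in the test above comes from the repo):

import numpy as np

nsteps, nenvs, nactions = 2, 3, 4  # toy stand-ins for 128, 8 and 6
rng = np.random.default_rng(0)

# Random categorical distributions over actions, normalized on the last axis.
logits = rng.normal(size=(nsteps, nenvs, nactions))
probabilities = np.exp(logits) / np.exp(logits).sum(axis=2, keepdims=True)
actions = rng.integers(nactions, size=(nsteps, nenvs))

# Mask the chosen action's probability, then take the negative log.
one_hot_actions = np.eye(nactions)[actions]  # (nsteps, nenvs, nactions)
neglogpacs = -np.log((probabilities * one_hot_actions).sum(axis=2))

# The same quantity by direct indexing of the chosen action's probability.
direct = -np.log(np.take_along_axis(
    probabilities, actions[..., None], axis=2).squeeze(2))
assert np.isclose(neglogpacs, direct).all()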