def generate_trajectories(args):
    # environments are not one hot coded, so we don't wrap this
    env_modifiers = environments.env_mapping[args.env]
    # if args.expert_type == 'irl':
    #     env_modifiers = environments.one_hot_wrap_modifiers(env_modifiers)
    utils.logger.configure()
    with utils.TfContext(ncpu=args.n_cpu):
        with utils.EnvironmentContext(
                env_name=args.env,
                n_envs=args.num_envs,
                seed=args.seed,
                **env_modifiers) as context:
            if args.expert_type == 'baselines_ppo':
                policy = policies.EnvPolicy.load(args.expert_path,
                                                 context.environments)
                model = policy.model
                envs = policy.envs
            elif args.expert_type == 'irl':
                policy_cfg = irl.policy_config(init_location=args.expert_path)
                policy_cfg['batching_config'] = training.make_batching_config(
                    nenvs=args.num_envs,
                    nsteps=128,
                    noptepochs=4,
                    nminibatches=4)
                irl_policy = irl.make_irl_policy(
                    policy_cfg,
                    wrapped_venv=irl.rllab_wrap_venv(context.environments),
                    baselines_venv=context.environments)
                sess = tf.get_default_session()
                sess.run(tf.local_variables_initializer())
                sess.run(tf.global_variables_initializer())
                irl_policy.restore_from_snapshot(
                    joblib.load(open(args.expert_path, 'rb'))['policy_params'])
                model = irl_policy.model
                envs = context.environments
            elif args.expert_type == 'clone':
                model = behavioral_cloning.Cloner.load(args.expert_path)
                envs = context.environments
            elif args.expert_type == 'random':
                envs = context.environments
                model = policies.RandomPolicy(envs.action_space)
            else:
                raise NotImplementedError
            ts = policies.sample_trajectories(
                model=model,
                environments=envs,
                n_trajectories=args.num_trajectories,
                one_hot_code=args.one_hot_code,
                render=args.render)
            p = os.path.join(args.expert_path, args.trajectories_file)
            pickle.dump(ts, open(p, 'wb'))
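# Illustrative sketch only (not the repo's actual CLI): a minimal argparse
# setup covering the attributes generate_trajectories reads above. The flag
# names mirror the attribute names; every default value here is an assumption.
import argparse

def _example_trajectory_args(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='PongNoFrameskip-v4')
    parser.add_argument('--n_cpu', type=int, default=8)
    parser.add_argument('--num_envs', type=int, default=8)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--expert_type', default='baselines_ppo',
                        choices=['baselines_ppo', 'irl', 'clone', 'random'])
    parser.add_argument('--expert_path', default='experts/pong')
    parser.add_argument('--num_trajectories', type=int, default=10)
    parser.add_argument('--one_hot_code', action='store_true')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--trajectories_file', default='trajectories.pkl')
    return parser.parse_args(argv)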
def env_context_for_args(args):
    env_modifiers = environments.env_mapping[args.env]
    if args.one_hot_code:
        env_modifiers = environments.one_hot_wrap_modifiers(env_modifiers)
    return utils.EnvironmentContext(
        env_name=args.env,
        n_envs=args.num_envs,
        seed=args.seed,
        **env_modifiers)
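# Hedged usage sketch: env_context_for_args just builds the context manager,
# so callers can enter it directly. The `args` and `model` arguments here are
# hypothetical; any namespace with .env, .one_hot_code, .num_envs and .seed
# attributes would work.
def _example_sample_with_context(args, model):
    with env_context_for_args(args) as context:
        # Sample a handful of trajectories inside the managed environments.
        return policies.sample_trajectories(
            model=model,
            environments=context.environments,
            n_trajectories=5,
            one_hot_code=args.one_hot_code,
            render=False)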
def test_vectorized_sampler_processing_to_ppo_results(self):
    with utils.EnvironmentContext(
            env_name=self.env, n_envs=1, seed=0,
            **self.env_modifiers) as env_context:
        with irl.IRLContext(self.config, env_config={
                'seed': 0,
                'env_name': 'PongNoFrameskip-v4',
                'one_hot_code': True
        }):
            training_kwargs, _, _, _ = irl.get_training_kwargs(
                venv=env_context.environments,
                reward_model_cfg={
                    'expert_trajs': pickle.load(
                        open('scripts/short_trajectories.pkl', 'rb')),
                })
            training_kwargs['batch_size'] = 50
            print("Training arguments: ", training_kwargs)
            env_context.environments.reset()
            algo = irl.IRLRunner(**training_kwargs)
            algo.start_worker()
            vectorized_samples = algo.obtain_samples(0)

            # Check some basic things about the vectorized samples:
            # there should be exactly one path ...
            assert len(vectorized_samples) == 1
            assert_trajectory_formatted(vectorized_samples)
            # ... and it shouldn't be extremely short.
            assert len(vectorized_samples[0]['actions']) > 100

            sampler = sampling.PPOBatchSampler(
                model=algo.policy.learner.model,
                env=env_context.environments,
                nsteps=128 * env_context.environments.num_envs)

            # The two batches differ because the policy is
            # non-deterministic. This test only checks that the shapes
            # match; verifying the return calculation as well would
            # require a deterministic setup.
            ppo_processed = sampler.process_trajectory(
                vectorized_samples[0], gamma=0.99, lam=0.95).train_args()
            ppo_generated = sampler.process_to_ppo_batch(
                sampler.run(), gamma=0.99, lam=0.95).train_args()

            assert len(ppo_processed) == len(ppo_generated)
            # Compare the entries that come before the states and
            # episode infos.
            for i in range(len(ppo_processed)):
                assert ppo_processed[i][:128].shape == ppo_generated[i].shape
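# Reference only: the standard GAE(lambda) recursion that a baselines-style
# process_trajectory is expected to apply with gamma=0.99, lam=0.95. This is
# a self-contained numpy sketch for a single finished episode, not the repo's
# implementation.
import numpy as np

def _reference_gae_returns(rewards, values, gamma=0.99, lam=0.95):
    """rewards, values: 1-D arrays of equal length for one finished episode."""
    horizon = len(rewards)
    advantages = np.zeros(horizon, dtype=np.float64)
    lastgaelam = 0.0
    for t in reversed(range(horizon)):
        # The bootstrap value is zero past the end of a finished episode.
        next_value = values[t + 1] if t + 1 < horizon else 0.0
        delta = rewards[t] + gamma * next_value - values[t]
        lastgaelam = delta + gamma * lam * lastgaelam
        advantages[t] = lastgaelam
    return advantages + values  # returns = advantages + value baseline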
def test_sample_shape(self):
    def check_base_policy_sampler(algo, env_context):
        print("Checking straightforward policy trajectory sampler")
        policy_samples = policies.sample_trajectories(
            model=algo.policy.learner.model,
            environments=env_context.environments,
            one_hot_code=True,
            n_trajectories=10,
            render=False)
        assert len(policy_samples) == 10
        assert_trajectory_formatted(policy_samples)

    def check_irl_discriminator_sampler(algo, env_context):
        print("Checking discriminator sampler")
        # env_context.environments.reset()
        algo.start_worker()
        irl_discriminator_samples = algo.obtain_samples(0)
        assert_trajectory_formatted(irl_discriminator_samples)

    with utils.EnvironmentContext(
            env_name=self.env, n_envs=8, seed=0,
            **self.env_modifiers) as env_context:
        with irl.IRLContext(self.config, env_config={
                'seed': 0,
                'env_name': 'PongNoFrameskip-v4',
                'one_hot_code': True
        }):
            training_kwargs, _, _, _ = irl.get_training_kwargs(
                venv=env_context.environments,
                reward_model_cfg={
                    'expert_trajs': pickle.load(
                        open('scripts/short_trajectories.pkl', 'rb')),
                })
            print("Training arguments: ", training_kwargs)
            algo = irl.IRLRunner(**training_kwargs)

            check_base_policy_sampler(algo, env_context)
            check_irl_discriminator_sampler(algo, env_context)
def test_ppo_sampling_raveling(self):
    with utils.EnvironmentContext(
            env_name=self.env, n_envs=8, seed=0,
            **self.env_modifiers) as env_context:
        with irl.IRLContext(self.config, env_config={
                'seed': 0,
                'env_name': 'PongNoFrameskip-v4',
                'one_hot_code': True
        }):
            training_kwargs, _, _, _ = irl.get_training_kwargs(
                venv=env_context.environments,
                reward_model_cfg={
                    'expert_trajs': pickle.load(
                        open('scripts/short_trajectories.pkl', 'rb')),
                })
            training_kwargs['batch_size'] = 50
            print("Training arguments: ", training_kwargs)
            env_context.environments.reset()
            algo = irl.IRLRunner(**training_kwargs)

            ppo_sample = algo.policy.learner.runner.sample()
            train_batch_raveled_obs = \
                ppo_sample._ravel_time_env_batch_to_train_batch(ppo_sample.obs)

            # Check that the second chunk of the first batch is the same
            # as the second environment in the PPO sample. This shows
            # that we stacked the environments correctly.
            assert np.isclose(
                train_batch_raveled_obs[0][ppo_sample.obs.shape[0]:],
                ppo_sample.obs[:, 1]).all()

            # Check that the roundtrip works, as a sanity check.
            assert np.isclose(
                ppo_sample.obs,
                ppo_sample._ravel_train_batch_to_time_env_batch(
                    train_batch_raveled_obs)).all()
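# Hedged illustration of the layout the raveling test asserts: a time-major
# batch of shape (nsteps, nenvs, ...) is stacked environment by environment,
# so elements [nsteps:2*nsteps] of the raveled batch come from environment 1.
# This standalone numpy sketch mimics that behaviour; it is not the repo's
# _ravel_time_env_batch_to_train_batch.
import numpy as np

def _demo_time_env_raveling(nsteps=128, nenvs=8, obs_dim=4):
    obs = np.random.rand(nsteps, nenvs, obs_dim)  # time-major, as sampled
    # Environment-major raveling: env 0's steps first, then env 1's, ...
    train_batch = obs.swapaxes(0, 1).reshape(nsteps * nenvs, obs_dim)
    assert np.array_equal(train_batch[nsteps:2 * nsteps], obs[:, 1])
    # The roundtrip back to (nsteps, nenvs, ...) recovers the original.
    roundtrip = train_batch.reshape(nenvs, nsteps, obs_dim).swapaxes(0, 1)
    assert np.array_equal(roundtrip, obs)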
def test_ppo_sampling_roundtrips(self):
    with utils.EnvironmentContext(
            env_name=self.env, n_envs=8, seed=0,
            **self.env_modifiers) as env_context:
        with irl.IRLContext(self.config, env_config={
                'seed': 0,
                'env_name': 'PongNoFrameskip-v4',
                'one_hot_code': True
        }):
            training_kwargs, _, _, _ = irl.get_training_kwargs(
                venv=env_context.environments,
                reward_model_cfg={
                    'expert_trajs': pickle.load(
                        open('scripts/short_trajectories.pkl', 'rb')),
                })
            training_kwargs['batch_size'] = 50
            print("Training arguments: ", training_kwargs)
            env_context.environments.reset()
            algo = irl.IRLRunner(**training_kwargs)

            ppo_sample = algo.policy.learner.runner.sample()
            trajectories = ppo_sample.to_trajectories()
            assert_trajectory_formatted(trajectories.trajectories)

            roundtrip_sample = trajectories.to_ppo_sample()
            assert (ppo_sample.obs == roundtrip_sample.obs).all()
            assert (ppo_sample.rewards == roundtrip_sample.rewards).all()
            assert (ppo_sample.actions == roundtrip_sample.actions).all()
            assert (ppo_sample.values == roundtrip_sample.values).all()
            assert (ppo_sample.dones == roundtrip_sample.dones).all()
            assert (ppo_sample.neglogpacs == roundtrip_sample.neglogpacs).all()
            assert ppo_sample.states == roundtrip_sample.states
            assert ppo_sample.epinfos == roundtrip_sample.epinfos
            assert ppo_sample.sampler == roundtrip_sample.sampler
def test_ppo_sampling_probs_calculation(self):
    with utils.EnvironmentContext(
            env_name=self.env, n_envs=8, seed=0,
            **self.env_modifiers) as env_context:
        with irl.IRLContext(self.config, env_config={
                'seed': 0,
                'env_name': 'PongNoFrameskip-v4',
                'one_hot_code': True
        }):
            training_kwargs, _, _, _ = irl.get_training_kwargs(
                venv=env_context.environments,
                reward_model_cfg={
                    'expert_trajs': pickle.load(
                        open('scripts/short_trajectories.pkl', 'rb')),
                })
            training_kwargs['batch_size'] = 50
            print("Training arguments: ", training_kwargs)
            env_context.environments.reset()
            algo = irl.IRLRunner(**training_kwargs)

            ppo_sample = algo.policy.learner.runner.sample()

            # Check that the probabilities are valid distributions,
            # i.e. each action distribution sums to one.
            sums = ppo_sample.probabilities.sum(axis=2)
            assert np.isclose(sums, np.ones(sums.shape)).all()

            # The probabilities should be consistent with the neglogpacs.
            one_hot_actions = utils.one_hot(
                ppo_sample.actions.reshape(128 * 8), 6).reshape(128, 8, 6)
            neglogpacs = -1 * np.log(
                (ppo_sample.probabilities * one_hot_actions).sum(axis=2))
            assert np.isclose(neglogpacs, ppo_sample.neglogpacs).all()
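# Hedged numpy sketch of the consistency check above: for a categorical
# policy, the negative log-probability of the chosen action equals -log of
# the corresponding entry of the probability tensor. utils.one_hot is
# replaced by np.eye here; the shapes (128, 8, 6) match the Pong setup but
# are otherwise arbitrary.
import numpy as np

def _demo_neglogpac_consistency(nsteps=128, nenvs=8, nactions=6):
    raw = np.random.rand(nsteps, nenvs, nactions)
    probabilities = raw / raw.sum(axis=2, keepdims=True)  # rows sum to one
    actions = np.random.randint(nactions, size=(nsteps, nenvs))
    one_hot_actions = np.eye(nactions)[actions]            # (T, E, A)
    neglogpacs = -np.log((probabilities * one_hot_actions).sum(axis=2))
    # Equivalent direct gather of the chosen-action probabilities.
    gathered = np.take_along_axis(
        probabilities, actions[..., None], axis=2)[..., 0]
    assert np.isclose(neglogpacs, -np.log(gathered)).all()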