def test_in_local_sampler(policy, envs):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, policy, [envs])
    n_samples = 100

    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)

    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    # Test start_rollout optimization
    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)

    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
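# The VecWorker tests in this file depend on module-level constants (N_TRAJ,
# MAX_PATH_LENGTH) and an `assert_trajs_eq` helper that are defined elsewhere
# in the test module. A minimal sketch of the helper, under the assumption
# that "equal" means both samplers produced the same flattened transitions:

import numpy as np


def assert_trajs_eq(true_trajs, vec_trajs):
    """Check that two TrajectoryBatch objects contain the same transitions."""
    assert (np.asarray(true_trajs.lengths) == np.asarray(
        vec_trajs.lengths)).all()
    assert np.allclose(true_trajs.observations, vec_trajs.observations)
    assert np.allclose(true_trajs.actions, vec_trajs.actions)
    assert np.allclose(true_trajs.rewards, vec_trajs.rewards)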
def test_onehots_consistent_with_task_sampler():
    # Import and construct environments here to avoid using up too many
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld
    mt10 = metaworld.MT10()
    env = MetaWorldSetTaskEnv(mt10, 'train', add_env_onehot=True)
    policy = RandomPolicy(env.action_space)
    workers = WorkerFactory(seed=100, max_episode_length=1, n_workers=10)
    sampler1 = LocalSampler.from_worker_factory(workers, policy, env)
    env_ups = [
        SetTaskUpdate(MetaWorldSetTaskEnv, task, None)
        for task in env.sample_tasks(10)
    ]
    samples1 = sampler1.obtain_exact_episodes(1, policy, env_ups)
    task_sampler = MetaWorldTaskSampler(mt10, 'train', add_env_onehot=True)
    env_ups = task_sampler.sample(10)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env_ups)
    samples2 = sampler2.obtain_exact_episodes(1, policy, env_ups)
    name_to_obs1 = {}
    for obs1, name1 in zip(samples1.observations,
                           samples1.env_infos['task_name']):
        name_to_obs1[name1] = obs1
    for obs2, name2 in zip(samples2.observations,
                           samples2.env_infos['task_name']):
        assert (name_to_obs1[name2][-10:] == obs2[-10:]).all()
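# `RandomPolicy` above is a small test helper rather than a library class. A
# plausible sketch (an assumption about its behavior): it ignores the
# observation and samples uniformly from the action space, which is all the
# one-hot consistency check needs.


class RandomPolicy:
    """Policy that samples random actions from a given action space."""

    def __init__(self, action_space):
        self._action_space = action_space

    def reset(self, do_resets=None):
        """No internal state to reset."""

    def get_action(self, observation):
        """Return a random action and empty agent info."""
        del observation
        return self._action_space.sample(), {}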
def test_in_local_sampler(policy, envs, other_envs, timesteps_per_call):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_EPS,
                                 max_episode_length=MAX_EPISODE_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    worker_args = dict(n_envs=N_EPS, timesteps_per_call=timesteps_per_call)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=FragmentWorker,
                                worker_args=worker_args,
                                max_episode_length=MAX_EPISODE_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, policy, [envs])
    n_samples = 400

    true_eps = true_sampler.obtain_samples(0, n_samples, None)
    sliced_true_eps = slice_episodes(true_eps, timesteps_per_call)
    vec_eps = vec_sampler.obtain_samples(0, 50, None)
    for test_eps in vec_eps.split():
        assert any(eps_eq(true_eps, test_eps) for true_eps in sliced_true_eps)

    true_eps = true_sampler.obtain_samples(0,
                                           n_samples,
                                           None,
                                           env_update=other_envs)
    sliced_true_eps = slice_episodes(true_eps, timesteps_per_call)
    vec_eps = vec_sampler.obtain_samples(0, 50, None, env_update=[other_envs])
    for test_eps in vec_eps.split():
        assert any(eps_eq(true_eps, test_eps) for true_eps in sliced_true_eps)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
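# The FragmentWorker test above compares each vectorized fragment against
# slices of ordinary episodes via `eps_eq` and `slice_episodes`, both defined
# elsewhere in the test module. A minimal sketch of `eps_eq`, assuming a
# fragment matches a slice when their steps agree elementwise:

import numpy as np


def eps_eq(eps_a, eps_b):
    """Return True if two single-episode batches contain the same steps."""
    if eps_a.lengths[0] != eps_b.lengths[0]:
        return False
    return (np.allclose(eps_a.observations, eps_b.observations)
            and np.allclose(eps_a.actions, eps_b.actions)
            and np.allclose(eps_a.rewards, eps_b.rewards))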
def expert_source(env, goal, max_episode_length, n_eps):
    expert = OptimalPolicy(env.spec, goal=goal)
    workers = WorkerFactory(seed=100, max_episode_length=max_episode_length)
    expert_sampler = LocalSampler.from_worker_factory(workers, expert, env)
    for _ in range(n_eps):
        eps_batch = expert_sampler.obtain_samples(0, max_episode_length, None)
        yield TimeStepBatch.from_episode_batch(eps_batch)
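# `expert_source` is a generator, so a consumer pulls one TimeStepBatch of
# expert transitions per iteration. A usage sketch; the goal, horizon, and
# episode count below are arbitrary example values, not taken from this file.


def consume_expert_source(env):
    batches = expert_source(env,
                            goal=(1.0, 1.0),
                            max_episode_length=200,
                            n_eps=10)
    # Each item is a TimeStepBatch of expert data, e.g. for imitation learning.
    return list(batches)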
def test_obtain_exact_episodes():
    max_episode_length = 15
    n_workers = 8
    env = PointEnv()
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_episode_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_eps_per_worker = 3
    episodes = sampler.obtain_exact_episodes(n_eps_per_worker,
                                             agent_update=policies)
    # At least one action per episode.
    assert sum(episodes.lengths) >= n_workers * n_eps_per_worker
    # All of the episodes.
    assert len(episodes.lengths) == n_workers * n_eps_per_worker
    worker = -1
    for count, eps in enumerate(episodes.split()):
        if count % n_eps_per_worker == 0:
            worker += 1
        assert (eps.actions == per_worker_actions[worker]).all()
def test_update_envs_env_update():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0,
                                      161,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert len(mean_rewards) == 11
    assert len(goals) == 11
    assert np.var(mean_rewards) > 1e-2
    assert np.var(goals) > 1e-2
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = GarageEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                                 agent_update=policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # All of the trajectories.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()
def test_local_batch_sampler(self):
    workers = WorkerFactory(seed=100,
                            max_path_length=self.algo.max_path_length)
    sampler1 = LocalSampler.from_worker_factory(workers, self.policy,
                                                self.env)
    sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
    sampler2.start_worker()
    trajs1 = sampler1.obtain_samples(
        0, 1000, tuple(self.algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000)
    # pylint: disable=superfluous-parens
    assert trajs1.observations.shape[0] >= 1000
    assert trajs1.actions.shape[0] >= 1000
    assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
        trajs2[0]['rewards']) == 1)

    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    start = 0
    for length in trajs1.lengths:
        observations = trajs1.observations[start:start + length]
        actions = trajs1.actions[start:start + length]
        rewards = trajs1.rewards[start:start + length]
        assert np.array_equal(observations, true_obs)
        assert np.array_equal(actions, true_actions)
        assert np.array_equal(rewards, true_rewards)
        start += length
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()
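# The fixture attributes used above (`self.env`, `self.policy`, `self.algo`)
# are created in the test class's setup. A plausible reconstruction, assuming
# it mirrors the standalone GridWorld sampler tests later in this file:


def setup_method(self):
    self.env = GarageEnv(GridWorldEnv(desc='4x4'))
    self.policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    self.algo = Mock(env_spec=self.env.spec,
                     policy=self.policy,
                     max_path_length=16)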
def evaluate(self, algo, test_episodes_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (MetaRLAlgorithm): The algorithm to evaluate.
        test_episodes_per_task (int or None): Number of episodes per task.

    """
    if test_episodes_per_task is None:
        test_episodes_per_task = self._n_test_episodes
    adapted_episodes = []
    logger.log('Sampling for adaptation and meta-testing...')
    env_updates = self._test_task_sampler.sample(self._n_test_tasks)
    if self._test_sampler is None:
        env = env_updates[0]()
        self._max_episode_length = env.spec.max_episode_length
        self._test_sampler = LocalSampler.from_worker_factory(
            WorkerFactory(seed=get_seed(),
                          max_episode_length=self._max_episode_length,
                          n_workers=1,
                          worker_class=self._worker_class,
                          worker_args=self._worker_args),
            agents=algo.get_exploration_policy(),
            envs=env)
    for env_up in env_updates:
        policy = algo.get_exploration_policy()
        eps = EpisodeBatch.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_eps)
        ])
        adapted_policy = algo.adapt_policy(policy, eps)
        adapted_eps = self._test_sampler.obtain_samples(
            self._eval_itr,
            test_episodes_per_task * self._max_episode_length,
            adapted_policy)
        adapted_episodes.append(adapted_eps)
    logger.log('Finished meta-testing...')

    if self._test_task_names is not None:
        name_map = dict(enumerate(self._test_task_names))
    else:
        name_map = None

    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            EpisodeBatch.concatenate(*adapted_episodes),
            getattr(algo, 'discount', 1.0),
            name_map=name_map)
    self._eval_itr += 1
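# A usage sketch for `evaluate`: training code typically builds the evaluator
# once and calls it at the end of each epoch. The constructor keyword names
# below are assumptions about the evaluator's API, and `train_once` is a
# hypothetical per-epoch training step, not a method defined in this file.


def run_meta_training(algo, test_task_sampler, n_epochs):
    meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler,
                                   n_test_tasks=5,
                                   n_exploration_eps=2)
    for _ in range(n_epochs):
        algo.train_once()  # hypothetical training step
        meta_evaluator.evaluate(algo)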
def test_init_with_env_updates(policy, envs):
    task_sampler = EnvPoolSampler(envs)
    envs = task_sampler.sample(N_TRAJ)
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, [policy],
                                                   [envs])
    n_samples = 100
    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)
    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)
    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
def test_no_seed():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    n_workers = 8
    workers = WorkerFactory(seed=None,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0, 160, policy)
    assert sum(episodes.lengths) >= 160
def test_reset_optimization(policy, envs, other_envs):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, [policy],
                                                   [envs])
    n_samples = 4 * MAX_PATH_LENGTH
    true_sampler.obtain_samples(0, n_samples, None)
    true_sampler.obtain_samples(0, n_samples, None)

    true_trajs = true_sampler.obtain_samples(0, n_samples, None, other_envs)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None, [other_envs])

    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
def test_init_with_env_updates():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers,
                                               policy,
                                               envs=tasks.sample(n_workers))
    episodes = sampler.obtain_samples(0, 160, policy)
    assert sum(episodes.lengths) >= 160
def test_adapt_policy(self):
    """Test if policy can adapt to samples."""
    worker = WorkerFactory(seed=100, max_episode_length=100)
    sampler = LocalSampler.from_worker_factory(worker, self.policy, self.env)

    self.policy.apply(partial(self._set_params, 0.1))
    adapt_policy = self.algo.get_exploration_policy()
    eps = sampler.obtain_samples(0, 100, adapt_policy)
    self.algo.adapt_policy(adapt_policy, eps)

    # Old policy should remain untouched
    self.policy.apply(partial(self._test_params, 0.1))

    # Adapted policy should not be identical to old policy
    for v1, v2 in zip(adapt_policy.parameters(), self.policy.parameters()):
        if v1.data.ne(v2.data).sum() > 0:
            break
    else:
        pytest.fail('Parameters of adapted policy should not be '
                    'identical to the old policy.')
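# `_set_params` and `_test_params` above are helpers applied through
# `torch.nn.Module.apply`, called as `self._set_params(0.1, module)` via
# `partial`. A plausible sketch, assuming a PyTorch policy whose linear layers
# are filled with a constant and later checked for that value:

import torch


def _set_params(self, v, m):
    """Fill the weights and biases of each linear layer with the value v."""
    if isinstance(m, torch.nn.Linear):
        m.weight.data.fill_(v)
        m.bias.data.fill_(v)


def _test_params(self, v, m):
    """Assert that each linear layer still holds the value v everywhere."""
    if isinstance(m, torch.nn.Linear):
        assert torch.all(torch.eq(m.weight.data, v))
        assert torch.all(torch.eq(m.bias.data, v))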
def test_obtain_samples(ray_local_session_fixture):
    del ray_local_session_fixture
    env = GarageEnv(GridWorldEnv(desc='4x4'))
    policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    algo = Mock(env_spec=env.spec, policy=policy, max_path_length=16)
    assert ray.is_initialized()
    workers = WorkerFactory(seed=100,
                            max_path_length=algo.max_path_length,
                            n_workers=8)
    sampler1 = RaySampler.from_worker_factory(workers, policy, env)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env)
    trajs1 = sampler1.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))
    assert trajs1.observations.shape[0] >= 1000
    assert trajs1.actions.shape[0] >= 1000
    assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
        trajs2.rewards[:trajs2.lengths[0]]) == 1)

    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    start = 0
    for length in trajs1.lengths:
        observations = trajs1.observations[start:start + length]
        actions = trajs1.actions[start:start + length]
        rewards = trajs1.rewards[start:start + length]
        assert np.array_equal(observations, true_obs)
        assert np.array_equal(actions, true_actions)
        assert np.array_equal(rewards, true_rewards)
        start += length
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()
    env.close()
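# `ray_local_session_fixture` is provided by the test suite's conftest. A
# minimal sketch of such a fixture, assuming it only needs to guarantee a
# local Ray session for the duration of the test:

import pytest
import ray


@pytest.fixture
def ray_local_session_fixture():
    """Start a local Ray session for the test and shut it down afterwards."""
    ray.init(local_mode=True, ignore_reinit_error=True)
    yield
    if ray.is_initialized():
        ray.shutdown()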
def test_update_envs_env_update(timesteps_per_call):
    max_episode_length = 16
    env = PointEnv()
    n_workers = 8
    policies = [
        FixedPolicy(env.spec,
                    scripted_actions=[
                        env.action_space.sample()
                        for _ in range(max_episode_length)
                    ]) for _ in range(n_workers)
    ]
    tasks = SetTaskSampler(PointEnv)
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers,
                            worker_class=FragmentWorker,
                            worker_args=dict(
                                n_envs=1,
                                timesteps_per_call=timesteps_per_call))
    sampler = LocalSampler.from_worker_factory(workers, policies, env)
    episodes = sampler.obtain_samples(0,
                                      160,
                                      None,
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert len(mean_rewards) == int(160 / timesteps_per_call)
    assert len(goals) == int(160 / timesteps_per_call)
    assert np.var(mean_rewards) > 1e-2
    assert np.var(goals) > 1e-2
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               None,
                               env_update=tasks.sample(n_workers + 1))
def test_obtain_samples():
    env = GarageEnv(GridWorldEnv(desc='4x4'))
    policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    algo = Mock(env_spec=env.spec, policy=policy, max_episode_length=16)
    workers = WorkerFactory(seed=100,
                            max_episode_length=algo.max_episode_length,
                            n_workers=8)
    sampler1 = MultiprocessingSampler.from_worker_factory(workers, policy, env)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env)
    trajs1 = sampler1.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))
    # pylint: disable=superfluous-parens
    assert trajs1.observations.shape[0] >= 1000
    assert trajs1.actions.shape[0] >= 1000
    assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
        trajs2.rewards[:trajs2.lengths[0]]) == 1)

    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    start = 0
    for length in trajs1.lengths:
        observations = trajs1.observations[start:start + length]
        actions = trajs1.actions[start:start + length]
        rewards = trajs1.rewards[start:start + length]
        assert np.array_equal(observations, true_obs)
        assert np.array_equal(actions, true_actions)
        assert np.array_equal(rewards, true_rewards)
        start += length
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()
    env.close()