Example #1
def test_in_local_sampler(policy, envs):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, policy, [envs])
    n_samples = 100

    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)
    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    # Test the start_rollout optimization by sampling a second time.
    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)
    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
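
The module-level names used above (N_TRAJ, MAX_PATH_LENGTH, assert_trajs_eq, and the policy/envs pytest fixtures) are defined outside this snippet. A minimal sketch of what they might look like; the definitions in the original test file may differ:

# Hypothetical module-level setup assumed by the test above.
import numpy as np

N_TRAJ = 5            # trajectories gathered per obtain_samples call
MAX_PATH_LENGTH = 16  # episode length cap passed to WorkerFactory


def assert_trajs_eq(true_trajs, test_trajs):
    """Check two trajectory batches agree up to episode ordering (sketch)."""
    assert sorted(true_trajs.lengths) == sorted(test_trajs.lengths)
    assert np.isclose(true_trajs.rewards.sum(), test_trajs.rewards.sum())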
Example #2
def test_onehots_consistent_with_task_sampler():
    # Import and construct environments here to avoid using up too many
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld
    mt10 = metaworld.MT10()
    env = MetaWorldSetTaskEnv(mt10, 'train', add_env_onehot=True)
    policy = RandomPolicy(env.action_space)
    workers = WorkerFactory(seed=100, max_episode_length=1, n_workers=10)
    sampler1 = LocalSampler.from_worker_factory(workers, policy, env)
    env_ups = [
        SetTaskUpdate(MetaWorldSetTaskEnv, task, None)
        for task in env.sample_tasks(10)
    ]
    samples1 = sampler1.obtain_exact_episodes(1, policy, env_ups)
    task_sampler = MetaWorldTaskSampler(mt10, 'train', add_env_onehot=True)
    env_ups = task_sampler.sample(10)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env_ups)
    samples2 = sampler2.obtain_exact_episodes(1, policy, env_ups)
    name_to_obs1 = {}
    for obs1, name1 in zip(samples1.observations,
                           samples1.env_infos['task_name']):
        name_to_obs1[name1] = obs1
    for obs2, name2 in zip(samples2.observations,
                           samples2.env_infos['task_name']):
        assert (name_to_obs1[name2][-10:] == obs2[-10:]).all()
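
RandomPolicy here is a test helper rather than part of the garage API. A plausible sketch, assuming the sampler only needs get_action, get_actions, and reset:

# Hypothetical helper assumed by the test above: a policy that samples
# uniformly from the action space and ignores observations.
class RandomPolicy:

    def __init__(self, action_space):
        self._action_space = action_space

    def get_action(self, observation):
        return self._action_space.sample(), {}

    def get_actions(self, observations):
        return [self._action_space.sample() for _ in observations], {}

    def reset(self, do_resets=None):
        pass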
Example #3
def test_in_local_sampler(policy, envs, other_envs, timesteps_per_call):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_EPS,
                                 max_episode_length=MAX_EPISODE_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    worker_args = dict(n_envs=N_EPS, timesteps_per_call=timesteps_per_call)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=FragmentWorker,
                                worker_args=worker_args,
                                max_episode_length=MAX_EPISODE_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, policy, [envs])
    n_samples = 400

    true_eps = true_sampler.obtain_samples(0, n_samples, None)
    sliced_true_eps = slice_episodes(true_eps, timesteps_per_call)

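    # The FragmentWorker returns episodes in fixed-size fragments of
    # timesteps_per_call steps, so fewer transitions are requested per call
    # here than from the unvectorized sampler above.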
    vec_eps = vec_sampler.obtain_samples(0, 50, None)
    for test_eps in vec_eps.split():
        assert any(eps_eq(true_eps, test_eps) for true_eps in sliced_true_eps)

    true_eps = true_sampler.obtain_samples(0,
                                           n_samples,
                                           None,
                                           env_update=other_envs)
    sliced_true_eps = slice_episodes(true_eps, timesteps_per_call)

    vec_eps = vec_sampler.obtain_samples(0, 50, None, env_update=[other_envs])
    for test_eps in vec_eps.split():
        assert any(eps_eq(true_eps, test_eps) for true_eps in sliced_true_eps)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
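
slice_episodes and eps_eq are module-level test helpers not shown on this page: slice_episodes cuts each collected episode into consecutive fragments of at most timesteps_per_call steps, and eps_eq compares two single episodes. A rough sketch of eps_eq under those assumptions:

# Hypothetical helper assumed by the test above.
import numpy as np

def eps_eq(eps_a, eps_b):
    """True when two single episodes match on observations, actions, rewards."""
    return (np.array_equal(eps_a.observations, eps_b.observations)
            and np.array_equal(eps_a.actions, eps_b.actions)
            and np.array_equal(eps_a.rewards, eps_b.rewards))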
Example #4
def expert_source(env, goal, max_episode_length, n_eps):
    expert = OptimalPolicy(env.spec, goal=goal)
    workers = WorkerFactory(seed=100, max_episode_length=max_episode_length)
    expert_sampler = LocalSampler.from_worker_factory(workers, expert, env)
    for _ in range(n_eps):
        eps_batch = expert_sampler.obtain_samples(0, max_episode_length, None)
        yield TimeStepBatch.from_episode_batch(eps_batch)
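
expert_source is a generator: each iteration samples one batch from the expert and yields it as a TimeStepBatch. A hedged usage sketch (the goal and lengths below are made-up values):

# Hypothetical consumption of the generator above, e.g. for imitation learning.
batches = expert_source(env, goal=(1., 1.), max_episode_length=200, n_eps=10)
for expert_batch in batches:
    pass  # feed expert_batch to a behavioral-cloning update, etc.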
Example #5
def test_obtain_exact_episodes():
    max_episode_length = 15
    n_workers = 8
    env = PointEnv()
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_episode_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_eps_per_worker = 3
    episodes = sampler.obtain_exact_episodes(n_eps_per_worker,
                                             agent_update=policies)
    # At least one action per episode.
    assert sum(episodes.lengths) >= n_workers * n_eps_per_worker
    # Exactly n_eps_per_worker episodes from each worker.
    assert len(episodes.lengths) == n_workers * n_eps_per_worker
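    # episodes.split() yields single episodes ordered by worker, so advance
    # the worker index after every n_eps_per_worker episodes.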
    worker = -1
    for count, eps in enumerate(episodes.split()):
        if count % n_eps_per_worker == 0:
            worker += 1
        assert (eps.actions == per_worker_actions[worker]).all()
Example #6
def test_update_envs_env_update():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0,
                                      161,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert len(mean_rewards) == 11
    assert len(goals) == 11
    assert np.var(mean_rewards) > 1e-2
    assert np.var(goals) > 1e-2
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
Example #7
def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = GarageEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                                 agent_update=policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # Exactly n_traj_per_worker trajectories from each worker.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()
Example #8
    def test_local_batch_sampler(self):
        workers = WorkerFactory(seed=100,
                                max_path_length=self.algo.max_path_length)
        sampler1 = LocalSampler.from_worker_factory(workers, self.policy,
                                                    self.env)
        sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
        sampler2.start_worker()
        trajs1 = sampler1.obtain_samples(
            0, 1000, tuple(self.algo.policy.get_param_values()))
        trajs2 = sampler2.obtain_samples(0, 1000)
        # pylint: disable=superfluous-parens
        assert trajs1.observations.shape[0] >= 1000
        assert trajs1.actions.shape[0] >= 1000
        assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
            trajs2[0]['rewards']) == 1)

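        # Expected optimal path through the 4x4 GridWorld: the agent visits
        # states 0, 1, 2, 6, 10, 14 and earns reward 1 on the final step.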
        true_obs = np.array([0, 1, 2, 6, 10, 14])
        true_actions = np.array([2, 2, 1, 1, 1, 2])
        true_rewards = np.array([0, 0, 0, 0, 0, 1])
        start = 0
        for length in trajs1.lengths:
            observations = trajs1.observations[start:start + length]
            actions = trajs1.actions[start:start + length]
            rewards = trajs1.rewards[start:start + length]
            assert np.array_equal(observations, true_obs)
            assert np.array_equal(actions, true_actions)
            assert np.array_equal(rewards, true_rewards)
            start += length
        sampler1.shutdown_worker()
        sampler2.shutdown_worker()
Example #9
    def evaluate(self, algo, test_episodes_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (MetaRLAlgorithm): The algorithm to evaluate.
            test_episodes_per_task (int or None): Number of episodes to sample
                per task. If None, the evaluator's configured default is used.

        """
        if test_episodes_per_task is None:
            test_episodes_per_task = self._n_test_episodes
        adapted_episodes = []
        logger.log('Sampling for adaptation and meta-testing...')
        env_updates = self._test_task_sampler.sample(self._n_test_tasks)
        if self._test_sampler is None:
            env = env_updates[0]()
            self._max_episode_length = env.spec.max_episode_length
            self._test_sampler = LocalSampler.from_worker_factory(
                WorkerFactory(seed=get_seed(),
                              max_episode_length=self._max_episode_length,
                              n_workers=1,
                              worker_class=self._worker_class,
                              worker_args=self._worker_args),
                agents=algo.get_exploration_policy(),
                envs=env)
        for env_up in env_updates:
            policy = algo.get_exploration_policy()
            eps = EpisodeBatch.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_eps)
            ])
            adapted_policy = algo.adapt_policy(policy, eps)
            adapted_eps = self._test_sampler.obtain_samples(
                self._eval_itr,
                test_episodes_per_task * self._max_episode_length,
                adapted_policy)
            adapted_episodes.append(adapted_eps)
        logger.log('Finished meta-testing...')

        if self._test_task_names is not None:
            name_map = dict(enumerate(self._test_task_names))
        else:
            name_map = None

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                EpisodeBatch.concatenate(*adapted_episodes),
                getattr(algo, 'discount', 1.0),
                name_map=name_map)
        self._eval_itr += 1
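
This evaluate() has the shape of garage's MetaEvaluator.evaluate. A hedged construction-and-use sketch; the constructor arguments here are assumptions to check against your garage version:

# Hypothetical setup around the evaluate() method above.
from garage.envs import PointEnv
from garage.experiment import MetaEvaluator
from garage.experiment.task_sampler import SetTaskSampler

meta_evaluator = MetaEvaluator(test_task_sampler=SetTaskSampler(PointEnv),
                               n_test_tasks=5,
                               n_exploration_eps=2)
# Inside the training loop, once per epoch:
#     meta_evaluator.evaluate(algo)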
Example #10
def test_init_with_env_updates(policy, envs):
    task_sampler = EnvPoolSampler(envs)
    envs = task_sampler.sample(N_TRAJ)
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, [policy],
                                                   [envs])
    n_samples = 100
    true_trajs = true_sampler.obtain_samples(0, n_samples, None)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None)

    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
Example #11
def test_no_seed():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    n_workers = 8
    workers = WorkerFactory(seed=None,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0, 160, policy)
    assert sum(episodes.lengths) >= 160
Example #12
def test_reset_optimization(policy, envs, other_envs):
    true_workers = WorkerFactory(seed=100,
                                 n_workers=N_TRAJ,
                                 max_path_length=MAX_PATH_LENGTH)
    true_sampler = LocalSampler.from_worker_factory(true_workers, policy, envs)
    vec_workers = WorkerFactory(seed=100,
                                n_workers=1,
                                worker_class=VecWorker,
                                worker_args=dict(n_envs=N_TRAJ),
                                max_path_length=MAX_PATH_LENGTH)
    vec_sampler = LocalSampler.from_worker_factory(vec_workers, [policy],
                                                   [envs])
    n_samples = 4 * MAX_PATH_LENGTH
    true_sampler.obtain_samples(0, n_samples, None)
    true_sampler.obtain_samples(0, n_samples, None)

    true_trajs = true_sampler.obtain_samples(0, n_samples, None, other_envs)
    vec_trajs = vec_sampler.obtain_samples(0, n_samples, None, [other_envs])

    assert vec_trajs.lengths.sum() >= n_samples
    assert_trajs_eq(true_trajs, vec_trajs)

    true_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
Example #13
def test_init_with_env_updates():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers,
                                               policy,
                                               envs=tasks.sample(n_workers))
    episodes = sampler.obtain_samples(0, 160, policy)
    assert sum(episodes.lengths) >= 160
Example #14
    def test_adapt_policy(self):
        """Test if policy can adapt to samples."""
        worker = WorkerFactory(seed=100, max_episode_length=100)
        sampler = LocalSampler.from_worker_factory(worker, self.policy,
                                                   self.env)

        self.policy.apply(partial(self._set_params, 0.1))
        adapt_policy = self.algo.get_exploration_policy()
        eps = sampler.obtain_samples(0, 100, adapt_policy)
        self.algo.adapt_policy(adapt_policy, eps)

        # Old policy should remain untouched
        self.policy.apply(partial(self._test_params, 0.1))

        # Adapted policy should not be identical to old policy
        for v1, v2 in zip(adapt_policy.parameters(), self.policy.parameters()):
            if v1.data.ne(v2.data).sum() > 0:
                break
        else:
            pytest.fail('Parameters of adapted policy should not be '
                        'identical to the old policy.')
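
The _set_params and _test_params helpers belong to the test class; a plausible sketch, assuming the policy is built from torch.nn.Linear modules:

# Hypothetical helpers assumed by the test above (methods of the test class).
import torch

def _set_params(self, v, m):
    """Fill each Linear layer's weight and bias with the constant v."""
    if isinstance(m, torch.nn.Linear):
        m.weight.data.fill_(v)
        m.bias.data.fill_(v)

def _test_params(self, v, m):
    """Assert each Linear layer's weight and bias still equal v."""
    if isinstance(m, torch.nn.Linear):
        assert torch.all(torch.eq(m.weight.data, v))
        assert torch.all(torch.eq(m.bias.data, v))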
Example #15
def test_obtain_samples(ray_local_session_fixture):
    del ray_local_session_fixture
    env = GarageEnv(GridWorldEnv(desc='4x4'))
    policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    algo = Mock(env_spec=env.spec, policy=policy, max_path_length=16)

    assert ray.is_initialized()
    workers = WorkerFactory(seed=100,
                            max_path_length=algo.max_path_length,
                            n_workers=8)
    sampler1 = RaySampler.from_worker_factory(workers, policy, env)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env)
    trajs1 = sampler1.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))

    assert trajs1.observations.shape[0] >= 1000
    assert trajs1.actions.shape[0] >= 1000
    assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
        trajs2.rewards[:trajs2.lengths[0]]) == 1)

    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    start = 0
    for length in trajs1.lengths:
        observations = trajs1.observations[start:start + length]
        actions = trajs1.actions[start:start + length]
        rewards = trajs1.rewards[start:start + length]
        assert np.array_equal(observations, true_obs)
        assert np.array_equal(actions, true_actions)
        assert np.array_equal(rewards, true_rewards)
        start += length
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()
    env.close()
Example #16
def test_update_envs_env_update(timesteps_per_call):
    max_episode_length = 16
    env = PointEnv()
    n_workers = 8
    policies = [
        FixedPolicy(env.spec,
                    scripted_actions=[
                        env.action_space.sample()
                        for _ in range(max_episode_length)
                    ]) for _ in range(n_workers)
    ]
    tasks = SetTaskSampler(PointEnv)
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers,
                            worker_class=FragmentWorker,
                            worker_args=dict(
                                n_envs=1,
                                timesteps_per_call=timesteps_per_call))
    sampler = LocalSampler.from_worker_factory(workers, policies, env)
    episodes = sampler.obtain_samples(0,
                                      160,
                                      None,
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert len(mean_rewards) == int(160 / timesteps_per_call)
    assert len(goals) == int(160 / timesteps_per_call)
    assert np.var(mean_rewards) > 1e-2
    assert np.var(goals) > 1e-2
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               None,
                               env_update=tasks.sample(n_workers + 1))
Example #17
def test_obtain_samples():
    env = GarageEnv(GridWorldEnv(desc='4x4'))
    policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    algo = Mock(env_spec=env.spec, policy=policy, max_episode_length=16)

    workers = WorkerFactory(seed=100,
                            max_episode_length=algo.max_episode_length,
                            n_workers=8)
    sampler1 = MultiprocessingSampler.from_worker_factory(workers, policy, env)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env)
    trajs1 = sampler1.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))
    # pylint: disable=superfluous-parens
    assert trajs1.observations.shape[0] >= 1000
    assert trajs1.actions.shape[0] >= 1000
    assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
        trajs2.rewards[:trajs2.lengths[0]]) == 1)

    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    start = 0
    for length in trajs1.lengths:
        observations = trajs1.observations[start:start + length]
        actions = trajs1.actions[start:start + length]
        rewards = trajs1.rewards[start:start + length]
        assert np.array_equal(observations, true_obs)
        assert np.array_equal(actions, true_actions)
        assert np.array_equal(rewards, true_rewards)
        start += length
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()
    env.close()