Example #1
def test_act_box_env_spec_mismatch_eps(eps_data):
    with pytest.raises(ValueError, match='actions should have'):
        eps_data['env_spec'].action_space = akro.Box(low=1,
                                                     high=np.inf,
                                                     shape=(4, 3, 2),
                                                     dtype=np.float32)
        t = EpisodeBatch(**eps_data)
        del t
Example #2
def test_agent_infos_batch_mismatch_eps(eps_data):
    with pytest.raises(
            ValueError,
            match='entry in agent_infos must have a batch dimension'):
        eps_data['agent_infos']['hidden'] = eps_data['agent_infos'][
            'hidden'][:-1]
        t = EpisodeBatch(**eps_data)
        del t
Example #3
def test_agent_infos_batch_mismatch_eps(eps_data):
    with pytest.raises(
            ValueError,
            match="Entry 'hidden' in agent_infos has batch size 141"):
        eps_data['agent_infos']['hidden'] = eps_data['agent_infos'][
            'hidden'][:-1]
        t = EpisodeBatch(**eps_data)
        del t
Example #4
def test_to_epsbatch_list(eps_data):
    t = EpisodeBatch(**eps_data)
    t_list = t.to_list()
    assert len(t_list) == len(eps_data['lengths'])
    start = 0
    for length, last_obs, s in zip(eps_data['lengths'],
                                   eps_data['last_observations'], t_list):
        stop = start + length
        assert (
            s['observations'] == eps_data['observations'][start:stop]).all()
        assert (s['next_observations'] == np.concatenate(
            (eps_data['observations'][start + 1:stop], [last_obs]))).all()
        assert (s['actions'] == eps_data['actions'][start:stop]).all()
        assert (s['rewards'] == eps_data['rewards'][start:stop]).all()
        assert (s['step_types'] == eps_data['step_types'][start:stop]).all()
        start = stop
    assert start == len(eps_data['rewards'])
Example #5
def test_last_obs_env_spec_mismatch_eps(eps_data):
    with pytest.raises(ValueError,
                       match=('last_observations must have the '
                              'same number of entries')):
        eps_data['last_observations'] = \
                eps_data['last_observations'][:, :, :, :1]
        t = EpisodeBatch(**eps_data)
        del t
Example #6
def test_time_step_batch_from_episode_batch(eps_data):
    eps = EpisodeBatch(**eps_data)
    timestep_batch = TimeStepBatch.from_episode_batch(eps)
    assert (timestep_batch.observations == eps.observations).all()
    assert (timestep_batch.next_observations[:eps.lengths[0] - 1] ==
            eps.observations[1:eps.lengths[0]]).all()
    assert (timestep_batch.next_observations[eps.lengths[0]] ==
            eps.last_observations[0]).all()
Example #7
    def _process_samples(self, itr, episodes):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            episodes (EpisodeBatch): Original collected episode batch for each
                task. For each episode, episode.agent_infos['batch_idx']
                indicates which task this episode belongs to. In RL^2, there
                are n environments/tasks and paths in each of them will be
                concatenated at some point and fed to the policy.

        Returns:
            EpisodeBatch: Processed batch of episodes for feeding the inner
                algorithm.
            numpy.float64: The average return.

        Raises:
            ValueError: If 'batch_idx' is not found.

        """
        concatenated_paths = []

        paths_by_task = collections.defaultdict(list)
        for episode in episodes.split():
            if hasattr(episode, 'batch_idx'):
                paths_by_task[episode.batch_idx[0]].append(episode)
            elif 'batch_idx' in episode.agent_infos:
                paths_by_task[episode.agent_infos['batch_idx'][0]].append(
                    episode)
            else:
                raise ValueError(
                    'Batch idx is required for RL2 but not found, '
                    'Make sure to use garage.tf.algos.rl2.RL2Worker '
                    'for sampling')

        # all paths in paths_by_task[i] are sampled from task[i]
        for episode_list in paths_by_task.values():
            concatenated_path = self._concatenate_episodes(episode_list)
            concatenated_paths.append(concatenated_path)

        concatenated_episodes = EpisodeBatch.concatenate(*concatenated_paths)

        name_map = None
        if hasattr(self._task_sampler, '_envs') and hasattr(
                self._task_sampler._envs[0]._env, 'all_task_names'):
            names = [
                env._env.all_task_names[0] for env in self._task_sampler._envs
            ]
            name_map = dict(enumerate(names))

        undiscounted_returns = log_multitask_performance(
            itr, episodes, self._inner_algo._discount, name_map=name_map)

        average_return = np.mean(undiscounted_returns)

        return concatenated_episodes, average_return
Example #8
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            float: The average return in last epoch cycle.

        """
        # -- Stage: Calculate baseline
        if hasattr(self._baseline, 'predict_n'):
            baseline_predictions = self._baseline.predict_n(paths)
        else:
            baseline_predictions = [
                self._baseline.predict(path) for path in paths
            ]

        # -- Stage: Pre-process samples based on collected paths
        samples_data = paths_to_tensors(paths, self.max_episode_length,
                                        baseline_predictions, self._discount)

        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(itr,
                                               EpisodeBatch.from_list(
                                                   self._env_spec, paths),
                                               discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        samples_data['average_return'] = np.mean(undiscounted_returns)

        epoch = itr // self._n_samples
        i_sample = itr - epoch * self._n_samples

        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)

        rtn = samples_data['average_return']
        self._all_returns.append(samples_data['average_return'])

        if (itr + 1) % self._n_samples == 0:
            avg_rtns = np.array(self._all_returns)
            self._es.tell(self._all_params, -avg_rtns)
            self.policy.set_param_values(self._es.best.get()[0])

            # Clear for next epoch
            rtn = max(self._all_returns)
            self._all_returns.clear()
            self._all_params = self._sample_params()

        self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
        self.policy.set_param_values(self._cur_params)

        logger.log(tabular)
        return rtn
Example #9
def log_multitask_performance(itr, batch, discount, name_map=None):
    r"""Log performance of episodes from multiple tasks.

    Args:
        itr (int): Iteration number to be logged.
        batch (EpisodeBatch): Batch of episodes. The episodes should have
            either the "task_name" or "task_id" `env_infos`. If the "task_name"
            is not present, then `name_map` is required, and should map from
            task id's to task names.
        discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task id's to task
            names. Optional if the "task_name" environment info is present.
            Note that if provided, all tasks listed in this map will be logged,
            even if there are no episodes present for them.

    Returns:
        numpy.ndarray: Undiscounted returns averaged across all tasks. Has
            shape :math:`(N \bullet [T])`.

    """
    eps_by_name = defaultdict(list)
    for eps in batch.split():
        task_name = '__unnamed_task__'
        if 'task_name' in eps.env_infos:
            task_name = eps.env_infos['task_name'][0]
        elif 'task_id' in eps.env_infos:
            name_map = {} if name_map is None else name_map
            task_id = eps.env_infos['task_id'][0]
            task_name = name_map.get(task_id, 'Task #{}'.format(task_id))
        eps_by_name[task_name].append(eps)
    if name_map is None:
        task_names = eps_by_name.keys()
    else:
        task_names = name_map.values()
    for task_name in task_names:
        if task_name in eps_by_name:
            episodes = eps_by_name[task_name]
            log_performance(itr,
                            EpisodeBatch.concatenate(*episodes),
                            discount,
                            prefix=task_name)
        else:
            with tabular.prefix(task_name + '/'):
                tabular.record('Iteration', itr)
                tabular.record('NumEpisodes', 0)
                tabular.record('AverageDiscountedReturn', np.nan)
                tabular.record('AverageReturn', np.nan)
                tabular.record('StdReturn', np.nan)
                tabular.record('MaxReturn', np.nan)
                tabular.record('MinReturn', np.nan)
                tabular.record('TerminationRate', np.nan)
                tabular.record('SuccessRate', np.nan)

    return log_performance(itr, batch, discount=discount, prefix='Average')
Example #10
def test_new_eps(eps_data):
    t = EpisodeBatch(**eps_data)
    assert t.env_spec is eps_data['env_spec']
    assert t.observations is eps_data['observations']
    assert t.last_observations is eps_data['last_observations']
    assert t.actions is eps_data['actions']
    assert t.rewards is eps_data['rewards']
    assert t.env_infos is eps_data['env_infos']
    assert t.agent_infos is eps_data['agent_infos']
    assert t.step_types is eps_data['step_types']
    assert t.lengths is eps_data['lengths']
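The `eps_data` fixture these tests receive is not part of this listing. Below is a minimal sketch of what such a fixture might look like, assuming the top-level garage exports (EnvSpec, StepType) and following the construction pattern of Examples #14 and #19; the field names, shapes, and values are illustrative only, and newer garage versions additionally expect an episode_infos entry (see Example #18).

import akro
import numpy as np
import pytest

from garage import EnvSpec, StepType


@pytest.fixture
def eps_data():
    # Two episodes of lengths 4 and 2. Shapes must stay consistent with
    # env_spec, sum(lengths) and len(lengths), otherwise EpisodeBatch
    # raises ValueError (see Examples #1-#5).
    lengths = np.array([4, 2], dtype='i')
    n_steps = int(sum(lengths))
    env_spec = EnvSpec(
        akro.Box(low=-1.0, high=1.0, shape=(3, ), dtype=np.float32),
        akro.Box(low=-1.0, high=1.0, shape=(2, ), dtype=np.float32))
    return dict(
        env_spec=env_spec,
        observations=np.ones((n_steps, 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((n_steps, 2), dtype=np.float32),
        rewards=np.zeros(n_steps),
        env_infos={'success': np.zeros(n_steps, dtype=bool)},
        agent_infos={'hidden': np.zeros((n_steps, 5), dtype=np.float32)},
        step_types=np.array([StepType.MID] * n_steps, dtype=StepType),
        lengths=lengths)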
Example #11
    def _train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Average return.

        """
        # -- Stage: Calculate baseline
        paths = [
            dict(
                observations=path['observations'],
                actions=(
                    self._env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos'],
                dones=np.array([
                    step_type == StepType.TERMINAL
                    for step_type in path['step_types']
                ])) for path in paths
        ]

        if hasattr(self._baseline, 'predict_n'):
            baseline_predictions = self._baseline.predict_n(paths)
        else:
            baseline_predictions = [
                self._baseline.predict(path) for path in paths
            ]

        # -- Stage: Pre-process samples based on collected paths
        samples_data = paths_to_tensors(paths, self.max_episode_length,
                                        baseline_predictions, self._discount,
                                        self._gae_lambda)

        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(
            itr,
            EpisodeBatch.from_list(self._env_spec, paths),
            discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))

        samples_data['average_return'] = np.mean(undiscounted_returns)

        logger.log('Optimizing policy...')
        self._optimize_policy(samples_data)

        return samples_data['average_return']
Example #12
def test_episodes_to_acts_obs_list(eps_data):
    t = EpisodeBatch(**eps_data)
    acts_list = t.actions_list
    obs_list = t.observations_list
    start = 0
    assert len(acts_list) == len(t.lengths)
    assert len(obs_list) == len(t.lengths)
    for i, length in enumerate(t.lengths):
        stop = start + length
        assert (acts_list[i] == t.actions[start:stop]).all()
        assert (obs_list[i] == t.observations[start:stop]).all()
        start = stop
Example #13
    def collect_episode(self):
        """Collect the current episode, clearing the internal buffer.

        Returns:
            EpisodeBatch: A batch of the episodes completed since the last call
                to collect_episode().

        """
        observations = self._observations
        self._observations = []
        last_observations = self._last_observations
        self._last_observations = []

        actions = []
        rewards = []
        env_infos = defaultdict(list)
        step_types = []

        for es in self._env_steps:
            actions.append(es.action)
            rewards.append(es.reward)
            step_types.append(es.step_type)
            for k, v in es.env_info.items():
                env_infos[k].append(v)
        self._env_steps = []

        agent_infos = self._agent_infos
        self._agent_infos = defaultdict(list)
        for k, v in agent_infos.items():
            agent_infos[k] = np.asarray(v)

        for k, v in env_infos.items():
            env_infos[k] = np.asarray(v)

        episode_infos = self._episode_infos
        self._episode_infos = defaultdict(list)
        for k, v in episode_infos.items():
            episode_infos[k] = np.asarray(v)

        lengths = self._lengths
        self._lengths = []
        return EpisodeBatch(env_spec=self.env.spec,
                            episode_infos=episode_infos,
                            observations=np.asarray(observations),
                            last_observations=np.asarray(last_observations),
                            actions=np.asarray(actions),
                            rewards=np.asarray(rewards),
                            step_types=np.asarray(step_types, dtype=StepType),
                            env_infos=dict(env_infos),
                            agent_infos=dict(agent_infos),
                            lengths=np.asarray(lengths, dtype='i'))
Example #14
def test_log_multitask_performance_task_id():
    lengths = np.array([10, 5, 1, 1])
    batch = EpisodeBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        step_types=np.array([StepType.MID] * sum(lengths), dtype=StepType),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool),
            'task_id':
            np.array([1] * 10 + [3] * 5 + [1] + [4])
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_multitask_performance(7, batch, 0.8, {
        1: 'env1',
        3: 'env2',
        4: 'env3',
        5: 'env4'
    })
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['env1/Iteration'] == 7
    assert res['env2/Iteration'] == 7
    assert res['env3/Iteration'] == 7
    assert res['env4/Iteration'] == 7
    assert res['env1/NumEpisodes'] == 2
    assert res['env2/NumEpisodes'] == 1
    assert res['env3/NumEpisodes'] == 1
    assert res['env4/NumEpisodes'] == 0
    assert math.isclose(res['env1/SuccessRate'], 0.5)
    assert math.isclose(res['env2/SuccessRate'], 1.0)
    assert math.isclose(res['env3/SuccessRate'], 1.0)
    assert math.isnan(res['env4/SuccessRate'])
    assert math.isnan(res['env4/AverageReturn'])
Example #15
    def train(self, runner):
        """Get samples and train the policy.

        Args:
            runner (LocalRunner): LocalRunner.

        """
        for epoch in runner.step_epochs():
            samples = runner.obtain_samples(epoch)
            log_performance(epoch,
                            EpisodeBatch.from_list(self.env_spec, samples),
                            self._discount)
            self._train_once(epoch, samples)
Example #16
    def train(self, runner):
        """Obtain samplers and start actual training for each epoch.

        Args:
            runner (LocalRunner): Experiment runner.

        """
        for epoch in runner.step_epochs():
            samples = runner.obtain_samples(epoch)
            log_performance(epoch,
                            EpisodeBatch.from_list(self.env_spec, samples),
                            self._discount)
            self._train_once(samples)
Example #17
    def collect_episode(self):
        """Collect all completed episodes.

        Returns:
            EpisodeBatch: A batch of the episodes completed since the last call
                to collect_episode().

        """
        if len(self._completed_episodes) == 1:
            result = self._completed_episodes[0]
        else:
            result = EpisodeBatch.concatenate(*self._completed_episodes)
        self._completed_episodes = []
        return result
Example #18
def test_new_eps(eps_data):
    t = EpisodeBatch(**eps_data)
    assert t.env_spec is eps_data['env_spec']
    assert t.observations is eps_data['observations']
    assert t.last_observations is eps_data['last_observations']
    assert t.actions is eps_data['actions']
    assert t.rewards is eps_data['rewards']
    assert t.env_infos is eps_data['env_infos']
    assert t.agent_infos is eps_data['agent_infos']
    assert t.step_types is eps_data['step_types']
    assert t.lengths is eps_data['lengths']
    assert t.episode_infos_by_episode is eps_data['episode_infos']
    assert (t.episode_infos['task_one_hot'][0].shape ==
            eps_data['episode_infos']['task_one_hot'][0].shape)
Example #19
def test_log_performance():
    lengths = np.array([10, 5, 1, 1])
    batch = EpisodeBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        step_types=np.array(
            [StepType.FIRST] + [StepType.MID] * (lengths[0] - 2) +
            [StepType.TERMINAL] + [StepType.FIRST] + [StepType.MID] *
            (lengths[1] - 2) + [StepType.TERMINAL] + [StepType.FIRST] +
            [StepType.FIRST],
            dtype=StepType),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool)
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_performance(7, batch, 0.8, prefix='test_log_performance')
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['test_log_performance/Iteration'] == 7
    assert res['test_log_performance/NumEpisodes'] == 4
    assert math.isclose(res['test_log_performance/SuccessRate'], 0.75)
    assert math.isclose(res['test_log_performance/TerminationRate'], 0.5)
    assert math.isclose(res['test_log_performance/AverageDiscountedReturn'],
                        1.1131040640673113)
    assert math.isclose(res['test_log_performance/AverageReturn'],
                        2.1659965525)
    assert math.isclose(res['test_log_performance/StdReturn'],
                        2.354067152038576)
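The asserted figures follow directly from the rewards and lengths above: AverageReturn is the mean of the per-episode reward sums, and AverageDiscountedReturn is the mean of the per-episode discounted sums with gamma = 0.8. A small sketch that reproduces the discounted figure, reusing the lengths array and the batch from the test body (this mirrors what log_performance computes, as far as these assertions show):

gamma = 0.8
start = 0
discounted_returns = []
for length in lengths:
    episode_rewards = batch.rewards[start:start + length]
    discounted_returns.append(
        np.sum(episode_rewards * gamma**np.arange(length)))
    start += length
# Mean over the four episodes; roughly the 1.1131040640673113 asserted above.
print(np.mean(discounted_returns))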
Example #20
    def _concatenate_episodes(self, episode_list):
        """Concatenate episodes.

        The input list contains samples from different episodes of the same
        task/environment. In RL^2, paths within each meta batch are all
        concatenated into a single path and fed to the policy.

        Args:
            episode_list (list[EpisodeBatch]): Input paths. All paths are from
                different episodes, but the same task/environment.

        Returns:
            EpisodeBatch: Concatenated episode from the same task/environment.
                Shape of values: :math:`[max_episode_length * episode_per_task,
                S^*]`

        """
        env_infos = {
            k: np.concatenate([b.env_infos[k] for b in episode_list])
            for k in episode_list[0].env_infos.keys()
        }
        agent_infos = {
            k: np.concatenate([b.agent_infos[k] for b in episode_list])
            for k in episode_list[0].agent_infos.keys()
        }
        episode_infos = {
            k: np.concatenate([b.episode_infos[k] for b in episode_list])
            for k in episode_list[0].episode_infos.keys()
        }
        actions = np.concatenate([
            self._env_spec.action_space.flatten_n(ep.actions)
            for ep in episode_list
        ])

        return EpisodeBatch(
            env_spec=episode_list[0].env_spec,
            episode_infos=episode_infos,
            observations=np.concatenate(
                [ep.observations for ep in episode_list]),
            last_observations=episode_list[-1].last_observations,
            actions=actions,
            rewards=np.concatenate([ep.rewards for ep in episode_list]),
            env_infos=env_infos,
            agent_infos=agent_infos,
            step_types=np.concatenate([ep.step_types for ep in episode_list]),
            lengths=np.asarray([sum([ep.lengths[0] for ep in episode_list])]))
Example #21
    def _log_performance(self, itr, all_samples, loss_before, loss_after,
                         kl_before, kl, policy_entropy):
        """Evaluate performance of this batch.

        Args:
            itr (int): Iteration number.
            all_samples (list[list[_MAMLEpisodeBatch]]): Two
                dimensional list of _MAMLEpisodeBatch of size
                [meta_batch_size * (num_grad_updates + 1)]
            loss_before (float): Loss before optimization step.
            loss_after (float): Loss after optimization step.
            kl_before (float): KL divergence before optimization step.
            kl (float): KL divergence after optimization step.
            policy_entropy (float): Policy entropy.

        Returns:
            float: The average return in last epoch cycle.

        """
        tabular.record('Iteration', itr)

        name_map = None
        if hasattr(self._env, 'all_task_names'):
            names = self._env.all_task_names
            name_map = dict(zip(names, names))

        rtns = log_multitask_performance(
            itr,
            EpisodeBatch.from_list(
                env_spec=self._env.spec,
                paths=[
                    path for task_paths in all_samples
                    for path in task_paths[self._num_grad_updates].paths
                ]),
            discount=self._inner_algo.discount,
            name_map=name_map)

        with tabular.prefix(self._policy.name + '/'):
            tabular.record('LossBefore', loss_before)
            tabular.record('LossAfter', loss_after)
            tabular.record('dLoss', loss_before - loss_after)
            tabular.record('KLBefore', kl_before)
            tabular.record('KLAfter', kl)
            tabular.record('Entropy', policy_entropy)

        return np.mean(rtns)
Example #22
    def collect_episode(self):
        """Gather fragments from all in-progress episodes.

        Returns:
            EpisodeBatch: A batch of the episode fragments.

        """
        for i, frag in enumerate(self._fragments):
            assert frag.env is self._envs[i]
            if len(frag.rewards) > 0:
                complete_frag = frag.to_batch()
                self._complete_fragments.append(complete_frag)
                self._fragments[i] = InProgressEpisode(frag.env, frag.last_obs)
        assert len(self._complete_fragments) > 0
        result = EpisodeBatch.concatenate(*self._complete_fragments)
        self._complete_fragments = []
        return result
Example #23
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Average return.

        """
        undiscounted_returns = log_performance(itr,
                                               EpisodeBatch.from_list(
                                                   self._env_spec, paths),
                                               discount=self._discount)

        samples_data = self.paths_to_tensors(paths)

        samples_data['average_return'] = np.mean(undiscounted_returns)

        logger.log('Optimizing policy...')
        self.optimize_policy(itr, samples_data)
        return samples_data['average_return']
Example #24
    def _obtain_samples(self, trainer, epoch):
        """Obtain samples from self._source.

        Args:
            trainer (Trainer): Experiment trainer, which may be used to
                obtain samples.
            epoch (int): The current epoch.

        Returns:
            TimeStepBatch: Batch of samples.

        """
        if isinstance(self._source, Policy):
            batch = EpisodeBatch.from_list(self._env_spec,
                                           trainer.obtain_samples(epoch))
            log_performance(epoch, batch, 1.0, prefix='Expert')
            return batch
        else:
            batches = []
            while (sum(len(batch.actions)
                       for batch in batches) < self._batch_size):
                batches.append(next(self._source))
            return TimeStepBatch.concatenate(*batches)
Example #25
def slice_episodes(episodes, slice_size):
    sliced = []
    for eps in episodes.split():
        splits = math.ceil(eps.lengths[0] / slice_size)
        split_indices = np.array_split(np.arange(eps.lengths[0]), splits)
        next_obs = eps.next_observations
        for indices in split_indices:
            last_obs = np.asarray([next_obs[indices[-1]]])
            t = EpisodeBatch(
                env_spec=eps.env_spec,
                observations=eps.observations[indices],
                last_observations=last_obs,
                actions=eps.actions[indices],
                rewards=eps.rewards[indices],
                step_types=eps.step_types[indices],
                env_infos={k: v[indices]
                           for (k, v) in eps.env_infos.items()},
                agent_infos={
                    k: v[indices]
                    for (k, v) in eps.agent_infos.items()
                },
                lengths=np.asarray([len(indices)], dtype='l'))
            sliced.append(t)
    return sliced
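A hypothetical usage of this helper, where collected_episodes stands for any EpisodeBatch (for example, one returned by a sampler's collect_episode() as in Examples #13 and #17): split long episodes into windows of at most 50 steps, then bundle the windows back into a single batch.

windows = slice_episodes(collected_episodes, slice_size=50)
rebatched = EpisodeBatch.concatenate(*windows)
# Slicing only moves episode boundaries (lengths and last_observations);
# the total number of steps is unchanged.
assert sum(rebatched.lengths) == sum(collected_episodes.lengths)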
Example #26
    def _gather_episode(self, episode_number, last_observation):
        assert 0 < self._episode_lengths[
            episode_number] <= self._max_episode_length
        env_infos = self._env_infos[episode_number]
        agent_infos = self._agent_infos[episode_number]
        episode_infos = self._episode_infos[episode_number]
        for k, v in env_infos.items():
            env_infos[k] = np.asarray(v)
        for k, v in agent_infos.items():
            agent_infos[k] = np.asarray(v)
        for k, v in episode_infos.items():
            episode_infos[k] = np.asarray(v)
        eps = EpisodeBatch(
            env_spec=self._envs[episode_number].spec,
            episode_infos=dict(episode_infos),
            observations=np.asarray(self._observations[episode_number]),
            last_observations=np.asarray([last_observation]),
            actions=np.asarray(self._actions[episode_number]),
            rewards=np.asarray(self._rewards[episode_number]),
            step_types=np.asarray(self._step_types[episode_number],
                                  dtype=StepType),
            env_infos=dict(env_infos),
            agent_infos=dict(agent_infos),
            lengths=np.asarray([self._episode_lengths[episode_number]],
                               dtype='l'))

        self._completed_episodes.append(eps)
        self._observations[episode_number] = []
        self._actions[episode_number] = []
        self._rewards[episode_number] = []
        self._step_types[episode_number] = []
        self._episode_lengths[episode_number] = 0
        self._prev_obs[episode_number] = self._envs[episode_number].reset()[0]
        self._env_infos[episode_number] = collections.defaultdict(list)
        self._agent_infos[episode_number] = collections.defaultdict(list)
        self._episode_infos[episode_number] = collections.defaultdict(list)
Example #27
def test_episodes_padding_tensors(eps_data):
    t = EpisodeBatch(**eps_data)
    N = len(t.lengths)
    max_ep_l = t.env_spec.max_episode_length

    observations = t.padded_observations
    actions = t.padded_actions
    rewards = t.padded_rewards
    valids = t.valids
    agent_infos = t.padded_agent_infos

    assert observations.shape == (N, max_ep_l, *t.observations[0].shape)
    assert actions.shape == (N, max_ep_l, *t.actions[0].shape)
    assert rewards.shape == (N, max_ep_l)
    assert valids.shape == (N, max_ep_l)
    assert agent_infos.keys() == t.agent_infos.keys()
    for key in agent_infos.keys():
        assert agent_infos[key].shape == (N, max_ep_l,
                                          *t.agent_infos[key][0].shape)

    start = 0
    for i, length in enumerate(t.lengths):
        stop = start + length
        assert (observations[i][:length] == t.observations[start:stop]).all()
        assert np.count_nonzero(observations[i][length:]) == 0
        assert (actions[i][:length] == t.actions[start:stop]).all()
        assert np.count_nonzero(actions[i][length:]) == 0
        assert (rewards[i][:length] == t.rewards[start:stop]).all()
        assert np.count_nonzero(rewards[i][length:]) == 0
        assert (valids[i][:length] == np.ones((length, ))).all()
        assert np.count_nonzero(valids[i][length:]) == 0
        for key in agent_infos.keys():
            assert (agent_infos[key][i][:length] == t.agent_infos[key]
                    [start:stop]).all()
            assert np.count_nonzero(agent_infos[key][i][length:]) == 0
        start = stop
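The padded views and the valids mask exist so that episodes of different lengths can be stacked into fixed-size tensors, with the mask marking which entries are real steps. A tiny illustrative sketch using the arrays from this test: averaging rewards over valid steps only, so the zero padding does not bias the result.

# Mean reward over real (unpadded) steps; rewards and valids are the
# (N, max_ep_l) arrays computed in the test above.
masked_mean_reward = (rewards * valids).sum() / valids.sum()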
Example #28
    def obtain_samples(self, num_samples, agent_update, env_updates=None):
        """Sample the policy for new episodes.

        Args:
            num_samples (int): Number of steps the sampler should collect.
            agent_update (object): Value which will be passed into the
                `agent_update_fn` before sampling episodes. If a list is passed
                in, it must have length exactly `factory.n_workers`, and will
                be spread across the workers.
            env_updates (object): Value which will be passed into the
                `env_update_fn` before sampling episodes. If a list is passed
                in, it must have length exactly `factory.n_workers`, and will
                be spread across the workers.

        Returns:
            EpisodeBatch: Batch of gathered episodes.

        """

        self.update_workers(agent_update, env_updates)
        completed_samples = 0
        batches = []

        # TODO: can we replace the while loop, so all processes are scheduled beforehand?
        while completed_samples < num_samples:
            pids = [w.rollout.remote() for w in self.workers]
            results = [ray.get(pid) for pid in pids]
            for episode_batch in results:
                num_returned_samples = episode_batch.lengths.sum()
                completed_samples += num_returned_samples
                batches.append(episode_batch)

        # Note: EpisodeBatch takes care of concatenating - is this a performance issue?
        samples = EpisodeBatch.concatenate(*batches)
        self.total_env_steps += sum(samples.lengths)
        return samples
Example #29
    def train_once(self, itr, paths):
        """Train the algorithm once.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Calculated mean value of undiscounted returns.

        """
        obs, actions, rewards, returns, valids, baselines = \
            self.process_samples(paths)

        if self._maximum_entropy:
            policy_entropies = self._compute_policy_entropy(obs)
            rewards += self._policy_ent_coeff * policy_entropies

        obs_flat = torch.cat(filter_valids(obs, valids))
        actions_flat = torch.cat(filter_valids(actions, valids))
        rewards_flat = torch.cat(filter_valids(rewards, valids))
        returns_flat = torch.cat(filter_valids(returns, valids))
        advs_flat = self._compute_advantage(rewards, valids, baselines)

        with torch.no_grad():
            policy_loss_before = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_before = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_before = self._compute_kl_constraint(obs)

        self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                    advs_flat)

        with torch.no_grad():
            policy_loss_after = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_after = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_after = self._compute_kl_constraint(obs)
            policy_entropy = self._compute_policy_entropy(obs)

        with tabular.prefix(self.policy.name):
            tabular.record('/LossBefore', policy_loss_before.item())
            tabular.record('/LossAfter', policy_loss_after.item())
            tabular.record('/dLoss',
                           (policy_loss_before - policy_loss_after).item())
            tabular.record('/KLBefore', kl_before.item())
            tabular.record('/KL', kl_after.item())
            tabular.record('/Entropy', policy_entropy.mean().item())

        with tabular.prefix(self._value_function.name):
            tabular.record('/LossBefore', vf_loss_before.item())
            tabular.record('/LossAfter', vf_loss_after.item())
            tabular.record('/dLoss',
                           vf_loss_before.item() - vf_loss_after.item())

        self._old_policy.load_state_dict(self.policy.state_dict())

        undiscounted_returns = log_performance(itr,
                                               EpisodeBatch.from_list(
                                                   self._env_spec, paths),
                                               discount=self.discount)
        return np.mean(undiscounted_returns)
Example #30
    def test_episode_batch_to_timestep_batch(self, eps_data):
        t = EpisodeBatch(**eps_data)
        replay_buffer = PathBuffer(capacity_in_transitions=100)
        replay_buffer.add_episode_batch(t)
        timesteps = replay_buffer.sample_timesteps(10)
        assert len(timesteps.rewards) == 10