Example #1
    def _evaluate(self, policy_opt_input_values, samples_data):
        """Evaluate rewards and everything else.

        Args:
            policy_opt_input_values (list[np.ndarray]): Flattened
                policy optimization input values.
            samples_data (dict): Processed sample data.
                See process_samples() for details.

        Returns:
            dict: Processed sample data.

        """
        # pylint: disable=too-many-statements
        # Augment reward from baselines
        rewards_tensor = self._f_rewards(*policy_opt_input_values)
        returns_tensor = self._f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor, -1)

        paths = samples_data['paths']
        valids = samples_data['valids']
        baselines = [path['baselines'] for path in paths]
        env_rewards = [path['rewards'] for path in paths]
        env_rewards = concat_tensor_list(env_rewards.copy())
        env_returns = [path['returns'] for path in paths]
        env_returns = concat_tensor_list(env_returns.copy())
        env_average_discounted_return = (np.mean(
            [path['returns'][0] for path in paths]))

        # Recompute parts of samples_data
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                       paths):
            path['rewards'] = rew[val.astype(bool)]
            path['returns'] = ret[val.astype(bool)]
            aug_rewards.append(path['rewards'])
            aug_returns.append(path['returns'])
        aug_rewards = concat_tensor_list(aug_rewards)
        aug_returns = concat_tensor_list(aug_returns)
        samples_data['rewards'] = aug_rewards
        samples_data['returns'] = aug_returns

        # Calculate effect of the entropy terms
        d_rewards = np.mean(aug_rewards - env_rewards)
        tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards)

        aug_average_discounted_return = (np.mean(
            [path['returns'][0] for path in paths]))
        d_returns = np.mean(aug_average_discounted_return -
                            env_average_discounted_return)
        tabular.record('{}/EntReturns'.format(self.policy.name), d_returns)

        # Calculate explained variance
        ev = explained_variance_1d(np.concatenate(baselines), aug_returns)
        tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

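        # Measure how well the inference network's predicted latent means
        # match the latents actually sampled during the rollouts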
        inference_rmse = (samples_data['trajectory_infos']['mean'] -
                          samples_data['latents'])**2.
        inference_rmse = np.sqrt(inference_rmse.mean())
        tabular.record('Inference/RMSE', inference_rmse)

        inference_rrse = rrse(samples_data['latents'],
                              samples_data['trajectory_infos']['mean'])
        tabular.record('Inference/RRSE', inference_rrse)

        embed_ent = self._f_encoder_entropy(*policy_opt_input_values)
        tabular.record('{}/Encoder/Entropy'.format(self.policy.name),
                       embed_ent)

        infer_ce = self._f_inference_ce(*policy_opt_input_values)
        tabular.record('Inference/CrossEntropy', infer_ce)

        pol_ent = self._f_policy_entropy(*policy_opt_input_values)
        pol_ent = np.sum(pol_ent) / np.sum(samples_data['valids'])
        tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

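        # Per-task statistics: mean episode length, termination rate and entropy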
        task_ents = self._f_task_entropies(*policy_opt_input_values)
        tasks = samples_data['tasks'][:, 0, :]
        _, task_indices = np.nonzero(tasks)
        path_lengths = np.sum(samples_data['valids'], axis=1)
        for t in range(self.policy.task_space.flat_dim):
            lengths = path_lengths[task_indices == t]
            completed = lengths < self.max_episode_length
            pct_completed = np.mean(completed)
            tabular.record('Tasks/EpisodeLength/t={}'.format(t),
                           np.mean(lengths))
            tabular.record('Tasks/TerminationRate/t={}'.format(t),
                           pct_completed)
            tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t])

        return samples_data
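
The metrics above lean on two small numeric helpers. Below is a minimal NumPy sketch of the conventional definitions assumed here for explained_variance_1d and rrse; the library's actual implementations may differ in detail:

import numpy as np

def explained_variance_1d(ypred, y):
    """Fraction of the variance of y explained by ypred.

    1.0 is a perfect fit; 0.0 means ypred is no better than predicting
    the mean of y.
    """
    var_y = np.var(y)
    if np.isclose(var_y, 0):
        return 0.0
    return 1.0 - np.var(y - ypred) / var_y

def rrse(actual, predicted):
    """Root relative squared error of predicted with respect to actual."""
    return np.sqrt(np.sum((predicted - actual)**2) /
                   np.sum((actual - np.mean(actual))**2))

An ExplainedVariance close to 1 indicates the baseline tracks the augmented returns well, and an RRSE near 0 indicates the inference network recovers the sampled latents.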
Example #2
    def _evaluate(self, policy_opt_input_values, episodes, baselines,
                  embed_ep_infos):
        """Evaluate rewards and everything else.

        Args:
            policy_opt_input_values (list[np.ndarray]): Flattened
                policy optimization input values.
            episodes (EpisodeBatch): Batch of episodes.
            baselines (np.ndarray): Baseline predictions.
            embed_ep_infos (dict): Embedding distribution information.

        Returns:
            dict: Paths for fitting the baseline.

        """
        # pylint: disable=too-many-statements
        fit_paths = []
        valids = episodes.valids
        observations = episodes.padded_observations
        tasks = pad_batch_array(episodes.env_infos['task_onehot'],
                                episodes.lengths, self.max_episode_length)
        latents = pad_batch_array(episodes.agent_infos['latent'],
                                  episodes.lengths, self.max_episode_length)
        baselines_list = []
        for baseline, valid in zip(baselines, valids):
            baselines_list.append(baseline[valid.astype(bool)])

        # Augment reward from baselines
        rewards_tensor = self._f_rewards(*policy_opt_input_values)
        returns_tensor = self._f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor, -1)

        env_rewards = episodes.rewards
        env_returns = [
            discount_cumsum(rwd, self._discount)
            for rwd in episodes.padded_rewards
        ]
        env_average_discounted_return = np.mean(
            [ret[0] for ret in env_returns])

        # Recompute returns and prepare paths for fitting the baseline
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, task, latent, obs in zip(rewards_tensor,
                                                    returns_tensor, valids,
                                                    tasks, latents,
                                                    observations):
            returns = ret[val.astype(bool)]
            task = task[val.astype(bool)]
            latent = latent[val.astype(bool)]
            obs = obs[val.astype(bool)]

            aug_rewards.append(rew[val.astype(bool)])
            aug_returns.append(returns)
            fit_paths.append(
                dict(observations=obs,
                     tasks=task,
                     latents=latent,
                     returns=returns))
        aug_rewards = concat_tensor_list(aug_rewards)
        aug_returns = concat_tensor_list(aug_returns)

        # Calculate effect of the entropy terms
        d_rewards = np.mean(aug_rewards - env_rewards)
        tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards)

        aug_average_discounted_return = (np.mean(
            [ret[0] for ret in returns_tensor]))
        d_returns = np.mean(aug_average_discounted_return -
                            env_average_discounted_return)
        tabular.record('{}/EntReturns'.format(self.policy.name), d_returns)

        # Calculate explained variance
        ev = explained_variance_1d(np.concatenate(baselines_list), aug_returns)
        tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

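        # Measure how well the inference network's predicted latent means
        # match the latents actually sampled during the rollouts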
        inference_rmse = (embed_ep_infos['mean'] - latents)**2.
        inference_rmse = np.sqrt(inference_rmse.mean())
        tabular.record('Inference/RMSE', inference_rmse)

        inference_rrse = rrse(latents, embed_ep_infos['mean'])
        tabular.record('Inference/RRSE', inference_rrse)

        embed_ent = self._f_encoder_entropy(*policy_opt_input_values)
        tabular.record('{}/Encoder/Entropy'.format(self.policy.name),
                       embed_ent)

        infer_ce = self._f_inference_ce(*policy_opt_input_values)
        tabular.record('Inference/CrossEntropy', infer_ce)

        pol_ent = self._f_policy_entropy(*policy_opt_input_values)
        pol_ent = np.sum(pol_ent) / np.sum(episodes.lengths)
        tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

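        # Per-task statistics: mean episode length, termination rate and entropy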
        task_ents = self._f_task_entropies(*policy_opt_input_values)
        tasks = tasks[:, 0, :]
        _, task_indices = np.nonzero(tasks)
        path_lengths = np.sum(valids, axis=1)
        for t in range(self.policy.task_space.flat_dim):
            lengths = path_lengths[task_indices == t]
            completed = lengths < self.max_episode_length
            pct_completed = np.mean(completed)
            tabular.record('Tasks/EpisodeLength/t={}'.format(t),
                           np.mean(lengths))
            tabular.record('Tasks/TerminationRate/t={}'.format(t),
                           pct_completed)
            tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t])

        return fit_paths
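
Example #2 additionally relies on discount_cumsum and pad_batch_array to rebuild padded per-episode arrays from the flat episode batch. A rough, self-contained sketch of the behavior assumed above (the real helpers may be vectorized differently; discount_cumsum, for instance, is often implemented with scipy.signal.lfilter):

import numpy as np

def discount_cumsum(x, discount):
    """Discounted cumulative sum: out[t] = sum_k discount**k * x[t + k]."""
    out = np.zeros_like(x, dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def pad_batch_array(array, lengths, max_length):
    """Split a flat (sum(lengths), ...) array into per-episode rows,
    zero-padded to max_length along the time axis.
    """
    padded = np.zeros((len(lengths), max_length) + array.shape[1:],
                      dtype=array.dtype)
    start = 0
    for i, length in enumerate(lengths):
        padded[i, :length] = array[start:start + length]
        start += length
    return padded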