Example #1
    def _train_once(self, epoch, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            epoch (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            float: The average return of the epoch cycle.

        """
        returns = []
        for path in paths:
            returns.append(discount_cumsum(path['rewards'], self._discount))
        avg_return = np.mean(np.concatenate(returns))
        self._all_avg_returns.append(avg_return)

        if (epoch + 1) % self._n_samples == 0:
            avg_rtns = np.array(self._all_avg_returns)
            best_inds = np.argsort(-avg_rtns)[:self._n_best]
            best_params = np.array(self._all_params)[best_inds]
            self._cur_mean = best_params.mean(axis=0)
            self._cur_std = best_params.std(axis=0)
            self.policy.set_param_values(self._cur_mean)
            avg_return = max(self._all_avg_returns)
            self._all_avg_returns.clear()
            self._all_params.clear()

        self._cur_params = self._sample_params(epoch)
        self._all_params.append(self._cur_params.copy())
        self.policy.set_param_values(self._cur_params)

        return avg_return
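All of these examples lean on the same discount_cumsum helper. For reference, here is a minimal sketch of the garage/rllab-style implementation; the exact upstream code may differ slightly:

import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    """Discounted cumulative sum: y[t] = sum_k discount**k * x[t + k]."""
    # Running lfilter over the reversed sequence implements the backward
    # recursion y[t] = x[t] + discount * y[t + 1].
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1],
                                axis=0)[::-1]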
Example #2
    def _process_samples(self, episodes):
        """Process sample data based on the collected paths.

        Args:
            episodes (EpisodeBatch): Collected batch of episodes.

        Returns:
            _MAMLEpisodeBatch: Processed sample data.

        """
        paths = episodes.to_list()
        for path in paths:
            path['returns'] = discount_cumsum(
                path['rewards'], self._inner_algo.discount).copy()

        self._train_value_function(paths)

        obs = torch.Tensor(episodes.padded_observations)
        actions = torch.Tensor(episodes.padded_actions)
        rewards = torch.Tensor(episodes.padded_rewards)
        valids = torch.Tensor(episodes.lengths).int()
        with torch.no_grad():
            # pylint: disable=protected-access
            baselines = self._inner_algo._value_function(obs)

        return _MAMLEpisodeBatch(paths, obs, actions, rewards, valids,
                                 baselines)
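The _MAMLEpisodeBatch container used in Examples #2, #4 and #5 is not shown here. Judging from how it is constructed and accessed (task_samples[0][1:], last_update.rewards), a plausible definition is a namedtuple along the following lines; the field names are an assumption, not the upstream definition:

import collections

# Hedged sketch: the real definition lives alongside the MAML code and may
# differ. The field order below matches how the tuple is built above.
_MAMLEpisodeBatch = collections.namedtuple(
    '_MAMLEpisodeBatch',
    ['paths', 'observations', 'actions', 'rewards', 'valids', 'baselines'])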
Example #3
    def _process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths for each
                task. In RL^2, there are n environments/tasks and paths in
                each of them will be concatenated at some point and fed to
                the policy.

        Returns:
            EpisodeBatch: Processed batch of episodes for feeding the inner
                algorithm.
            numpy.float64: The average return.

        Raises:
            ValueError: If 'batch_idx' is not found.

        """
        concatenated_paths = []

        paths_by_task = collections.defaultdict(list)
        for path in paths:
            path['returns'] = discount_cumsum(path['rewards'], self._discount)
            path['lengths'] = [len(path['rewards'])]
            if 'batch_idx' in path:
                paths_by_task[path['batch_idx']].append(path)
            elif 'batch_idx' in path['agent_infos']:
                paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
            else:
                raise ValueError(
                    'Batch idx is required for RL2 but was not found. '
                    'Make sure to use garage.tf.algos.rl2.RL2Worker '
                    'for sampling.')

        # all paths in paths_by_task[i] are sampled from task[i]
        for _paths in paths_by_task.values():
            concatenated_path = self._concatenate_paths(_paths)
            concatenated_paths.append(concatenated_path)

        name_map = None
        if hasattr(self._task_sampler, '_envs') and hasattr(
                self._task_sampler._envs[0]._env, 'all_task_names'):
            names = [
                env._env.all_task_names[0] for env in self._task_sampler._envs
            ]
            name_map = dict(enumerate(names))

        undiscounted_returns = log_multitask_performance(
            itr,
            EpisodeBatch.from_list(self._env_spec, paths),
            self._inner_algo._discount,
            name_map=name_map)

        average_return = np.mean(undiscounted_returns)
        episodes = EpisodeBatch.from_list(self._env_spec, concatenated_paths)

        return episodes, average_return
Example #4
    def _compute_meta_loss(self, all_samples, all_params, set_grad=True):
        """Compute loss to meta-optimize.

        Args:
            all_samples (list[list[_MAMLEpisodeBatch]]): A two
                dimensional list of _MAMLEpisodeBatch of size
                [meta_batch_size * (num_grad_updates + 1)]
            all_params (list[dict]): A list of named parameter dictionaries.
                Each dictionary contains key value pair of names (str) and
                parameters (torch.Tensor).
            set_grad (bool): Whether to enable gradient calculation or not.

        Returns:
            torch.Tensor: Calculated mean value of loss.

        """
        theta = dict(self._policy.named_parameters())
        old_theta = dict(self._old_policy.named_parameters())

        losses = []
        for task_samples, task_params in zip(all_samples, all_params):
            with torch.set_grad_enabled(set_grad):
                # SG-MRL specific
                # pylint: disable=protected-access
                initial_samples = task_samples[0]
                init_log_probs = self._inner_algo._compute_log_probs(
                    *initial_samples[1:])

            for i in range(self._num_grad_updates):
                require_grad = i < self._num_grad_updates - 1 or set_grad
                self._adapt(task_samples[i], set_grad=require_grad)

            update_module_params(self._old_policy, task_params)
            with torch.set_grad_enabled(set_grad):
                # pylint: disable=protected-access
                last_update = task_samples[-1]
                loss = self._inner_algo._compute_loss(*last_update[1:])

            # SG-MRL specific
            with torch.set_grad_enabled(False):
                # Treat the adapted rewards as a constant w.r.t. the meta-update.
                adapted_rewards = last_update.rewards.detach().clone().numpy()
                j_tilde = np.mean([
                    discount_cumsum(rewards, self._inner_algo.discount)[0]
                    for rewards in adapted_rewards
                ])

            # SG-MRL specific
            loss += j_tilde * init_log_probs

            losses.append(loss)

            update_module_params(self._policy, theta)
            update_module_params(self._old_policy, old_theta)

        return torch.stack(losses).mean()
Example #5
    def _process_samples(self, paths):
        """Process sample data based on the collected paths.

        Args:
            paths (list[dict]): A list of collected paths.

        Returns:
            _MAMLEpisodeBatch: Processed sample data.

        """
        for path in paths:
            path['returns'] = discount_cumsum(
                path['rewards'], self._inner_algo.discount).copy()

        self._train_value_function(paths)
        obs, actions, rewards, _, valids, baselines = self._inner_algo._process_samples(  # pylint: disable=protected-access # noqa: E501
            paths)
        return _MAMLEpisodeBatch(paths, obs, actions, rewards, valids,
                                 baselines)
Example #6
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of episodes.

    Args:
        itr (int): Iteration number.
        batch (EpisodeBatch): The episodes to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    for eps in batch.split():
        returns.append(discount_cumsum(eps.rewards, discount))
        undiscounted_returns.append(sum(eps.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in eps.step_types)))
        if 'success' in eps.env_infos:
            success.append(float(eps.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumEpisodes', len(returns))

        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('TerminationRate', np.mean(termination))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
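Since discount_cumsum places the discounted return of the whole episode at index 0, rtn[0] above is exactly the quantity averaged into AverageDiscountedReturn. A quick numeric check, assuming the standard definition sketched after Example #1:

import numpy as np

rewards = np.array([1.0, 1.0, 1.0])
discount = 0.9
# Expected: [1 + 0.9 + 0.81, 1 + 0.9, 1] = [2.71, 1.9, 1.0]
returns = discount_cumsum(rewards, discount)
assert np.isclose(returns[0], 2.71)  # full-episode discounted return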
Example #7
    def _process_samples(self, paths):
        r"""Process sample data based on the collected paths.

        Notes: P is the maximum episode length (self.max_episode_length)

        Args:
            paths (list[dict]): A list of collected paths.

        Returns:
            torch.Tensor: The observations of the environment
                with shape :math:`(N, P, O*)`.
            torch.Tensor: The actions fed to the environment
                with shape :math:`(N, P, A*)`.
            torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
            torch.Tensor: The discounted returns with shape :math:`(N, P)`.
            list[int]: Numbers of valid steps in each path.
            torch.Tensor: Value function estimation at each step
                with shape :math:`(N, P)`.

        """
        valids = torch.Tensor([len(path['actions']) for path in paths]).int()
        obs = torch.stack([
            pad_to_last(path['observations'],
                        total_length=self.max_episode_length,
                        axis=0) for path in paths
        ])
        actions = torch.stack([
            pad_to_last(path['actions'],
                        total_length=self.max_episode_length,
                        axis=0) for path in paths
        ])
        rewards = torch.stack([
            pad_to_last(path['rewards'], total_length=self.max_episode_length)
            for path in paths
        ])
        returns = torch.stack([
            pad_to_last(discount_cumsum(path['rewards'], self.discount).copy(),
                        total_length=self.max_episode_length) for path in paths
        ])
        with torch.no_grad():
            baselines = self._value_function(obs)

        return obs, actions, rewards, returns, valids, baselines
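pad_to_last is used above to batch variable-length paths into fixed (N, P, ...) tensors. A hypothetical stand-in with the same call signature is sketched below; the real garage helper may differ in defaults and edge cases:

import numpy as np
import torch


def pad_to_last(nums, total_length, axis=0, val=0.0):
    """Zero-pad along ``axis`` to ``total_length`` and return a tensor."""
    arr = np.asarray(nums, dtype=np.float32)
    pad_width = [(0, 0)] * arr.ndim
    pad_width[axis] = (0, total_length - arr.shape[axis])
    return torch.from_numpy(np.pad(arr, pad_width, constant_values=val))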
Example #8
    def _train_once(self, samples):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            samples (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Average loss across the sampled paths.

        """
        losses = []
        self._policy_opt.zero_grad()
        for path in samples:
            returns_numpy = discount_cumsum(path['rewards'], self._discount)
            returns = torch.Tensor(returns_numpy.copy())
            obs = torch.Tensor(path['observations'])
            actions = torch.Tensor(path['actions'])
            dist = self.policy(obs)[0]
            log_likelihoods = dist.log_prob(actions)
            loss = (-log_likelihoods * returns).mean()
            loss.backward()
            losses.append(loss.item())
        self._policy_opt.step()
        return np.mean(losses)
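The .copy() on returns_numpy is not cosmetic: the lfilter-based discount_cumsum returns a reversed, negatively strided view, and torch's from_numpy path rejects such arrays. A minimal illustration, assuming that implementation:

import numpy as np
import torch

reversed_view = np.arange(5.0)[::-1]  # reversed view with negative strides
# torch.from_numpy(reversed_view) raises ValueError on negative strides,
# so the examples call .copy() to obtain a contiguous array first.
t = torch.from_numpy(reversed_view.copy())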
Example #9
    def _train_once(self, samples):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            samples (list[dict]): A list of collected samples.

        Returns:
            numpy.float64: Mean of the per-step discounted returns.

        """
        obs = np.concatenate([path['observations'] for path in samples])
        actions = np.concatenate([path['actions'] for path in samples])
        returns = []
        for path in samples:
            returns.append(discount_cumsum(path['rewards'], self._discount))
        returns = np.concatenate(returns)
        sess = tf.compat.v1.get_default_session()
        sess.run(self._train_op,
                 feed_dict={
                     self._observation: obs,
                     self._action: actions,
                     self._returns: returns,
                 })
        return np.mean(returns)
Example #10
def log_performance(itr, batch, discount, prefix="Evaluation", use_wandb=True):
    """Evaluate the performance of an algorithm on a batch of episodes.

    Args:
        itr (int): Iteration number.
        batch (EpisodeBatch): The episodes to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.
        use_wandb (bool): Whether to also build a dict of the same metrics
            for wandb logging.

    Returns:
        numpy.ndarray: Undiscounted returns.
        dict or None: The logged metrics keyed for wandb, or None when
            use_wandb is False.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    rewards = []
    grasp_success = []
    near_object = []
    episode_mean_grasp_reward = []
    episode_max_grasp_reward = []
    episode_min_grasp_reward = []
    episode_mean_in_place_reward = []
    episode_max_in_place_reward = []
    episode_min_in_place_reward = []
    for eps in batch.split():
        rewards.append(eps.rewards)
        returns.append(discount_cumsum(eps.rewards, discount))
        undiscounted_returns.append(sum(eps.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in eps.step_types)))
        if "success" in eps.env_infos:
            success.append(float(eps.env_infos["success"].any()))
        if "grasp_success" in eps.env_infos:
            grasp_success.append(float(eps.env_infos["grasp_success"].any()))
        if "near_object" in eps.env_infos:
            near_object.append(float(eps.env_infos["near_object"].any()))
        if "grasp_reward" in eps.env_infos:
            episode_mean_grasp_reward.append(
                np.mean(eps.env_infos["grasp_reward"]))
            episode_max_grasp_reward.append(max(eps.env_infos["grasp_reward"]))
            episode_min_grasp_reward.append(min(eps.env_infos["grasp_reward"]))
        if "in_place_reward" in eps.env_infos:
            episode_mean_in_place_reward.append(
                np.mean(eps.env_infos["in_place_reward"]))
            episode_max_in_place_reward.append(
                max(eps.env_infos["in_place_reward"]))
            episode_min_in_place_reward.append(
                min(eps.env_infos["in_place_reward"]))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + "/"):
        tabular.record("Iteration", itr)
        tabular.record("NumEpisodes", len(returns))
        tabular.record("MinReward", np.min(rewards))
        tabular.record("MaxReward", np.max(rewards))
        tabular.record("AverageDiscountedReturn", average_discounted_return)
        tabular.record("AverageReturn", np.mean(undiscounted_returns))
        tabular.record("StdReturn", np.std(undiscounted_returns))
        tabular.record("MaxReturn", np.max(undiscounted_returns))
        tabular.record("MinReturn", np.min(undiscounted_returns))
        tabular.record("TerminationRate", np.mean(termination))
        if success:
            tabular.record("SuccessRate", np.mean(success))
        if grasp_success:
            tabular.record("GraspSuccessRate", np.mean(grasp_success))
        if near_object:
            tabular.record("NearObject", np.mean(near_object))
        if episode_mean_grasp_reward:
            tabular.record("EpisodeMeanGraspReward",
                           np.mean(episode_mean_grasp_reward))
            tabular.record("EpisodeMeanMaxGraspReward",
                           np.mean(episode_max_grasp_reward))
            tabular.record("EpisodeMeanMinGraspReward",
                           np.mean(episode_min_grasp_reward))
        if episode_mean_in_place_reward:
            tabular.record("EpisodeMeanInPlaceReward",
                           np.mean(episode_mean_in_place_reward))
            tabular.record("EpisodeMeanMaxInPlaceReward",
                           np.mean(episode_max_in_place_reward))
            tabular.record("EpisodeMeanMinInPlaceReward",
                           np.mean(episode_min_in_place_reward))

    log_dict = None
    if use_wandb:
        log_dict = {}
        log_dict[prefix + "/Iteration"] = itr
        log_dict[prefix + "/NumEpisodes"] = len(returns)
        log_dict[prefix + "/MinReward"] = np.min(rewards)
        log_dict[prefix + "/MaxReward"] = np.max(rewards)
        log_dict[prefix + "/AverageDiscountedReturn"] = average_discounted_return
        log_dict[prefix + "AverageReturn"] = np.mean(undiscounted_returns)
        log_dict[prefix + "/StdReturn"] = np.std(undiscounted_returns)
        log_dict[prefix + "/MaxReturn"] = np.max(undiscounted_returns)
        log_dict[prefix + "/MinReturn"] = np.min(undiscounted_returns)
        log_dict[prefix + "/TerminationRate"] = np.mean(termination)

        if success:
            log_dict[prefix + "/SuccessRate"] = np.mean(success)
        if grasp_success:
            log_dict[prefix + "Misc/GraspSuccessRate"] = np.mean(grasp_success)
        if near_object:
            log_dict[prefix + "Misc/NearObject"] = np.mean(near_object)
        if episode_mean_grasp_reward:
            log_dict[prefix + "Misc/EpisodeMeanGraspReward"] = np.mean(episode_mean_grasp_reward)
            log_dict[prefix + "Misc/EpisodeMeanMaxGraspReward"] = np.mean(episode_max_grasp_reward)
            log_dict[prefix + "Misc/EpisodeMeanMinGraspReward"] = np.mean(episode_min_grasp_reward)
        if episode_mean_in_place_reward:
            log_dict[prefix + "Misc/EpisodeMeanInPlaceReward"] = np.mean(episode_mean_grasp_reward)
            log_dict[prefix + "Misc/EpisodeMeanMaxInPlaceReward"] = np.mean(episode_max_in_place_reward)
            log_dict[prefix + "Misc/EpisodeMeanMinInPlaceReward"] = np.mean(episode_min_in_place_reward)

    return undiscounted_returns, log_dict
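The returned log_dict is presumably meant to be forwarded to wandb. A minimal usage sketch, assuming wandb.init has already been called by the experiment script:

import wandb

undiscounted_returns, log_dict = log_performance(itr, batch, discount)
if log_dict is not None:
    wandb.log(log_dict, step=itr)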
Example #11
def paths_to_tensors(paths, max_episode_length, baseline_predictions, discount,
                     gae_lambda):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_episode_length (int): Maximum length of a single episode.
        baseline_predictions (numpy.ndarray): Predicted baseline values used
            for GAE (Generalized Advantage Estimation).
        discount (float): Environment reward discount.
        gae_lambda (float): Lambda used for generalized advantage
            estimation.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])

    """
    baselines = []
    returns = []
    total_steps = 0

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(baseline_predictions[idx], 0)
        deltas = (path['rewards'] + discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = discount_cumsum(deltas, discount * gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = discount_cumsum(path['rewards'], discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_episode_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_episode_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_episode_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_episode_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_episode_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_episode_length)

    lengths = np.asarray([v.sum() for v in valids])

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
    )

    return samples_data
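The advantages line above is GAE(lambda): the recursion A_t = delta_t + gamma * lambda * A_{t+1} is just a discounted cumulative sum of the TD residuals with discount gamma * lambda. A small sketch of the equivalent explicit backward loop, assuming the standard discount_cumsum semantics:

import numpy as np


def gae_by_recursion(deltas, discount, gae_lambda):
    """Compute GAE advantages via the explicit backward recursion."""
    advantages = np.zeros_like(deltas, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages

# Should match discount_cumsum(deltas, discount * gae_lambda) elementwise.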
Example #12
    def _evaluate(self, policy_opt_input_values, episodes, baselines,
                  embed_ep_infos):
        """Evaluate rewards and everything else.

        Args:
            policy_opt_input_values (list[np.ndarray]): Flattened
                policy optimization input values.
            episodes (EpisodeBatch): Batch of episodes.
            baselines (np.ndarray): Baseline predictions.
            embed_ep_infos (dict): Embedding distribution information.

        Returns:
            dict: Paths for fitting the baseline.

        """
        # pylint: disable=too-many-statements
        fit_paths = []
        valids = episodes.valids
        observations = episodes.padded_observations
        tasks = pad_batch_array(episodes.env_infos['task_onehot'],
                                episodes.lengths, self.max_episode_length)
        latents = pad_batch_array(episodes.agent_infos['latent'],
                                  episodes.lengths, self.max_episode_length)
        baselines_list = []
        for baseline, valid in zip(baselines, valids):
            baselines_list.append(baseline[valid.astype(bool)])

        # Augment reward from baselines
        rewards_tensor = self._f_rewards(*policy_opt_input_values)
        returns_tensor = self._f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor, -1)

        env_rewards = episodes.rewards
        env_returns = [
            discount_cumsum(rwd, self._discount)
            for rwd in episodes.padded_rewards
        ]
        env_average_discounted_return = np.mean(
            [ret[0] for ret in env_returns])

        # Recompute returns and prepare paths for fitting the baseline
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, task, latent, obs in zip(rewards_tensor,
                                                    returns_tensor, valids,
                                                    tasks, latents,
                                                    observations):
            returns = ret[val.astype(bool)]
            task = task[val.astype(bool)]
            latent = latent[val.astype(bool)]
            obs = obs[val.astype(bool)]

            aug_rewards.append(rew[val.astype(bool)])
            aug_returns.append(returns)
            fit_paths.append(
                dict(observations=obs,
                     tasks=task,
                     latents=latent,
                     returns=returns))
        aug_rewards = concat_tensor_list(aug_rewards)
        aug_returns = concat_tensor_list(aug_returns)

        # Calculate effect of the entropy terms
        d_rewards = np.mean(aug_rewards - env_rewards)
        tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards)

        aug_average_discounted_return = (np.mean(
            [ret[0] for ret in returns_tensor]))
        d_returns = np.mean(aug_average_discounted_return -
                            env_average_discounted_return)
        tabular.record('{}/EntReturns'.format(self.policy.name), d_returns)

        # Calculate explained variance
        ev = explained_variance_1d(np.concatenate(baselines_list), aug_returns)
        tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

        inference_rmse = (embed_ep_infos['mean'] - latents)**2.
        inference_rmse = np.sqrt(inference_rmse.mean())
        tabular.record('Inference/RMSE', inference_rmse)

        inference_rrse = rrse(latents, embed_ep_infos['mean'])
        tabular.record('Inference/RRSE', inference_rrse)

        embed_ent = self._f_encoder_entropy(*policy_opt_input_values)
        tabular.record('{}/Encoder/Entropy'.format(self.policy.name),
                       embed_ent)

        infer_ce = self._f_inference_ce(*policy_opt_input_values)
        tabular.record('Inference/CrossEntropy', infer_ce)

        pol_ent = self._f_policy_entropy(*policy_opt_input_values)
        pol_ent = np.sum(pol_ent) / np.sum(episodes.lengths)
        tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

        task_ents = self._f_task_entropies(*policy_opt_input_values)
        tasks = tasks[:, 0, :]
        _, task_indices = np.nonzero(tasks)
        path_lengths = np.sum(valids, axis=1)
        for t in range(self.policy.task_space.flat_dim):
            lengths = path_lengths[task_indices == t]
            completed = lengths < self.max_episode_length
            pct_completed = np.mean(completed)
            tabular.record('Tasks/EpisodeLength/t={}'.format(t),
                           np.mean(lengths))
            tabular.record('Tasks/TerminationRate/t={}'.format(t),
                           pct_completed)
            tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t])

        return fit_paths
Example #13
    def _train_once(self, itr, eps):
        """Train the algorithm once.

        Args:
            itr (int): Iteration number.
            eps (EpisodeBatch): A batch of collected paths.

        Returns:
            numpy.float64: Calculated mean value of undiscounted returns.

        """
        obs = torch.Tensor(eps.padded_observations)
        rewards = torch.Tensor(eps.padded_rewards)
        returns = torch.Tensor(
            np.stack([
                discount_cumsum(reward, self.discount)
                for reward in eps.padded_rewards
            ]))
        valids = eps.lengths
        with torch.no_grad():
            baselines = self._value_function(obs)

        if self._maximum_entropy:
            policy_entropies = self._compute_policy_entropy(obs)
            rewards += self._policy_ent_coeff * policy_entropies

        obs_flat = torch.Tensor(eps.observations)
        actions_flat = torch.Tensor(eps.actions)
        rewards_flat = torch.Tensor(eps.rewards)
        returns_flat = torch.cat(filter_valids(returns, valids))
        advs_flat = self._compute_advantage(rewards, valids, baselines)

        with torch.no_grad():
            policy_loss_before = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_before = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_before = self._compute_kl_constraint(obs)

        self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                    advs_flat)

        with torch.no_grad():
            policy_loss_after = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_after = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_after = self._compute_kl_constraint(obs)
            policy_entropy = self._compute_policy_entropy(obs)

        with tabular.prefix(self.policy.name):
            tabular.record('/LossBefore', policy_loss_before.item())
            tabular.record('/LossAfter', policy_loss_after.item())
            tabular.record('/dLoss',
                           (policy_loss_before - policy_loss_after).item())
            tabular.record('/KLBefore', kl_before.item())
            tabular.record('/KL', kl_after.item())
            tabular.record('/Entropy', policy_entropy.mean().item())

        with tabular.prefix(self._value_function.name):
            tabular.record('/LossBefore', vf_loss_before.item())
            tabular.record('/LossAfter', vf_loss_after.item())
            tabular.record('/dLoss',
                           vf_loss_before.item() - vf_loss_after.item())

        self._old_policy.load_state_dict(self.policy.state_dict())

        undiscounted_returns = log_performance(itr,
                                               eps,
                                               discount=self._discount)
        return np.mean(undiscounted_returns)
Example #14
    def _process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths for each
                task. In RL^2, there are n environments/tasks and paths in
                each of them will be concatenated at some point and fed to
                the policy.

        Returns:
            dict: Processed sample data, with keys
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        Raises:
            ValueError: If 'batch_idx' is not found.

        """
        concatenated_paths = []

        paths_by_task = collections.defaultdict(list)
        for path in paths:
            path['returns'] = discount_cumsum(path['rewards'], self._discount)
            path['lengths'] = [len(path['rewards'])]
            if 'batch_idx' in path:
                paths_by_task[path['batch_idx']].append(path)
            elif 'batch_idx' in path['agent_infos']:
                paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
            else:
                raise ValueError(
                    'Batch idx is required for RL2 but was not found. '
                    'Make sure to use garage.tf.algos.rl2.RL2Worker '
                    'for sampling.')

        # all paths in paths_by_task[i] are sampled from task[i]
        for _paths in paths_by_task.values():
            concatenated_path = self._concatenate_paths(_paths)
            concatenated_paths.append(concatenated_path)

        # stack and pad to max path length of the concatenated
        # path, which will be fed to inner algo
        # i.e. max_episode_length * episode_per_task
        concatenated_paths_stacked = (stack_and_pad_tensor_dict_list(
            concatenated_paths, self._inner_algo.max_episode_length))

        name_map = None
        if hasattr(self._task_sampler, '_envs') and hasattr(
                self._task_sampler._envs[0]._env, 'all_task_names'):
            names = [
                env._env.all_task_names[0] for env in self._task_sampler._envs
            ]
            name_map = dict(enumerate(names))

        undiscounted_returns = log_multitask_performance(
            itr,
            EpisodeBatch.from_list(self._env_spec, paths),
            self._inner_algo._discount,
            name_map=name_map)

        concatenated_paths_stacked['paths'] = concatenated_paths
        concatenated_paths_stacked['average_return'] = np.mean(
            undiscounted_returns)

        return concatenated_paths_stacked