Example #1
    def process_samples(self, itr, paths):
        """Process sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            tuple:
                * obs (torch.Tensor): The observations of the environment.
                * avail_actions (torch.Tensor): The available-action masks
                    at each step.
                * actions (torch.Tensor): The actions fed to the environment.
                * rewards (torch.Tensor): The acquired rewards.
                * valids (list[int]): Numbers of valid steps in each path.
                * baselines (torch.Tensor): Value function estimation
                    at each step.
                * returns (torch.Tensor): The discounted returns at each step.

        """
        for path in paths:
            if 'returns' not in path:
                path['returns'] = tensor_utils.discount_cumsum(
                    path['rewards'], self.discount)

        returns = torch.stack([
            pad_to_last(tensor_utils.discount_cumsum(path['rewards'],
                                                     self.discount).copy(),
                        total_length=self.max_path_length) for path in paths
        ])
        valids = torch.Tensor([len(path['actions']) for path in paths]).int()
        obs = torch.stack([
            pad_to_last(path['observations'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        avail_actions = torch.stack([
            pad_one_to_last(path['avail_actions'],
                            total_length=self.max_path_length,
                            axis=0) for path in paths
        ])  # Cannot pad all zero since prob sum cannot be zero
        actions = torch.stack([
            pad_to_last(path['actions'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        rewards = torch.stack([
            pad_to_last(path['rewards'], total_length=self.max_path_length)
            for path in paths
        ])

        if isinstance(self.baseline, LinearFeatureBaseline):
            baselines = torch.stack([
                pad_to_last(self._get_baselines(path),
                            total_length=self.max_path_length)
                for path in paths
            ])
        else:
            with torch.no_grad():
                baselines = self.baseline.forward(obs)

        return obs, avail_actions, actions, rewards, valids, baselines, returns
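
Every example on this page builds on `tensor_utils.discount_cumsum`. As a point of reference, here is a minimal NumPy sketch of what that helper is assumed to compute, the reverse discounted cumulative sum used throughout rllab/garage; `discount_cumsum_sketch` is a hypothetical stand-in, not the library function.

import numpy as np

def discount_cumsum_sketch(rewards, discount):
    # out[t] = sum_k discount**k * rewards[t + k]
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        out[t] = running
    return out

# out[0] is the discounted return of the whole path, which is why the logging
# examples below use rtn[0] as the discounted return.
print(discount_cumsum_sketch(np.array([1.0, 1.0, 1.0]), 0.9))  # -> [2.71, 1.9, 1.0]
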
Example #2
File: diayn.py  Project: fangqyi/garage
    def _log_performance(self, itr, batch, discount, prefix='Evaluation'):
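        """Log performance statistics for both pseudo (self) and env rewards.

        Args:
            itr (int): Iteration number.
            batch: Batch of trajectories to evaluate; each trajectory is
                expected to expose `self_rewards` and `env_rewards`.
            discount (float): Discount value, from algorithm's property.
            prefix (str): Prefix to add to all logged keys.

        Returns:
            tuple: Lists of undiscounted self returns and undiscounted
                environment returns, one entry per trajectory.

        """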
        self_returns = []
        env_returns = []
        undiscounted_self_returns = []
        undiscounted_env_returns = []
        completion = []
        success = []
        for trajectory in batch.split():
            self_returns.append(
                discount_cumsum(trajectory.self_rewards, discount))
            env_returns.append(
                discount_cumsum(trajectory.env_rewards, discount))
            undiscounted_self_returns.append(sum(trajectory.self_rewards))
            undiscounted_env_returns.append(sum(trajectory.env_rewards))
            completion.append(float(trajectory.terminals.any()))
            if 'success' in trajectory.env_infos:
                success.append(float(trajectory.env_infos['success'].any()))

        average_discounted_self_return = np.mean(
            [rtn[0] for rtn in self_returns])
        average_discounted_env_return = np.mean(
            [rtn[0] for rtn in env_returns])

        with tabular.prefix(prefix + '/'):
            tabular.record('Iteration', itr)
            tabular.record('NumTrajs', len(self_returns))
            # pseudo reward
            tabular.record('AverageDiscountedSelfReturn',
                           average_discounted_self_return)
            tabular.record('AverageSelfReturn',
                           np.mean(undiscounted_self_returns))
            tabular.record('StdSelfReturn', np.std(undiscounted_self_returns))
            tabular.record('MaxSelfReturn', np.max(undiscounted_self_returns))
            tabular.record('MinSelfReturn', np.min(undiscounted_self_returns))
            # env reward
            tabular.record('AverageDiscountedEnvReturn',
                           average_discounted_env_return)
            tabular.record('AverageEnvReturn',
                           np.mean(undiscounted_env_returns))
            tabular.record('StdEnvReturn', np.std(undiscounted_env_returns))
            tabular.record('MaxEnvReturn', np.max(undiscounted_env_returns))
            tabular.record('MinEnvReturn', np.min(undiscounted_env_returns))

            tabular.record('CompletionRate', np.mean(completion))
            if success:
                tabular.record('SuccessRate', np.mean(success))

        return undiscounted_self_returns, undiscounted_env_returns
Example #3
def log_performance(itr,
                    batch,
                    discount,
                    trajectory_class=TrajectoryBatch,
                    prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of trajectories.

    Args:
        itr (int): Iteration number.
        batch (TrajectoryBatch): The trajectories to evaluate with.
        discount (float): Discount value, from algorithm's property.
        trajectory_class (type): Trajectory batch class. When it is not
            TrajectoryBatch, the per-step `env_rewards` are used instead of
            `rewards`.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    completion = []
    success = []
    for trajectory in batch.split():
        if trajectory_class == TrajectoryBatch:
            returns.append(discount_cumsum(trajectory.rewards, discount))
            undiscounted_returns.append(sum(trajectory.rewards))
        else:
            returns.append(discount_cumsum(trajectory.env_rewards, discount))
            undiscounted_returns.append(sum(trajectory.env_rewards))
        completion.append(float(trajectory.terminals.any()))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(returns))

        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('CompletionRate', np.mean(completion))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
Example #4
    def _train_once(self, epoch, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            epoch (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            float: The average return of epoch cycle.

        """
        returns = []
        for path in paths:
            returns.append(
                tensor_utils.discount_cumsum(path['rewards'], self._discount))
        avg_return = np.mean(np.concatenate(returns))
        self._all_avg_returns.append(avg_return)
        if (epoch + 1) % self._n_samples == 0:
            avg_rtns = np.array(self._all_avg_returns)
            best_inds = np.argsort(-avg_rtns)[:self._n_best]
            best_params = np.array(self._all_params)[best_inds]
            self._cur_mean = best_params.mean(axis=0)
            self._cur_std = best_params.std(axis=0)
            self.policy.set_param_values(self._cur_mean)
            avg_return = max(self._all_avg_returns)
            self._all_avg_returns.clear()
            self._all_params.clear()
        self._cur_params = self._sample_params(epoch)
        self._all_params.append(self._cur_params.copy())
        self.policy.set_param_values(self._cur_params)
        return avg_return
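
Example #4 above is a cross-entropy-method style update: candidate parameter vectors are ranked by their average return and the sampling distribution is refit to the best few. A self-contained sketch of that selection step follows; the `elite_update` helper and the random candidates are made up for illustration.

import numpy as np

def elite_update(all_params, all_avg_returns, n_best):
    # Rank candidates by average return and refit mean/std to the top n_best.
    avg_rtns = np.asarray(all_avg_returns)
    best_inds = np.argsort(-avg_rtns)[:n_best]
    best_params = np.asarray(all_params)[best_inds]
    return best_params.mean(axis=0), best_params.std(axis=0)

params = np.random.randn(10, 4)   # 10 candidate parameter vectors, 4 params each
returns = np.random.randn(10)     # their average returns
mean, std = elite_update(params, returns, n_best=3)
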
Example #5
    def _train_once(self, samples):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            samples (list[dict]): A list of collected samples.

        Returns:
            numpy.float64: Average return.

        """
        obs = np.concatenate([path['observations'] for path in samples])
        actions = np.concatenate([path['actions'] for path in samples])
        returns = []
        for path in samples:
            returns.append(
                tensor_utils.discount_cumsum(path['rewards'], self._discount))
        returns = np.concatenate(returns)
        sess = tf.compat.v1.get_default_session()
        sess.run(self._train_op,
                 feed_dict={
                     self._observation: obs,
                     self._action: actions,
                     self._returns: returns,
                 })
        return np.mean(returns)
Example #6
File: MBPG_HA.py  Project: gaosh/MBPG
    def process_samples(self, itr, paths):
        """Process sample data based on the collected paths.
        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths
        Returns:
            tuple:
                * valids (list[int]): Numbers of valid steps in each path.
                * obs (torch.Tensor): The observations of the environment.
                * actions (torch.Tensor): The actions fed to the environment.
                * rewards (torch.Tensor): The acquired rewards.
        """
        for path in paths:
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)

        valids = [len(path['actions']) for path in paths]
        obs = torch.stack([
            pad_to_last(path['observations'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        actions = torch.stack([
            pad_to_last(path['actions'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        rewards = torch.stack([
            pad_to_last(path['rewards'], total_length=self.max_path_length)
            for path in paths
        ])

        #print(valids)

        return valids, obs, actions, rewards
Example #7
    def _train_once(self, samples):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            samples (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Average loss over the batch of paths.

        """
        losses = []
        self._policy_opt.zero_grad()
        for path in samples:
            returns_numpy = tensor_utils.discount_cumsum(
                path['rewards'], self._discount)
            returns = torch.Tensor(returns_numpy.copy())
            obs = torch.Tensor(path['observations'])
            actions = torch.Tensor(path['actions'])
            dist = self.policy(obs)[0]
            log_likelihoods = dist.log_prob(actions)
            loss = (-log_likelihoods * returns).mean()
            loss.backward()
            losses.append(loss.item())
        self._policy_opt.step()
        return np.mean(losses)
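
The loss in Example #7 is the plain REINFORCE objective: the mean of -log pi(a|s) weighted by the discounted return of each step. Below is a minimal PyTorch sketch with a toy Gaussian policy; `ToyGaussianPolicy` and the random tensors are stand-ins for illustration, not the garage policy or real paths.

import torch
from torch import nn
from torch.distributions import Normal

class ToyGaussianPolicy(nn.Module):
    """Tiny Gaussian policy standing in for self.policy in the example."""

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.mean = nn.Linear(obs_dim, act_dim)
        self.log_std = nn.Parameter(torch.zeros(act_dim))

    def forward(self, obs):
        return Normal(self.mean(obs), self.log_std.exp())

policy = ToyGaussianPolicy(obs_dim=3, act_dim=2)
opt = torch.optim.Adam(policy.parameters(), lr=1e-3)

obs = torch.randn(5, 3)        # 5 steps of a fake path
actions = torch.randn(5, 2)
returns = torch.randn(5)       # discounted returns for those steps

dist = policy(obs)
log_likelihoods = dist.log_prob(actions).sum(dim=-1)  # sum over action dims
loss = (-log_likelihoods * returns).mean()            # REINFORCE objective
opt.zero_grad()
loss.backward()
opt.step()
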
Example #8
File: maml.py  Project: andCelli/garage
    def evaluate_performance(self, itr, all_samples, loss_before, loss_after,
                             kl_before, kl, policy_entropy):
        """Evaluate performance of this batch.

        Args:
            itr (int): Iteration number.
            all_samples (list[list[MAMLTrajectoryBatch]]): Two
                dimensional list of MAMLTrajectoryBatch of size
                [meta_batch_size * (num_grad_updates + 1)]
            loss_before (float): Loss before optimization step.
            loss_after (float): Loss after optimization step.
            kl_before (float): KL divergence before optimization step.
            kl (float): KL divergence after optimization step.
            policy_entropy (float): Policy entropy.

        Returns:
            float: The average return in last epoch cycle.

        """
        tabular.record('Iteration', itr)

        for i in range(self._num_grad_updates + 1):
            all_rewards = [
                path_rewards for task_samples in all_samples
                for path_rewards in task_samples[i].rewards.numpy()
            ]

            discounted_returns = [
                tensor_utils.discount_cumsum(path_rewards,
                                             self._inner_algo.discount)[0]
                for path_rewards in all_rewards
            ]
            undiscounted_returns = np.sum(all_rewards, axis=-1)
            average_return = np.mean(undiscounted_returns)

            with tabular.prefix('Update_{0}/'.format(i)):
                tabular.record('AverageDiscountedReturn',
                               np.mean(discounted_returns))
                tabular.record('AverageReturn', average_return)
                tabular.record('StdReturn', np.std(undiscounted_returns))
                tabular.record('MaxReturn', np.max(undiscounted_returns))
                tabular.record('MinReturn', np.min(undiscounted_returns))
                tabular.record('NumTrajs', len(all_rewards))

        with tabular.prefix(self._policy.name + '/'):
            tabular.record('LossBefore', loss_before)
            tabular.record('LossAfter', loss_after)
            tabular.record('dLoss', loss_before - loss_after)
            tabular.record('KLBefore', kl_before)
            tabular.record('KLAfter', kl)
            tabular.record('Entropy', policy_entropy)

        return average_return
Example #9
File: base.py  Project: wjssx/garage
    def evaluate_performance(self, itr, batch):
        # pylint: disable=no-self-use
        r"""Evaluate the performance of the algorithm.

        Args:
            itr (int): Iteration number.
            batch (dict): Evaluation trajectories, representing
                the best current performance of the algorithm, with keys:
                * env_spec (garage.envs.EnvSpec): Specification for the
                    environment from which this data was sampled.
                * observations (numpy.ndarray): A numpy array containing the
                    observations for all time steps in this batch.
                * actions (numpy.ndarray): A numpy array containing the
                    actions for all time steps in this batch.
                * rewards (numpy.ndarray): A numpy array containing the
                    rewards for all time steps in this batch.
                * terminals (numpy.ndarray): A boolean numpy array
                    containing the termination signals for all time steps
                    in this batch.
                * env_infos (dict): A dict of numpy arrays containing
                    arbitrary environment state information.
                * agent_infos (dict): A dict of numpy arrays containing
                    arbitrary agent state information.
                * lengths (numpy.ndarray): An integer numpy array
                    containing the length of each trajectory in this batch.
                * discount (float): Discount value, from algorithm's property.

        Returns:
            numpy.ndarray: Undiscounted returns.

        """
        returns = []
        for reward in batch['rewards']:
            rtn = np_tensor_utils.discount_cumsum(reward, batch['discount'])
            returns.append(rtn)

        average_discounted_return = np.mean([rtn[0] for rtn in returns])

        undiscounted_returns = [sum(reward) for reward in batch['rewards']]

        tabular.record('Iteration', itr)
        tabular.record('Evaluation/NumTrajs', len(returns))

        tabular.record('Evaluation/AverageDiscountedReturn',
                       average_discounted_return)
        tabular.record('Evaluation/AverageReturn',
                       np.mean(undiscounted_returns))
        tabular.record('Evaluation/StdReturn', np.std(undiscounted_returns))
        tabular.record('Evaluation/MaxReturn', np.max(undiscounted_returns))
        tabular.record('Evaluation/MinReturn', np.min(undiscounted_returns))

        return undiscounted_returns
Example #10
    def process_samples(self, itr, paths):
        r"""Process sample data based on the collected paths.

        Notes: P is the maximum path length (self.max_path_length)

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            torch.Tensor: The observations of the environment
                with shape :math:`(N, P, O*)`.
            torch.Tensor: The actions fed to the environment
                with shape :math:`(N, P, A*)`.
            torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
            list[int]: Numbers of valid steps in each path.
            torch.Tensor: Value function estimation at each step
                with shape :math:`(N, P)`.

        """
        for path in paths:
            if 'returns' not in path:
                path['returns'] = tu.discount_cumsum(path['rewards'],
                                                     self.discount)

        valids = torch.Tensor([len(path['actions']) for path in paths]).int()
        obs = torch.stack([
            pad_to_last(path['observations'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        actions = torch.stack([
            pad_to_last(path['actions'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        rewards = torch.stack([
            pad_to_last(path['rewards'], total_length=self.max_path_length)
            for path in paths
        ])
        baselines = torch.stack([
            pad_to_last(self._get_baselines(path),
                        total_length=self.max_path_length) for path in paths
        ])

        return obs, actions, rewards, valids, baselines
Example #11
    def process_samples(self, itr, paths):
        """Process sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            tuple:
                * obs (torch.Tensor): The observations of the environment.
                * actions (torch.Tensor): The actions fed to the environment.
                * rewards (torch.Tensor): The acquired rewards.
                * valids (list[int]): Numbers of valid steps in each path.
                * baselines (torch.Tensor): Value function estimation
                    at each step.

        """
        for path in paths:
            if 'returns' not in path:
                path['returns'] = tensor_utils.discount_cumsum(
                    path['rewards'], self.discount)

        valids = [len(path['actions']) for path in paths]
        obs = torch.stack([
            pad_to_last(path['observations'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        actions = torch.stack([
            pad_to_last(path['actions'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
        ])
        rewards = torch.stack([
            pad_to_last(path['rewards'], total_length=self.max_path_length)
            for path in paths
        ])
        baselines = torch.stack([
            pad_to_last(self._get_baselines(path),
                        total_length=self.max_path_length) for path in paths
        ])

        return obs, actions, rewards, valids, baselines
Example #12
File: maml.py  Project: andCelli/garage
    def _process_samples(self, itr, paths):
        """Process sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            MAMLTrajectoryBatch: Processed samples data.

        """
        for path in paths:
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self._inner_algo.discount)

        self._baseline.fit(paths)
        obs, actions, rewards, valids, baselines \
            = self._inner_algo.process_samples(itr, paths)
        return MAMLTrajectoryBatch(obs, actions, rewards, valids, baselines)
Example #13
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of episodes.

    Args:
        itr (int): Iteration number.
        batch (EpisodeBatch): The episodes to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    for eps in batch.split():
        returns.append(discount_cumsum(eps.rewards, discount))
        undiscounted_returns.append(sum(eps.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in eps.step_types)))
        if 'success' in eps.env_infos:
            success.append(float(eps.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumEpisodes', len(returns))

        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('TerminationRate', np.mean(termination))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
Example #14
    def process_samples(self, paths):
        r"""Process sample data based on the collected paths.

        Notes: P is the maximum episode length (self.max_episode_length)

        Args:
            paths (list[dict]): A list of collected paths

        Returns:
            torch.Tensor: The observations of the environment
                with shape :math:`(N, P, O*)`.
            torch.Tensor: The actions fed to the environment
                with shape :math:`(N, P, A*)`.
            torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
            torch.Tensor: The discounted returns with shape :math:`(N, P)`.
            list[int]: Numbers of valid steps in each path.
            torch.Tensor: Value function estimation at each step
                with shape :math:`(N, P)`.

        """
        valids = torch.Tensor([len(path['actions']) for path in paths]).int()
        obs = torch.stack([
            pad_to_last(path['observations'],
                        total_length=self.max_episode_length,
                        axis=0) for path in paths
        ])
        actions = torch.stack([
            pad_to_last(path['actions'],
                        total_length=self.max_episode_length,
                        axis=0) for path in paths
        ])
        rewards = torch.stack([
            pad_to_last(path['rewards'], total_length=self.max_episode_length)
            for path in paths
        ])
        returns = torch.stack([
            pad_to_last(tu.discount_cumsum(path['rewards'],
                                           self.discount).copy(),
                        total_length=self.max_episode_length) for path in paths
        ])
        with torch.no_grad():
            baselines = self._value_function(obs)

        return obs, actions, rewards, returns, valids, baselines
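
The torch examples pad every path to the maximum episode length before stacking. As an assumption about what `pad_to_last` does (pad with zeros along one axis and return a `torch.Tensor`), here is a hypothetical `pad_to_last_sketch`:

import numpy as np
import torch

def pad_to_last_sketch(arr, total_length, axis=-1, val=0):
    # Pad `arr` with `val` along `axis` up to `total_length` entries,
    # then hand back a torch.Tensor (a guess at garage's pad_to_last).
    arr = np.asarray(arr, dtype=np.float32)
    axis = axis % arr.ndim
    pad_width = [(0, 0)] * arr.ndim
    pad_width[axis] = (0, total_length - arr.shape[axis])
    return torch.from_numpy(np.pad(arr, pad_width, constant_values=val))

rewards = [1.0, 1.0, 0.5]                            # a 3-step path
print(pad_to_last_sketch(rewards, total_length=5))   # 3 real steps, then two zeros
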
Example #15
    def _process_samples(self, paths):
        """Process sample data based on the collected paths.

        Args:
            paths (list[dict]): A list of collected paths.

        Returns:
            MAMLTrajectoryBatch: Processed samples data.

        """
        for path in paths:
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self._inner_algo.discount).copy()

        self._train_value_function(paths)
        obs, actions, rewards, _, valids, baselines \
            = self._inner_algo.process_samples(paths)
        return MAMLTrajectoryBatch(paths, obs, actions, rewards, valids,
                                   baselines)
Example #16
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = (np.mean(
            [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self.episode_reward_mean.extend(undiscounted_returns)

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        samples_data = dict(average_return=np.mean(undiscounted_returns))

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))
        tabular.record('NumTrajs', len(paths))
        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))

        return samples_data
Example #17
File: meh_wat.py  Project: pything/agent
    def traj_list_to_tensors(paths, max_path_length, baseline_predictions,
                             discount):
        """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_path_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of the GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.

    Returns:
        dict: Processed sample data, with key
            * observations (numpy.ndarray): Padded array of the observations of
                the environment
            * actions (numpy.ndarray): Padded array of the actions fed to the
                environment
            * rewards (numpy.ndarray): Padded array of the acquired rewards
            * agent_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * env_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * valids (numpy.ndarray): Padded array of the validity information


    """
        baselines = []
        returns = []

        for idx, path in enumerate(paths):
            # baselines
            path["baselines"] = baseline_predictions[idx]
            baselines.append(path["baselines"])

            # returns
            path["returns"] = tensor_utils.discount_cumsum(
                path["rewards"], discount)
            returns.append(path["returns"])

        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            agent_infos=agent_infos,
            env_infos=env_infos,
            valids=valids,
        )

        return samples_data
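
The NumPy-based examples instead rely on `tensor_utils.pad_tensor_n` to bring every path to the same length before batching. A minimal sketch of what that helper is assumed to do, with a hypothetical `pad_tensor_n_sketch`:

import numpy as np

def pad_tensor_n_sketch(xs, max_len):
    # Zero-pad each array in `xs` to `max_len` along its first axis
    # and stack the results into one array.
    padded = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype)
    for i, x in enumerate(xs):
        padded[i, :len(x)] = x
    return padded

rewards = [np.array([1.0, 1.0]), np.array([0.5, 0.5, 0.5])]
print(pad_tensor_n_sketch(rewards, max_len=4))  # a (2, 4) array, zero-padded per path
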
Example #18
    def process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * baselines: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * lengths: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        """
        baselines = []
        returns = []
        total_steps = 0

        max_path_length = self.max_path_length

        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
            discount=self.discount)

        if self.flatten_input:
            paths = [
                dict(
                    observations=(self.env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos'],
                    dones=path['dones']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos'],
                    dones=path['dones']) for path in paths
            ]

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            total_steps += len(path['rewards'])
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path['deltas'] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        # make all paths the same length
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        lengths = np.asarray([v.sum() for v in valids])

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            lengths=lengths,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        return samples_data
Example #19
    def _process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (OrderedDict[dict]): A list of collected paths for each
                task. In RL^2, there are n environments/tasks and paths in
                each of them will be concatenated at some point and fed to
                the policy.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        Raises:
            ValueError: If 'batch_idx' is not found.

        """
        concatenated_paths = []

        paths_by_task = collections.defaultdict(list)
        for path in paths:
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self._discount)
            path['lengths'] = [len(path['rewards'])]
            if 'batch_idx' in path:
                paths_by_task[path['batch_idx']].append(path)
            elif 'batch_idx' in path['agent_infos']:
                paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
            else:
                raise ValueError(
                    'Batch idx is required for RL2 but not found, '
                    'Make sure to use garage.tf.algos.rl2.RL2Worker '
                    'for sampling')

        # all path in paths_by_task[i] are sampled from task[i]
        for _paths in paths_by_task.values():
            concatenated_path = self._concatenate_paths(_paths)
            concatenated_paths.append(concatenated_path)

        # stack and pad to max path length of the concatenated
        # path, which will be fed to inner algo
        # i.e. max_path_length * episode_per_task
        concatenated_paths_stacked = (
            np_tensor_utils.stack_and_pad_tensor_dict_list(
                concatenated_paths, self._inner_algo.max_path_length))

        name_map = None
        if hasattr(self._task_sampler, '_envs') and hasattr(
                self._task_sampler._envs[0].env, 'all_task_names'):
            names = [
                env.env.all_task_names[0] for env in self._task_sampler._envs
            ]
            name_map = dict(enumerate(names))

        undiscounted_returns = log_multitask_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            self._inner_algo.discount,
            name_map=name_map)

        concatenated_paths_stacked['paths'] = concatenated_paths
        concatenated_paths_stacked['average_return'] = np.mean(
            undiscounted_returns)

        return concatenated_paths_stacked
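
Example #19 first buckets paths by the task they came from; each path (or its `agent_infos`) must carry a `batch_idx` for that to work. A tiny self-contained sketch of the grouping step, using made-up paths:

import collections

paths = [
    {'batch_idx': 0, 'rewards': [1.0, 1.0]},
    {'batch_idx': 1, 'rewards': [0.5]},
    {'batch_idx': 0, 'rewards': [2.0]},
]

# Bucket paths per task before they are concatenated for RL^2.
paths_by_task = collections.defaultdict(list)
for path in paths:
    paths_by_task[path['batch_idx']].append(path)

for task, task_paths in paths_by_task.items():
    print(task, len(task_paths))   # task 0 has 2 paths, task 1 has 1
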
Example #20
def paths_to_tensors(paths, max_episode_length, baseline_predictions, discount,
                     gae_lambda):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_episode_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of the GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.
        gae_lambda (float): Lambda used for generalized advantage
            estimation.

    Returns:
        dict: Processed sample data, with key
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * lengths: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])

    """
    baselines = []
    returns = []
    total_steps = 0

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(baseline_predictions[idx], 0)
        deltas = (path['rewards'] + discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, discount * gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_episode_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_episode_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_episode_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_episode_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_episode_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_episode_length)

    lengths = np.asarray([v.sum() for v in valids])

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
    )

    return samples_data
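
`paths_to_tensors` computes per-step advantages with Generalized Advantage Estimation: append a terminal value of 0, form the TD residuals, then take the discounted cumulative sum with `discount * gae_lambda`. A sketch of that calculation on made-up numbers (again with a hypothetical `discount_cumsum_sketch` standing in for `np_tensor_utils.discount_cumsum`):

import numpy as np

def discount_cumsum_sketch(x, discount):
    out, running = np.zeros(len(x)), 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

rewards = np.array([1.0, 1.0, 1.0])
baseline = np.array([0.9, 0.6, 0.3])   # stand-in for baseline_predictions[idx]
discount, gae_lambda = 0.99, 0.95

path_baselines = np.append(baseline, 0)                        # bootstrap value of 0
deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum_sketch(deltas, discount * gae_lambda)
returns = discount_cumsum_sketch(rewards, discount)
print(advantages, returns)
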
Example #21
    def process_samples(self, itr, paths):
        r"""Process sample data based on the collected paths.

        Notes: P is the maximum path length (self.max_path_length)

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            torch.Tensor: The observations of the environment
                with shape :math:`(N, P, O*)`.
            torch.Tensor: The actions fed to the environment
                with shape :math:`(N, P, A*)`.
            torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
            list[int]: Numbers of valid steps in each path.
            torch.Tensor: Value function estimation at each step
                with shape :math:`(N, P)`.

        """
        if self._kwargs is None:
            for path in paths:
                if 'returns' not in path:
                    path['returns'] = tu.discount_cumsum(path['rewards'],
                                                        self.discount)

            valids = torch.Tensor([len(path['actions']) for path in paths]).int()
            obs = torch.stack([
                pad_to_last(path['observations'],
                            total_length=self.max_path_length,
                            axis=0) for path in paths
            ])
            actions = torch.stack([
                pad_to_last(path['actions'],
                            total_length=self.max_path_length,
                            axis=0) for path in paths
            ])
            rewards = torch.stack([
                pad_to_last(path['rewards'], total_length=self.max_path_length)
                for path in paths
            ])
            baselines = torch.stack([
                pad_to_last(self._get_baselines(path),
                            total_length=self.max_path_length) for path in paths
            ])

            return obs, actions, rewards, valids, baselines
        else:
            if not self.initialized:
                self.initialize()
            for path in paths:
                if 'returns' not in path:
                    path['returns'] = tu.discount_cumsum(path['rewards'],
                                                        self.discount)
            # Note: `sess`, `model`, `image`, and `image_trans` used below are
            # assumed to be defined elsewhere (a TF session and vision model).
            for path in paths:
                if self.mode.startswith('ours'):
                    imgs = [img for img in path['env_infos']['imgs'] if img is not None]
                    if not hasattr(self, 'means'):
                        self.means = []
                        self.imgs = []
                        validdata = np.load(self._kwargs['modeldata'])
                        for vp in range(self.nvp):
                            context = np.array(imgs[0])
                            
                            timgs = []
                            tfeats = []
                            nvideos = validdata.shape[1]
                            for i in range(nvideos):
                                if i % 10 == 0:
                                    print('feats %f'%i)
                                skip = 1
                                input_img = ((validdata[::skip, i] + 1)*127.5).astype(np.uint8)
                                tfeat, timg = sess.run([model.translated_z, model.out],
                                        {image: [input_img, [context]*self.batch_size, np.array(imgs)]})
                                timgs.append(timg)
                                tfeats.append(tfeat)
                            self.means.append(np.mean(tfeats, axis=0))
                            meanimgs = np.mean(timgs, axis=0)
                            self.imgs.append(meanimgs)
                    costs = 0
                    for vp in range(self.nvp):
                        curimgs = np.array(imgs)
                        feats, img_trans = sess.run([model.input_z, image_trans],
                            {image: [curimgs, [curimgs[0]] * self.batch_size, curimgs]})
                        costs += np.sum((self.means[vp] - feats)**2, axis=1) + \
                            self._kwargs['scale']*np.sum((self.imgs[vp] - img_trans[0])**2, axis=(1, 2, 3))
                    for j in range(24):
                        path["rewards"][j*2+1] -= costs[j] #* (j**2)
            
            valids = torch.Tensor([len(path['actions']) for path in paths]).int()
            obs = torch.stack([
                pad_to_last(path['observations'],
                            total_length=self.max_path_length,
                            axis=0) for path in paths
            ])
            actions = torch.stack([
                pad_to_last(path['actions'],
                            total_length=self.max_path_length,
                            axis=0) for path in paths
            ])
            
            rewards = torch.stack([
                pad_to_last(path['rewards'], total_length=self.max_path_length)
                for path in paths
            ])
            baselines = torch.stack([
                pad_to_last(self._get_baselines(path),
                            total_length=self.max_path_length) for path in paths
            ])

            return obs, actions, rewards, valids, baselines
Example #22
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Parameters
        ----------
        itr : int
            The iteration number.
        paths : list[dict]
            The collected paths from the sampler.

        Returns
        -------
        samples_data : dict
            Processed sample data with same trajectory length (padded with 0)
        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if self.flatten_input:
            paths = [
                dict(
                    observations=(self.env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path['deltas'] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        # make all paths the same length
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        # average_discounted_return = (np.mean(
        #     [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self.episode_reward_mean.extend(undiscounted_returns)

        # ent = np.sum(self.policy.distribution.entropy(agent_infos) *
        #              valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        return samples_data
Example #23
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
            discount=self.discount)

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(average_return=np.mean(undiscounted_returns))

        return samples_data
Example #24
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        terminals = [path['dones'] for path in paths]

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        lengths = np.asarray([v.sum() for v in valids])

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        undiscounted_returns = self.evaluate_performance(
            itr,
            dict(env_spec=None,
                 observations=obs,
                 actions=actions,
                 rewards=rewards,
                 terminals=terminals,
                 env_infos=env_infos,
                 agent_infos=agent_infos,
                 lengths=lengths,
                 discount=self.discount))

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(average_return=np.mean(undiscounted_returns))

        return samples_data
Example #25
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.
        (Same as in batch_polopt, without entropy and tabular recording.)

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * baselines: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if self.flatten_input:
            paths = [
                dict(
                    observations=(self.env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path['deltas'] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        # make all paths the same length
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        # average_discounted_return = (np.mean(
        #     [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self.episode_reward_mean.extend(undiscounted_returns)

        # ent = np.sum(self.policy.distribution.entropy(agent_infos) *
        #              valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        return samples_data
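The deltas/advantages block in example #25 is generalized advantage estimation (GAE). Pulled out for a single path, and reusing the discount_cumsum sketch shown earlier, the computation reduces to the following sketch (not garage's exact helper; the terminal value is bootstrapped as 0, matching the np.append above):

import numpy as np

def gae_for_path(rewards, baselines, discount, gae_lambda):
    """TD residuals and GAE advantages for one path, with a terminal value of 0."""
    values = np.append(baselines, 0.0)                 # V(s_0), ..., V(s_{T-1}), V(terminal) = 0
    deltas = rewards + discount * values[1:] - values[:-1]
    advantages = discount_cumsum(deltas, discount * gae_lambda)
    return deltas, advantages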
Example #26
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data.

        Note:
            The returned sample data is a dictionary with keys
                - observations: (numpy.ndarray), shape [B * (T), *obs_dims]
                - actions: (numpy.ndarray), shape [B * (T), *act_dims]
                - rewards : (numpy.ndarray), shape [B * (T), ]
                - baselines: (numpy.ndarray), shape [B * (T), ]
                - returns: (numpy.ndarray), shape [B * (T), ]
                - lengths: (numpy.ndarray), shape [P, ], where the i-th entry
                  is the length of the i-th path.
                - valids: (numpy.ndarray), shape [P, ], where the [i, j] entry
                  is 1 if the j-th sample in the i-th path is valid,
                  otherwise 0.
                - agent_infos: (dict), see
                  OnPolicyVectorizedSampler.obtain_samples()
                - env_infos: (dict), see
                  OnPolicyVectorizedSampler.obtain_samples()
                - paths: (list[dict]) The original paths, with observations
                  and/or actions flattened.
                - average_return: (numpy.float64)

            where B = batch size, (T) = the variable length of each trajectory,
            and P = number of paths. Note that B * (T) equals the total number
            of environment steps across all trajectories.

        """
        baselines = []
        returns = []

        if self._flatten_input:
            paths = [
                dict(
                    observations=(self._env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self._env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self._env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]

        if hasattr(self._baseline, 'predict_n'):
            all_path_baselines = self._baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self._baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self._discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self._discount * self._gae_lambda)
            path['deltas'] = deltas
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])
            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self._discount)
            returns.append(path['returns'])

        obs = np.concatenate([path['observations'] for path in paths])
        actions = np.concatenate([path['actions'] for path in paths])
        rewards = np.concatenate([path['rewards'] for path in paths])
        returns = np.concatenate(returns)
        baselines = np.concatenate(baselines)

        agent_infos_path = [path['agent_infos'] for path in paths]
        agent_infos = dict()
        for key in self._policy.state_info_keys:
            agent_infos[key] = np.concatenate(
                [infos[key] for infos in agent_infos_path])

        env_infos_path = [path['env_infos'] for path in paths]
        env_infos = dict()
        for key in paths[0]['env_infos'].keys():
            env_infos[key] = np.concatenate(
                [infos[key] for infos in env_infos_path])

        valids = np.asarray([np.ones_like(path['returns']) for path in paths])
        lengths = np.asarray([v.sum() for v in valids])

        average_discounted_return = (np.mean(
            [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self._episode_reward_mean.extend(undiscounted_returns)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            lengths=lengths,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        tabular.record('NumTrajs', len(paths))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))

        return samples_data
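Unlike examples #24 and #25, example #26 concatenates the per-path arrays instead of padding them to max_path_length, so consumers rely on lengths to recover trajectory boundaries from the flat [B * (T), ...] tensors. A hypothetical helper (the name is ours, not garage's) that reverses the concatenation could look like:

import numpy as np

def split_by_lengths(flat, lengths):
    """Split a concatenated [sum(T_i), ...] array back into a list of per-path arrays."""
    ends = np.cumsum(lengths).astype(int)
    starts = np.concatenate(([0], ends[:-1]))
    return [flat[start:end] for start, end in zip(starts, ends)]

For instance, split_by_lengths(samples_data['rewards'], samples_data['lengths']) would recover the per-trajectory reward sequences from the concatenated output.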