Example #1
File: maml.py Project: andCelli/garage
    def evaluate_performance(self, itr, all_samples, loss_before, loss_after,
                             kl_before, kl, policy_entropy):
        """Evaluate performance of this batch.

        Args:
            itr (int): Iteration number.
            all_samples (list[list[MAMLTrajectoryBatch]]): Two
                dimensional list of MAMLTrajectoryBatch of size
                [meta_batch_size * (num_grad_updates + 1)]
            loss_before (float): Loss before optimization step.
            loss_after (float): Loss after optimization step.
            kl_before (float): KL divergence before optimization step.
            kl (float): KL divergence after optimization step.
            policy_entropy (float): Policy entropy.

        Returns:
            float: The average return in last epoch cycle.

        """
        tabular.record('Iteration', itr)

        for i in range(self._num_grad_updates + 1):
            all_rewards = [
                path_rewards for task_samples in all_samples
                for path_rewards in task_samples[i].rewards.numpy()
            ]

            discounted_returns = [
                tensor_utils.discount_cumsum(path_rewards,
                                             self._inner_algo.discount)[0]
                for path_rewards in all_rewards
            ]
            undiscounted_returns = np.sum(all_rewards, axis=-1)
            average_return = np.mean(undiscounted_returns)

            with tabular.prefix('Update_{0}/'.format(i)):
                tabular.record('AverageDiscountedReturn',
                               np.mean(discounted_returns))
                tabular.record('AverageReturn', average_return)
                tabular.record('StdReturn', np.std(undiscounted_returns))
                tabular.record('MaxReturn', np.max(undiscounted_returns))
                tabular.record('MinReturn', np.min(undiscounted_returns))
                tabular.record('NumTrajs', len(all_rewards))

        with tabular.prefix(self._policy.name + '/'):
            tabular.record('LossBefore', loss_before)
            tabular.record('LossAfter', loss_after)
            tabular.record('dLoss', loss_before - loss_after)
            tabular.record('KLBefore', kl_before)
            tabular.record('KLAfter', kl)
            tabular.record('Entropy', policy_entropy)

        return average_return
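Several of these examples take `discount_cumsum(rewards, discount)[0]` as the discounted return of a whole path. The helper below is a minimal, independent sketch of that computation, written here for illustration only; garage's own `tensor_utils.discount_cumsum` may be implemented differently (e.g. via scipy filters) but should produce the same values.

import numpy as np

def discount_cumsum(rewards, discount):
    # out[t] = rewards[t] + discount * out[t + 1], so out[0] is the
    # discounted return of the entire path.
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        out[t] = running
    return out

# With discount=0.9, the path [1, 1, 1] has discounted return 1 + 0.9 + 0.81.
print(discount_cumsum(np.array([1.0, 1.0, 1.0]), 0.9))  # [2.71 1.9  1.  ]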
Example #2
    def _train_once(self):
        """Perform one iteration of training."""
        policy_loss_list = []
        qf_loss_list = []
        contrastive_loss_list = []
        alpha_loss_list = []
        alpha_list = []
        for _ in range(self._num_steps_per_epoch):
            indices = np.random.choice(range(self._num_train_tasks),
                                       self._meta_batch_size)
            policy_loss, qf_loss, contrastive_loss, alpha_loss, alpha = self._optimize_policy(
                indices)
            policy_loss_list.append(policy_loss)
            qf_loss_list.append(qf_loss)
            contrastive_loss_list.append(contrastive_loss)
            alpha_loss_list.append(alpha_loss)
            alpha_list.append(alpha)

        with tabular.prefix('MetaTrain/Average/'):
            tabular.record('PolicyLoss',
                           np.average(np.array(policy_loss_list)))
            tabular.record('QfLoss', np.average(np.array(qf_loss_list)))
            tabular.record('ContrastiveLoss',
                           np.average(np.array(contrastive_loss_list)))
            tabular.record('AlphaLoss', np.average(np.array(alpha_loss_list)))
            tabular.record('Alpha', np.average(np.array(alpha_list)))
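The task indices in this loop come from `np.random.choice`, which samples with replacement by default, so a meta-batch can contain the same training task more than once. A tiny standalone illustration; `num_train_tasks` and `meta_batch_size` here are arbitrary values, not taken from the example above:

import numpy as np

num_train_tasks = 10  # arbitrary illustration value
meta_batch_size = 4   # arbitrary illustration value

indices = np.random.choice(range(num_train_tasks), meta_batch_size)
print(indices)  # e.g. [3 7 3 1]; duplicates are possible because replacement is on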
Example #3
    def evaluate(self, algo, test_rollouts_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (metarl.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
            test_rollouts_per_task (int or None): Number of rollouts per task.

        """
        if test_rollouts_per_task is None:
            test_rollouts_per_task = self._n_test_rollouts
        adapted_trajectories = []
        logger.log('Sampling for adaptation and meta-testing...')
        for env_up in self._test_task_sampler.sample(self._n_test_tasks):
            policy = algo.get_exploration_policy()
            traj = TrajectoryBatch.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_traj)
            ])
            adapted_policy = algo.adapt_policy(policy, traj)
            adapted_traj = self._test_sampler.obtain_samples(
                self._eval_itr, test_rollouts_per_task * self._max_path_length,
                adapted_policy)
            adapted_trajectories.append(adapted_traj)
        logger.log('Finished meta-testing...')

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                TrajectoryBatch.concatenate(*adapted_trajectories),
                getattr(algo, 'discount', 1.0),
                task_names=self._test_task_names)
        self._eval_itr += 1
Example #4
    def evaluate(self, algo, test_rollouts_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
            test_rollouts_per_task (int or None): Number of rollouts per task.

        """
        if test_rollouts_per_task is None:
            test_rollouts_per_task = self._n_test_rollouts
        adapted_trajectories = []
        logger.log('Sampling for adaptation and meta-testing...')
        if self._test_sampler is None:
            self._test_sampler = self._sampler_class.from_worker_factory(
                WorkerFactory(seed=get_seed(),
                              max_path_length=self._max_path_length,
                              n_workers=1,
                              worker_class=self._worker_class,
                              worker_args=self._worker_args),
                agents=algo.get_exploration_policy(),
                envs=self._test_task_sampler.sample(1))
        for env_up in self._test_task_sampler.sample(self._n_test_tasks):
            policy = algo.get_exploration_policy()
            traj = self._trajectory_batch_class.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_traj)
            ])
            adapted_policy = algo.adapt_policy(policy, traj)
            adapted_traj = self._test_sampler.obtain_samples(
                self._eval_itr, test_rollouts_per_task * self._max_path_length,
                adapted_policy)
            adapted_trajectories.append(adapted_traj)
        logger.log('Finished meta-testing...')

        if self._test_task_names is not None:
            name_map = dict(enumerate(self._test_task_names))
        else:
            name_map = None

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                self._trajectory_batch_class.concatenate(
                    *adapted_trajectories),
                getattr(algo, 'discount', 1.0),
                trajectory_class=self._trajectory_batch_class,
                name_map=name_map)
        self._eval_itr += 1

        if self._trajectory_batch_class == TrajectoryBatch:
            rewards = self._trajectory_batch_class.concatenate(
                *adapted_trajectories).rewards
        else:
            rewards = self._trajectory_batch_class.concatenate(
                *adapted_trajectories).env_rewards

        return sum(rewards) / len(rewards)
Example #5
def log_multitask_performance(itr, batch, discount, name_map=None):
    r"""Log performance of trajectories from multiple tasks.

    Args:
        itr (int): Iteration number to be logged.
        batch (garage.TrajectoryBatch): Batch of trajectories. The trajectories
            should have either the "task_name" or "task_id" `env_infos`. If the
            "task_name" is not present, then `name_map` is required, and should
            map from task id's to task names.
        discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task id's to task
            names. Optional if the "task_name" environment info is present.
            Note that if provided, all tasks listed in this map will be logged,
            even if there are no trajectories present for them.

    Returns:
        numpy.ndarray: Undiscounted returns averaged across all tasks. Has
            shape :math:`(N \bullet [T])`.

    """
    traj_by_name = defaultdict(list)
    for trajectory in batch.split():
        try:
            task_name = trajectory.env_infos['task_name'][0]
        except KeyError:
            try:
                task_id = trajectory.env_infos['task_id'][0]
                task_name = name_map[task_id]
            except KeyError:
                task_name = 'Task #{}'.format(task_id)
        traj_by_name[task_name].append(trajectory)
    if name_map is None:
        task_names = traj_by_name.keys()
    else:
        task_names = name_map.values()
    for task_name in task_names:
        if task_name in traj_by_name:
            trajectories = traj_by_name[task_name]
            log_performance(itr,
                            garage.TrajectoryBatch.concatenate(*trajectories),
                            discount,
                            prefix=task_name)
        else:
            with tabular.prefix(task_name + '/'):
                tabular.record('Iteration', itr)
                tabular.record('NumTrajs', 0)
                tabular.record('AverageDiscountedReturn', np.nan)
                tabular.record('AverageReturn', np.nan)
                tabular.record('StdReturn', np.nan)
                tabular.record('MaxReturn', np.nan)
                tabular.record('MinReturn', np.nan)
                tabular.record('CompletionRate', np.nan)
                tabular.record('SuccessRate', np.nan)

    return log_performance(itr, batch, discount=discount, prefix='Average')
Example #6
def log_multitask_performance(itr, batch, discount, name_map=None):
    r"""Log performance of episodes from multiple tasks.

    Args:
        itr (int): Iteration number to be logged.
        batch (EpisodeBatch): Batch of episodes. The episodes should have
            either the "task_name" or "task_id" `env_infos`. If the "task_name"
            is not present, then `name_map` is required, and should map from
            task id's to task names.
        discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task id's to task
            names. Optional if the "task_name" environment info is present.
            Note that if provided, all tasks listed in this map will be logged,
            even if there are no episodes present for them.

    Returns:
        numpy.ndarray: Undiscounted returns averaged across all tasks. Has
            shape :math:`(N \bullet [T])`.

    """
    eps_by_name = defaultdict(list)
    for eps in batch.split():
        task_name = '__unnamed_task__'
        if 'task_name' in eps.env_infos:
            task_name = eps.env_infos['task_name'][0]
        elif 'task_id' in eps.env_infos:
            name_map = {} if name_map is None else name_map
            task_id = eps.env_infos['task_id'][0]
            task_name = name_map.get(task_id, 'Task #{}'.format(task_id))
        eps_by_name[task_name].append(eps)
    if name_map is None:
        task_names = eps_by_name.keys()
    else:
        task_names = name_map.values()
    for task_name in task_names:
        if task_name in eps_by_name:
            episodes = eps_by_name[task_name]
            log_performance(itr,
                            EpisodeBatch.concatenate(*episodes),
                            discount,
                            prefix=task_name)
        else:
            with tabular.prefix(task_name + '/'):
                tabular.record('Iteration', itr)
                tabular.record('NumEpisodes', 0)
                tabular.record('AverageDiscountedReturn', np.nan)
                tabular.record('AverageReturn', np.nan)
                tabular.record('StdReturn', np.nan)
                tabular.record('MaxReturn', np.nan)
                tabular.record('MinReturn', np.nan)
                tabular.record('TerminationRate', np.nan)
                tabular.record('SuccessRate', np.nan)

    return log_performance(itr, batch, discount=discount, prefix='Average')
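The naming rule used by `log_multitask_performance` above (prefer `task_name`, fall back to `name_map[task_id]`, then to a generated `Task #<id>` label, and finally to `__unnamed_task__`) can be shown in isolation with plain dicts. The episode records below are hypothetical stand-ins, not garage `EpisodeBatch` objects:

from collections import defaultdict

# Hypothetical per-episode env_infos, used only to illustrate the grouping rule.
episodes = [
    {'task_name': 'push-v1'},
    {'task_id': 3},
    {'task_id': 7},
    {},  # neither key present
]
name_map = {3: 'reach-v1'}

eps_by_name = defaultdict(list)
for env_infos in episodes:
    task_name = '__unnamed_task__'
    if 'task_name' in env_infos:
        task_name = env_infos['task_name']
    elif 'task_id' in env_infos:
        task_id = env_infos['task_id']
        task_name = name_map.get(task_id, 'Task #{}'.format(task_id))
    eps_by_name[task_name].append(env_infos)

print(sorted(eps_by_name))  # ['Task #7', '__unnamed_task__', 'push-v1', 'reach-v1']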
Example #7
    def evaluate(self, algo, test_episodes_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (MetaRLAlgorithm): The algorithm to evaluate.
            test_episodes_per_task (int or None): Number of episodes per task.

        """
        if test_episodes_per_task is None:
            test_episodes_per_task = self._n_test_episodes
        adapted_episodes = []
        logger.log('Sampling for adaptation and meta-testing...')
        env_updates = self._test_task_sampler.sample(self._n_test_tasks)
        if self._test_sampler is None:
            env = env_updates[0]()
            self._max_episode_length = env.spec.max_episode_length
            self._test_sampler = LocalSampler.from_worker_factory(
                WorkerFactory(seed=get_seed(),
                              max_episode_length=self._max_episode_length,
                              n_workers=1,
                              worker_class=self._worker_class,
                              worker_args=self._worker_args),
                agents=algo.get_exploration_policy(),
                envs=env)
        for env_up in env_updates:
            policy = algo.get_exploration_policy()
            eps = EpisodeBatch.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_eps)
            ])
            adapted_policy = algo.adapt_policy(policy, eps)
            adapted_eps = self._test_sampler.obtain_samples(
                self._eval_itr,
                test_episodes_per_task * self._max_episode_length,
                adapted_policy)
            adapted_episodes.append(adapted_eps)
        logger.log('Finished meta-testing...')

        if self._test_task_names is not None:
            name_map = dict(enumerate(self._test_task_names))
        else:
            name_map = None

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                EpisodeBatch.concatenate(*adapted_episodes),
                getattr(algo, 'discount', 1.0),
                name_map=name_map)
        self._eval_itr += 1
Example #8
    def evaluate(self, algo, test_episodes_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (MetaRLAlgorithm): The algorithm to evaluate.
            test_episodes_per_task (int or None): Number of episodes per task.

        """
        if test_episodes_per_task is None:
            test_episodes_per_task = self._n_test_episodes
        adapted_episodes = []
        logger.log('Sampling for adaptation and meta-testing...')
        env_updates = self._test_task_sampler.sample(self._n_test_tasks)

        for env_up in env_updates:
            policy = algo.get_exploration_policy()
            eps = EpisodeBatch.concatenate(*[
                algo._sampler.obtain_samples(self._eval_itr, 1,
                                             policy,
                                             env_up)
                for _ in range(self._n_exploration_eps)
            ])
            adapted_policy = algo.get_adapted_test_policy(policy, eps)
            adapted_eps = algo._sampler.obtain_samples(
                self._eval_itr,
                test_episodes_per_task * env_up().spec.max_episode_length,
                adapted_policy)
            adapted_episodes.append(adapted_eps)
        if self._verbose:
            for ep in adapted_episodes:
                print(ep.env_infos['task'][0])
                print(f'last observations: {ep.last_observations}')
                print('------------------------------------')

        logger.log('Finished meta-testing...')

        if self._test_task_names is not None:
            name_map = dict(enumerate(self._test_task_names))
        else:
            name_map = None

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                EpisodeBatch.concatenate(*adapted_episodes),
                getattr(algo, 'discount', 1.0),
                name_map=name_map)
        self._eval_itr += 1

        return adapted_episodes
Example #9
File: diayn.py Project: fangqyi/garage
    def _log_performance(self, itr, batch, discount, prefix='Evaluation'):
        self_returns = []
        env_returns = []
        undiscounted_self_returns = []
        undiscounted_env_returns = []
        completion = []
        success = []
        for trajectory in batch.split():
            self_returns.append(
                discount_cumsum(trajectory.self_rewards, discount))
            env_returns.append(
                discount_cumsum(trajectory.env_rewards, discount))
            undiscounted_self_returns.append(sum(trajectory.self_rewards))
            undiscounted_env_returns.append(sum(trajectory.env_rewards))
            completion.append(float(trajectory.terminals.any()))
            if 'success' in trajectory.env_infos:
                success.append(float(trajectory.env_infos['success'].any()))

        average_discounted_self_return = np.mean(
            [rtn[0] for rtn in self_returns])
        average_discounted_env_return = np.mean(
            [rtn[0] for rtn in env_returns])

        with tabular.prefix(prefix + '/'):
            tabular.record('Iteration', itr)
            tabular.record('NumTrajs', len(self_returns))
            # pseudo reward
            tabular.record('AverageDiscountedSelfReturn',
                           average_discounted_self_return)
            tabular.record('AverageSelfReturn',
                           np.mean(undiscounted_self_returns))
            tabular.record('StdSelfReturn', np.std(undiscounted_self_returns))
            tabular.record('MaxSelfReturn', np.max(undiscounted_self_returns))
            tabular.record('MinSelfReturn', np.min(undiscounted_self_returns))
            # env reward
            tabular.record('AverageDiscountedEnvReturn',
                           average_discounted_env_return)
            tabular.record('AverageEnvReturn',
                           np.mean(undiscounted_env_returns))
            tabular.record('StdEnvReturn', np.std(undiscounted_env_returns))
            tabular.record('MaxEnvReturn', np.max(undiscounted_env_returns))
            tabular.record('MinEnvReturn', np.min(undiscounted_env_returns))

            tabular.record('CompletionRate', np.mean(completion))
            if success:
                tabular.record('SuccessRate', np.mean(success))

        return undiscounted_self_returns, undiscounted_env_returns
Example #10
def log_performance(itr,
                    batch,
                    discount,
                    trajectory_class=TrajectoryBatch,
                    prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of trajectories.

    Args:
        itr (int): Iteration number.
        batch (TrajectoryBatch): The trajectories to evaluate with.
        discount (float): Discount value, from algorithm's property.
        trajectory_class (type): Class of the trajectory batch; determines
            whether `rewards` or `env_rewards` are evaluated.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    completion = []
    success = []
    for trajectory in batch.split():
        if trajectory_class == TrajectoryBatch:
            returns.append(discount_cumsum(trajectory.rewards, discount))
            undiscounted_returns.append(sum(trajectory.rewards))
        else:
            returns.append(discount_cumsum(trajectory.env_rewards, discount))
            undiscounted_returns.append(sum(trajectory.env_rewards))
        completion.append(float(trajectory.terminals.any()))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(returns))

        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('CompletionRate', np.mean(completion))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
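The quantities recorded by `log_performance` reduce to a few numpy reductions over per-trajectory reward arrays. The rewards below are invented purely to show how the discounted and undiscounted aggregates relate:

import numpy as np

# Hypothetical per-trajectory reward sequences.
paths_rewards = [np.array([1.0, 0.0, 1.0]), np.array([0.5, 0.5])]
discount = 0.99

# Discounted return of each full path: sum_t discount**t * r_t,
# which is what discount_cumsum(...)[0] evaluates to.
discounted_returns = [
    np.sum(discount ** np.arange(len(r)) * r) for r in paths_rewards
]
undiscounted_returns = [r.sum() for r in paths_rewards]

print(np.mean(discounted_returns))    # AverageDiscountedReturn
print(np.mean(undiscounted_returns))  # AverageReturn
print(np.std(undiscounted_returns))   # StdReturn
print(np.max(undiscounted_returns))   # MaxReturn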
Example #11
    def train_once(self, itr, paths):
        """Train the algorithm once.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths

        Returns:
            dict: Processed sample data, with key
                * average_return: (float)

        """
        obs, actions, rewards, valids, baselines = self.process_samples(
            itr, paths)

        loss = self._compute_loss(itr, obs, actions, rewards, valids,
                                  baselines)

        self._old_policy.load_state_dict(self.policy.state_dict())

        self._optimizer.zero_grad()
        loss.backward()

        kl_before = self._compute_kl_constraint(obs).detach()
        self._optimize(itr, obs, actions, rewards, valids, baselines)

        with torch.no_grad():
            loss_after = self._compute_loss(itr, obs, actions, rewards, valids,
                                            baselines)
            kl = self._compute_kl_constraint(obs)
            policy_entropy = self._compute_policy_entropy(obs)

        average_returns = log_performance(itr,
                                          TrajectoryBatch.from_trajectory_list(
                                              self.env_spec, paths),
                                          discount=self.discount)

        with tabular.prefix(self.policy.name + '/'):
            tabular.record('LossBefore', loss.item())
            tabular.record('LossAfter', loss_after.item())
            tabular.record('dLoss', loss.item() - loss_after.item())
            tabular.record('KLBefore', kl_before.item())
            tabular.record('KL', kl.item())
            tabular.record('Entropy', policy_entropy.mean().item())

        self.baseline.fit(paths)
        return np.mean(average_returns)
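The `self._old_policy.load_state_dict(self.policy.state_dict())` call snapshots the current weights so that the KL terms (`KLBefore`, `KL`) can presumably be measured against the pre-update policy. A minimal PyTorch sketch of that snapshot pattern, using a made-up network rather than a garage policy:

import copy
import torch.nn as nn

# Hypothetical policy network, only for illustration.
policy = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2))
old_policy = copy.deepcopy(policy)

# Before each optimization step, refresh the frozen copy; the KL between
# old_policy and the updated policy then measures how far the step moved.
old_policy.load_state_dict(policy.state_dict())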
Example #12
def log_multitask_performance(itr,
                              batch,
                              discount,
                              name_map={},
                              task_names=None):
    traj_by_name = defaultdict(list)
    for trajectory in batch.split():
        try:
            task_name = trajectory.env_infos['task_name'][0]
        except KeyError:
            try:
                task_id = trajectory.env_infos['task_id'][0]
                task_name = name_map[task_id]
            except KeyError:
                task_name = 'Task #{}'.format(task_id)
        traj_by_name[task_name].append(trajectory)
    if task_names is None:
        for (task_name, trajectories) in traj_by_name.items():
            log_performance(itr,
                            metarl.TrajectoryBatch.concatenate(*trajectories),
                            discount,
                            prefix=task_name)
    else:
        for task_name in sorted(task_names):
            if task_name in traj_by_name:
                trajectories = traj_by_name[task_name]
                log_performance(
                    itr,
                    metarl.TrajectoryBatch.concatenate(*trajectories),
                    discount,
                    prefix=task_name)
            else:
                with tabular.prefix(task_name + '/'):
                    tabular.record('Iteration', -1)
                    tabular.record('NumTrajs', -1)
                    tabular.record('AverageDiscountedReturn', -1.)
                    tabular.record('AverageReturn', -1)
                    tabular.record('StdReturn', -1)
                    tabular.record('MaxReturn', -1)
                    tabular.record('MinReturn', -1)
                    tabular.record('CompletionRate', -1)
                    tabular.record('SuccessRate', -1)

    return log_performance(itr, batch, discount=discount, prefix="Average")
Example #13
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of episodes.

    Args:
        itr (int): Iteration number.
        batch (EpisodeBatch): The episodes to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    for eps in batch.split():
        returns.append(discount_cumsum(eps.rewards, discount))
        undiscounted_returns.append(sum(eps.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in eps.step_types)))
        if 'success' in eps.env_infos:
            success.append(float(eps.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumEpisodes', len(returns))

        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('TerminationRate', np.mean(termination))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
Example #14
File: vpg.py Project: wjssx/garage
    def _log(self, itr, paths, loss_before, loss_after, kl_before, kl,
             policy_entropy):
        """Log information per iteration based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths
            loss_before (float): Loss before optimization step.
            loss_after (float): Loss after optimization step.
            kl_before (float): KL divergence before optimization step.
            kl (float): KL divergence after optimization step.
            policy_entropy (float): Policy entropy.

        Returns:
            float: The average return in last epoch cycle.

        """
        average_discounted_return = (np.mean(
            [path['returns'][0] for path in paths]))
        undiscounted_returns = [sum(path['rewards']) for path in paths]
        average_return = np.mean(undiscounted_returns)
        self._episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', average_return)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        tabular.record('NumTrajs', len(paths))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        with tabular.prefix(self.policy.name + '/'):
            tabular.record('LossBefore', loss_before)
            tabular.record('LossAfter', loss_after)
            tabular.record('dLoss', loss_before - loss_after)
            tabular.record('KLBefore', kl_before)
            tabular.record('KL', kl)
            tabular.record('Entropy', policy_entropy)

        return average_return
Example #15
    def train(self, runner):
        """Obtain samplers and start actual training for each epoch.

        Args:
            runner (LocalRunner): LocalRunner is passed to give algorithm
                the access to runner.step_epochs(), which provides services
                such as snapshotting and sampler control.

        """
        if not self._eval_env:
            self._eval_env = runner.get_env_copy()
        for epoch in runner.step_epochs():
            if self._eval_env is not None:
                log_performance(epoch,
                                obtain_evaluation_samples(
                                    self.learner, self._eval_env),
                                discount=1.0)
            losses = self._train_once(runner, epoch)
            with tabular.prefix(self._name + '/'):
                tabular.record('MeanLoss', np.mean(losses))
                tabular.record('StdLoss', np.std(losses))
Example #16
    def _train_once(self, itr, eps):
        """Train the algorithm once.

        Args:
            itr (int): Iteration number.
            eps (EpisodeBatch): A batch of collected paths.

        Returns:
            numpy.float64: Calculated mean value of undiscounted returns.

        """
        obs = torch.Tensor(eps.padded_observations)
        rewards = torch.Tensor(eps.padded_rewards)
        returns = torch.Tensor(
            np.stack([
                discount_cumsum(reward, self.discount)
                for reward in eps.padded_rewards
            ]))
        valids = eps.lengths
        with torch.no_grad():
            baselines = self._value_function(obs)

        if self._maximum_entropy:
            policy_entropies = self._compute_policy_entropy(obs)
            rewards += self._policy_ent_coeff * policy_entropies

        obs_flat = torch.Tensor(eps.observations)
        actions_flat = torch.Tensor(eps.actions)
        rewards_flat = torch.Tensor(eps.rewards)
        returns_flat = torch.cat(filter_valids(returns, valids))
        advs_flat = self._compute_advantage(rewards, valids, baselines)

        with torch.no_grad():
            policy_loss_before = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_before = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_before = self._compute_kl_constraint(obs)

        self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                    advs_flat)

        with torch.no_grad():
            policy_loss_after = self._compute_loss_with_adv(
                obs_flat, actions_flat, rewards_flat, advs_flat)
            vf_loss_after = self._value_function.compute_loss(
                obs_flat, returns_flat)
            kl_after = self._compute_kl_constraint(obs)
            policy_entropy = self._compute_policy_entropy(obs)

        with tabular.prefix(self.policy.name):
            tabular.record('/LossBefore', policy_loss_before.item())
            tabular.record('/LossAfter', policy_loss_after.item())
            tabular.record('/dLoss',
                           (policy_loss_before - policy_loss_after).item())
            tabular.record('/KLBefore', kl_before.item())
            tabular.record('/KL', kl_after.item())
            tabular.record('/Entropy', policy_entropy.mean().item())

        with tabular.prefix(self._value_function.name):
            tabular.record('/LossBefore', vf_loss_before.item())
            tabular.record('/LossAfter', vf_loss_after.item())
            tabular.record('/dLoss',
                           vf_loss_before.item() - vf_loss_after.item())

        self._old_policy.load_state_dict(self.policy.state_dict())

        undiscounted_returns = log_performance(itr,
                                               eps,
                                               discount=self._discount)
        return np.mean(undiscounted_returns)
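`filter_valids(returns, valids)` above strips the padding introduced by `padded_rewards`, keeping only the first `lengths[i]` entries of each row before concatenation. The version below is a plausible minimal implementation assumed for illustration; garage's actual helper may differ in details:

import torch

def filter_valids(tensor, valids):
    # Keep only the valid (unpadded) prefix of each row.
    return [tensor[i][:v] for i, v in enumerate(valids)]

padded = torch.tensor([[1., 2., 0., 0.],
                       [3., 4., 5., 0.]])
lengths = [2, 3]
print(torch.cat(filter_valids(padded, lengths)))  # tensor([1., 2., 3., 4., 5.])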
Example #17
    def log_performance(self, indices, test, epoch):
        """Get average returns for specific tasks.

        Args:
            indices (list): List of tasks.

        """
        discounted_returns = []
        undiscounted_returns = []
        completion = []
        success = []
        traj = []
        for idx in indices:
            eval_paths = []
            for _ in range(self._num_evals):
                paths = self.collect_paths(idx, test)
                paths[-1]['terminals'] = paths[-1]['terminals'].squeeze()
                paths[-1]['dones'] = paths[-1]['terminals']
                # HalfCheetahVel env
                if 'task' in paths[-1]['env_infos'].keys():
                    paths[-1]['env_infos']['task'] = paths[-1]['env_infos'][
                        'task']['velocity']
                eval_paths.append(paths[-1])
                discounted_returns.append(
                    discount_cumsum(paths[-1]['rewards'], self._discount))
                undiscounted_returns.append(sum(paths[-1]['rewards']))
                completion.append(float(paths[-1]['terminals'].any()))
                # calculate success rate for metaworld tasks
                if 'success' in paths[-1]['env_infos']:
                    success.append(paths[-1]['env_infos']['success'].any())

            if test:
                env = self.test_env[idx]()
                temp_traj = TrajectoryBatch.from_trajectory_list(
                    env, eval_paths)
            else:
                env = self.env[idx]()
                temp_traj = TrajectoryBatch.from_trajectory_list(
                    env, eval_paths)
            traj.append(temp_traj)

        if test:
            with tabular.prefix('Test/'):
                if self._test_task_names:
                    log_multitask_performance(
                        epoch,
                        TrajectoryBatch.concatenate(*traj),
                        self._discount,
                        task_names=self._test_task_names)
                log_performance(epoch,
                                TrajectoryBatch.concatenate(*traj),
                                self._discount,
                                prefix='Average')
        else:
            with tabular.prefix('Train/'):
                if self._train_task_names:
                    log_multitask_performance(
                        epoch,
                        TrajectoryBatch.concatenate(*traj),
                        self._discount,
                        task_names=self._train_task_names)
                log_performance(epoch,
                                TrajectoryBatch.concatenate(*traj),
                                self._discount,
                                prefix='Average')