Example #1
def get_returns(episodes):
    # Sum each episode's rewards over the time dimension (dim=0) and return
    # the per-episode returns as a NumPy array.
    return to_numpy([episode.rewards.sum(dim=0) for episode in episodes])
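Each episode object is expected to carry a `rewards` tensor of shape (horizon, num_trajectories), so summing over dim=0 yields one return per trajectory. A minimal usage sketch under that assumption; the `Episode` namedtuple and the `to_numpy` helper below are illustrative stand-ins, not the project's actual classes:

import numpy as np
import torch
from collections import namedtuple

# Stand-ins for illustration only.
Episode = namedtuple('Episode', ['rewards'])

def to_numpy(tensors):
    return np.stack([t.detach().cpu().numpy() for t in tensors])

episodes = [Episode(rewards=torch.ones(100, 20)),     # horizon=100, 20 trajectories
            Episode(rewards=torch.zeros(100, 20))]
print(get_returns(episodes).shape)                    # (2, 20): per-trajectory returns per batch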
Example #2
def get_returns(tasks, episodes):
    # For each task, the return is the negative Euclidean distance between
    # every observation and that task's goal, summed over the horizon; the
    # norm is taken over the observation axis (axis=2).
    ret = [
        -np.linalg.norm(np.array(episode.observations) -
                        np.expand_dims(tasks[taskIdx]['goal'], 0),
                        axis=2).sum(0)
        for taskIdx, episode in enumerate(episodes)
    ]
    return to_numpy(ret)
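Here the observations are assumed to have shape (horizon, num_trajectories, 2), as in a 2-D navigation task, so the norm over axis=2 is a per-step distance to the goal and .sum(0) accumulates it over time. A small numeric sketch under those assumptions; the goal and observation arrays are stand-ins for illustration:

import numpy as np

tasks = [{'goal': np.array([1.0, 1.0])}]
obs = np.zeros((3, 2, 2))          # horizon=3, 2 trajectories, 2-D positions
dist = np.linalg.norm(obs - np.expand_dims(tasks[0]['goal'], 0), axis=2)
print(dist.shape)                  # (3, 2): per-step distance for each trajectory
print(-dist.sum(0))                # approx. [-4.2426, -4.2426]: negative cumulative distance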
Example #3
    def step(self,
             train_futures,
             valid_futures,
             max_kl=1e-3,
             cg_iters=10,
             cg_damping=1e-2,
             ls_max_steps=10,
             ls_backtrack_ratio=0.5):
        num_tasks = len(train_futures[0])
        logs = {}
        # Compute the surrogate loss
        old_losses, old_kls, old_pis = self._async_gather([
            self.surrogate_loss(train, valid, old_pi=None)
            for (train, valid) in zip(zip(*train_futures), valid_futures)
        ])

        logs['loss_before'] = to_numpy(old_losses)
        logs['kl_before'] = to_numpy(old_kls)

        old_loss = sum(old_losses) / num_tasks
        grads = torch.autograd.grad(old_loss,
                                    self.policy.parameters(),
                                    retain_graph=True)
        grads = parameters_to_vector(grads)

        # Compute the step direction with Conjugate Gradient
        old_kl = sum(old_kls) / num_tasks
        hessian_vector_product = self.hessian_vector_product(
            old_kl, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product,
                                     grads,
                                     cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(
            stepdir, hessian_vector_product(stepdir, retain_graph=False))
        lagrange_multiplier = torch.sqrt(shs / max_kl)

        step = stepdir / lagrange_multiplier
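        # Scaling by the Lagrange multiplier makes the quadratic KL estimate of
        # the full step meet the trust-region bound exactly:
        #   0.5 * step^T H step = shs / lagrange_multiplier**2 = max_kl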

        # Save the old parameters
        old_params = parameters_to_vector(self.policy.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step,
                                 self.policy.parameters())

            losses, kls, _ = self._async_gather([
                self.surrogate_loss(train, valid, old_pi=old_pi)
                for (train, valid, old_pi) in zip(zip(
                    *train_futures), valid_futures, old_pis)
            ])

            improve = (sum(losses) / num_tasks) - old_loss
            kl = sum(kls) / num_tasks
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                logs['loss_after'] = to_numpy(losses)
                logs['kl_after'] = to_numpy(kls)
                break
            step_size *= ls_backtrack_ratio
        else:
            vector_to_parameters(old_params, self.policy.parameters())

        return logs
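The step direction above comes from solving H x = g with a matrix-free conjugate gradient, where H is only accessed through `hessian_vector_product`. Below is a minimal sketch of a solver matching the call signature `conjugate_gradient(f_Ax, b, cg_iters=...)`; the project ships its own implementation, so this is only an illustration of the standard algorithm:

import torch

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # Solve A x = b where A is only available through the matrix-vector
    # product f_Ax (here: the KL Hessian-vector product).
    x = torch.zeros_like(b)
    r = b.clone()                    # residual b - A x  (x starts at 0)
    p = b.clone()                    # search direction
    rdotr = torch.dot(r, r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / torch.dot(p, Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr.item() < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x.detach()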
Example #4
    def step(self,
             # futures=(train_episodes_futures, valid_episodes_futures)
             train_futures,
             valid_futures,
             max_kl=1e-3,
             cg_iters=10,
             cg_damping=1e-2,
             ls_max_steps=10,
             ls_backtrack_ratio=0.5):
        # Number of tasks
        num_tasks = len(train_futures[0])
        logs = {}

        """
        Compute the surrogate loss
        针对每一个 task, 计算 train 和 valid 对,对应的参数之类的
        此处的policy 可以是 GradientBasedMetaLearner 中继承至MultiTaskSampler的采样前的policy
        应该也可以设置为 MAMLTRPO 中的 original_policy,该policy直接来自原始的传参
        有待进一步验证两者 policy 是否一致
        """
        # 此处语句作用是 按 task 依次计算 surrogate_loss() 损失
        old_losses, old_kls, old_pis = self._async_gather([
            self.surrogate_loss(train,
                                valid,
                                old_pi=None)
            for (train, valid) in zip(zip(*train_futures), valid_futures)])

        logs['loss_before'] = to_numpy(old_losses)
        logs['kl_before'] = to_numpy(old_kls)

        """
        # Average the loss over tasks; the result is a scalar.
        old_loss = sum(old_losses) / num_tasks
        grads = torch.autograd.grad(old_loss,
                                    self.policy.parameters(),
                                    retain_graph=True)
        grads = parameters_to_vector(grads)

        # Compute the step direction with Conjugate Gradient
        # Average the KL divergence over tasks; the result is a scalar.
        old_kl = sum(old_kls) / num_tasks
        hessian_vector_product = self.hessian_vector_product(old_kl,
                                                             damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product,
                                     grads,
                                     cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(stepdir,
                              hessian_vector_product(stepdir, retain_graph=False))
        lagrange_multiplier = torch.sqrt(shs / max_kl)

        step = stepdir / lagrange_multiplier

        # Save the old parameters
        old_params = parameters_to_vector(self.policy.parameters())

        # vector_to_parameters( * , self.policy.parameters()) is what actually updates the network parameters.
        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step,
                                 self.policy.parameters())

            losses, kls, _ = self._async_gather([
                self.surrogate_loss(train, valid, old_pi=old_pi)
                for (train, valid, old_pi)
                in zip(zip(*train_futures), valid_futures, old_pis)])

            improve = (sum(losses) / num_tasks) - old_loss
            kl = sum(kls) / num_tasks
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                logs['loss_after'] = to_numpy(losses)
                logs['kl_after'] = to_numpy(kls)
                break
            step_size *= ls_backtrack_ratio
        else:
            vector_to_parameters(old_params, self.policy.parameters())

        # Inspect the final network parameters.
        params_final = self.policy.parameters()

        """
        # logs keys: 'loss_before', 'kl_before', 'loss_after', 'kl_after'
        return logs
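Both variants rely on `_async_gather` to run one `surrogate_loss` coroutine per task and then regroup the per-task result tuples into one sequence per quantity. A rough sketch of that gather-and-transpose pattern, using stand-in coroutines; the names and helper below are illustrative, not the project's actual implementation:

import asyncio

async def _demo_surrogate(task_id):
    # Stand-in for one surrogate_loss(train, valid, old_pi=None) call:
    # each task yields a (loss, kl, old_pi) tuple.
    return (f'loss_{task_id}', f'kl_{task_id}', f'pi_{task_id}')

def gather_and_transpose(coroutines):
    # Run all per-task coroutines, then transpose the list of per-task
    # tuples into one sequence per quantity (losses, kls, old policies).
    async def _run():
        return await asyncio.gather(*coroutines)
    results = asyncio.run(_run())
    return tuple(map(list, zip(*results)))

losses, kls, old_pis = gather_and_transpose([_demo_surrogate(i) for i in range(3)])
print(losses)    # ['loss_0', 'loss_1', 'loss_2']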
Example #5
def get_returns(episodes):
    return to_numpy([episode.rewards.sum(dim=0) for episode in episodes])