Example #1
    def natural_grad_qu(model, n_iter=1, step_size=step_rate, momentum=0.0):
        global mk_ant, mk_aux, mk, V_i, Vk, Lk, Vki_ant
        """Initialize the step-sizes"""
        beta2_k = step_size  #use step_size*0.1 for Convolutional MOGP
        gamma2_k = momentum
        alpha2_k = step_size
        N_posteriors = model.q_u_means.shape[1]

        if n_iter == 1:
            V_i = choleskies.multiple_dpotri(
                choleskies.flat_to_triang(model.q_u_chols.values)).copy()
            Vk = np.zeros_like(V_i)
            for i in range(N_posteriors):
                Vk[i, :, :] = 0.5 * (model.posteriors[i].covariance.copy() +
                                     model.posteriors[i].covariance.T.copy())

            Lk = np.zeros_like(Vk)
            mk = model.q_u_means.values.copy()

            Vki_ant = V_i.copy()
            mk_aux = mk.copy()

        dL_dm, dL_dV = compute_stoch_grads_for_qu_HetMOGP(model=model)

        if not model.q_u_means.is_fixed and not model.q_u_chols.is_fixed:
            # Keep copies of the previous mean iterate for the momentum term
            mk_ant = mk_aux.copy()
            mk_aux = mk.copy()

            for i in range(N_posteriors):
                try:
                    V_i[i, :, :] = V_i[i, :, :] + 2 * beta2_k * dL_dV[
                        i]  #+ 1.0e-6*np.eye(*Vk[i,:,:].shape)
                    Vk[i, :, :] = np.linalg.inv(V_i[i, :, :])
                    Vk[i, :, :] = 0.5 * (np.array(Vk[i, :, :]) +
                                         np.array(Vk[i, :, :].T))
                    Lk[i, :, :] = np.linalg.cholesky(Vk[i, :, :])
                    mk[:, i] = mk[:, i] - alpha2_k * np.dot(
                        Vk[i, :, :], dL_dm[i]) + gamma2_k * np.dot(
                            np.dot(Vk[i, :, :], Vki_ant[i, :, :]),
                            (mk[:, i] - mk_ant[:, i]))
                except LinAlgError:  # numpy.linalg.LinAlgError raised by the inverse/Cholesky above
                    print("LinAlgError: resetting the posterior covariance of latent function", i)
                    Vk[i, :, :] = 1.0e-1 * np.eye(
                        *Vk[i, :, :].shape
                    )  #nearestPD(Vk[i,:,:]) # + 1.0e-3*np.eye(*Vk[i,:,:].shape)
                    Lk[i, :, :] = linalg.jitchol(Vk[i, :, :])
                    V_i[i, :, :] = np.linalg.inv(Vk[i, :, :])
                    mk[:, i] = mk[:, i] * 0.0

            Vki_ant = V_i.copy()

            model.L_u.setfield(choleskies.triang_to_flat(Lk.copy()),
                               np.float64)
            model.m_u.setfield(mk.copy(), np.float64)
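The routine above takes a natural-gradient step in the precision parameterisation of each q(u): the inverse covariance moves along dL_dV, is re-inverted and Cholesky-factorised, and the mean is preconditioned by the new covariance (plus a momentum term). Below is a minimal numpy sketch of one such step on a toy Gaussian, assuming the gradients dL_dm and dL_dV are given; all names are illustrative and not part of the model API.

import numpy as np

M = 4
rng = np.random.default_rng(0)
m = np.zeros((M, 1))                     # current mean of q(u)
V = np.eye(M)                            # current covariance of q(u)
V_inv = np.linalg.inv(V)                 # precision: the natural parameter being updated
dL_dm = rng.standard_normal((M, 1))      # placeholder stochastic gradients of the ELBO
dL_dV = -0.05 * np.eye(M)

step_size = 1e-2
V_inv = V_inv + 2.0 * step_size * dL_dV  # precision-space step, as in the loop above
V = np.linalg.inv(V_inv)
V = 0.5 * (V + V.T)                      # symmetrise before the Cholesky
L = np.linalg.cholesky(V)                # raises LinAlgError if V is not PD -> reset/jitter
m = m - step_size * V @ dL_dm            # mean step preconditioned by the covariance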
Example #2
    def __init__(self,
                 X,
                 Y,
                 Z,
                 kern_list_uq,
                 kern_list_Gx,
                 kern_list_Tq,
                 likelihood,
                 Y_metadata,
                 name='ConvHetMOGP_VIK',
                 batch_size=None):

        self.batch_size = batch_size
        self.kern_list = kern_list_uq
        self.likelihood = likelihood
        self.Y_metadata = Y_metadata
        self.kern_list_Gdj = kern_list_Gx
        self.kern_list_Tq = kern_list_Tq

        self.num_inducing = Z.shape[0]  # M
        self.num_latent_funcs = len(kern_list_uq)  # Q
        self.num_output_funcs = likelihood.num_output_functions(
            self.Y_metadata)  #This is the number J in the paper
        self.W_list, self.kappa_list = util.random_W_kappas(
            self.num_latent_funcs, self.num_output_funcs, rank=1)

        # Verify that all kernel lists agree on Automatic Relevance Determination (ARD)
        check_ARD_uq = [kern.lengthscale.shape[0] > 1 for kern in kern_list_uq]
        check_ARD_Gx = [kern.lengthscale.shape[0] > 1 for kern in kern_list_Gx]
        check_ARD_Tq = [kern.lengthscale.shape[0] > 1 for kern in kern_list_Tq]
        all_checks = check_ARD_uq + check_ARD_Gx + check_ARD_Tq
        if not any(all_checks):
            isARD = False
        elif all(all_checks):
            isARD = True
        else:
            raise ValueError(
                'All kernel lists for Uq, Gx and Tq must coincide in Automatic '
                'Relevance Determination: either ARD=True or ARD=False for all kernels')

        self.kern_aux = GPy.kern.RBF(
            input_dim=Z.shape[1],
            lengthscale=1.0,
            variance=1.0,
            name='rbf_aux',
            ARD=isARD) + GPy.kern.White(input_dim=Z.shape[1])
        self.kern_aux.white.variance = 1e-6

        self.Xmulti = X
        self.Ymulti = Y

        # Batch the data
        self.Xmulti_all, self.Ymulti_all = X, Y
        if batch_size is None:
            #self.stochastic = False
            Xmulti_batch, Ymulti_batch = X, Y
        else:
            # Makes a climin slicer to make drawing minibatches much quicker
            #self.stochastic = False   #"This was True as Pablo had it"
            self.slicer_list = [
                draw_mini_slices(Xmulti_task.shape[0], self.batch_size)
                for Xmulti_task in self.Xmulti
            ]
            Xmulti_batch, Ymulti_batch = self.new_batch()
            self.Xmulti, self.Ymulti = Xmulti_batch, Ymulti_batch

        # Initialize inducing points Z
        self.Xdim = Z.shape[1]
        Z = np.tile(Z, (1, self.num_latent_funcs))
        inference_method = SVMOGPInf()

        super(ConvHetMOGP_VIK,
              self).__init__(X=Xmulti_batch[0][1:10],
                             Y=Ymulti_batch[0][1:10],
                             Z=Z,
                             kernel=kern_list_uq[0],
                             likelihood=likelihood,
                             mean_function=None,
                             X_variance=None,
                             inference_method=inference_method,
                             Y_metadata=Y_metadata,
                             name=name,
                             normalizer=False)

        self.unlink_parameter(
            self.kern)  # Unlink SparseGP default param kernel

        _, self.B_list = util.LCM(input_dim=self.Xdim,
                                  output_dim=self.num_output_funcs,
                                  rank=1,
                                  kernels_list=self.kern_list,
                                  W_list=self.W_list,
                                  kappa_list=self.kappa_list)

        # Set-up optimization parameters: [Z, m_u, L_u]
        self.q_u_means = [
            Param(
                'm_u' + str(dj), 10.0 *
                np.random.randn(self.num_inducing, self.num_latent_funcs) +
                10.0 * np.tile(np.random.randn(1, self.num_latent_funcs),
                               (self.num_inducing, 1)))
            for dj in range(self.num_output_funcs)
        ]
        chols = choleskies.triang_to_flat(
            np.tile(3 * np.eye(self.num_inducing)[None, :, :],
                    (self.num_latent_funcs, 1, 1)))
        self.q_u_chols = Param('L_u', chols)

        self.link_parameter(self.Z, index=0)
        [self.link_parameter(q_u_means) for q_u_means in self.q_u_means]
        self.link_parameters(self.q_u_chols)
        [self.link_parameter(kern_q)
         for kern_q in kern_list_uq]  # link all kernels
        [self.link_parameter(B_q) for B_q in self.B_list]
        [self.link_parameter(kern_Gjd) for kern_Gjd in kern_list_Gx]
        [self.link_parameter(kern_Tq) for kern_Tq in kern_list_Tq]
        #self.link_parameter(self.kern_aux.white.variance)

        self.vem_step = True  # [True=VE-step, False=VM-step]
        self.ve_count = 0
        self.elbo = np.zeros((1, 1))
        self.index_VEM = 0  #this is a variable to index correctly the self.elbo when using VEM
        self.Gauss_Newton = False  #This is a flag for using the Gauss-Newton approximation when dL_dV is needed

        for kern_q in self.kern_list:
            kern_q.variance = 1.0
            kern_q.variance.fix()
        for kern_Gjd in self.kern_list_Gdj:
            kern_Gjd.variance = 1.0
            kern_Gjd.variance.fix()
            #print('IN fix Gdj')
        for kern_Tq in self.kern_list_Tq:
            kern_Tq.variance = 1.0
            kern_Tq.variance.fix()
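A hedged construction sketch for the kernel lists this __init__ expects: Q kernels for the latent functions u_q, one smoothing kernel per output function for G, and Q kernels T_q, all agreeing on ARD (the constructor enforces this). Only the GPy.kern.RBF calls are real API; the sizes and variable values are illustrative.

import numpy as np
import GPy

Xdim, Q, J, M = 2, 3, 4, 20   # input dim, latent functions, output functions, inducing points
kern_list_uq = [GPy.kern.RBF(Xdim, lengthscale=1.0, variance=1.0, ARD=False) for _ in range(Q)]
kern_list_Gx = [GPy.kern.RBF(Xdim, lengthscale=0.5, variance=1.0, ARD=False) for _ in range(J)]
kern_list_Tq = [GPy.kern.RBF(Xdim, lengthscale=0.1, variance=1.0, ARD=False) for _ in range(Q)]
Z = np.random.rand(M, Xdim)   # inducing inputs; __init__ tiles them across the Q latent functions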
Example #3
    def __init__(self,
                 X,
                 Y,
                 Z,
                 kern_list,
                 likelihood,
                 Y_metadata,
                 name='SVMOGP',
                 batch_size=None):

        self.batch_size = batch_size
        self.kern_list = kern_list
        self.likelihood = likelihood
        self.Y_metadata = Y_metadata

        self.num_inducing = Z.shape[0]  # M
        self.num_latent_funcs = len(kern_list)  # Q
        self.num_output_funcs = likelihood.num_output_functions(
            self.Y_metadata)
        self.W_list, self.kappa_list = util.random_W_kappas(
            self.num_latent_funcs, self.num_output_funcs, rank=1)

        self.Xmulti = X
        self.Ymulti = Y

        # Batch the data
        self.Xmulti_all, self.Ymulti_all = X, Y
        if batch_size is None:
            self.stochastic = False
            Xmulti_batch, Ymulti_batch = X, Y
        else:
            # Makes a climin slicer to make drawing minibatches much quicker
            self.stochastic = True
            self.slicer_list = [
                draw_mini_slices(Xmulti_task.shape[0], self.batch_size)
                for Xmulti_task in self.Xmulti
            ]
            Xmulti_batch, Ymulti_batch = self.new_batch()
            self.Xmulti, self.Ymulti = Xmulti_batch, Ymulti_batch

        # Initialize inducing points Z
        #Z = kmm_init(self.X_all, self.num_inducing)
        self.Xdim = Z.shape[1]
        Z = np.tile(Z, (1, self.num_latent_funcs))

        inference_method = SVMOGPInf()

        super(SVMOGP, self).__init__(X=Xmulti_batch[0][1:10],
                                     Y=Ymulti_batch[0][1:10],
                                     Z=Z,
                                     kernel=kern_list[0],
                                     likelihood=likelihood,
                                     mean_function=None,
                                     X_variance=None,
                                     inference_method=inference_method,
                                     Y_metadata=Y_metadata,
                                     name=name,
                                     normalizer=False)

        self.unlink_parameter(
            self.kern)  # Unlink SparseGP default param kernel

        _, self.B_list = util.LCM(input_dim=self.Xdim,
                                  output_dim=self.num_output_funcs,
                                  rank=1,
                                  kernels_list=self.kern_list,
                                  W_list=self.W_list,
                                  kappa_list=self.kappa_list)

        # Set-up optimization parameters: [Z, m_u, L_u]
        self.q_u_means = Param(
            'm_u',
            5 * np.random.randn(self.num_inducing, self.num_latent_funcs) +
            np.tile(np.random.randn(1, self.num_latent_funcs),
                    (self.num_inducing, 1)))
        chols = choleskies.triang_to_flat(
            np.tile(
                np.eye(self.num_inducing)[None, :, :],
                (self.num_latent_funcs, 1, 1)))
        self.q_u_chols = Param('L_u', chols)

        self.link_parameter(self.Z, index=0)
        self.link_parameter(self.q_u_means)
        self.link_parameters(self.q_u_chols)
        [self.link_parameter(kern_q)
         for kern_q in kern_list]  # link all kernels
        [self.link_parameter(B_q) for B_q in self.B_list]

        self.vem_step = True  # [True=VE-step, False=VM-step]
        self.ve_count = 0
        self.elbo = np.zeros((1, 1))
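The batching branch above relies on draw_mini_slices and new_batch(). The following is a stand-in sketch of such a slicer, not the library's implementation: an endless generator of minibatch index arrays over random permutations, one generator per output/task.

import numpy as np

def draw_mini_slices_sketch(n_samples, batch_size):
    # Endless generator of minibatch index arrays over random permutations of the data
    while True:
        perm = np.random.permutation(n_samples)
        for start in range(0, n_samples, batch_size):
            yield perm[start:start + batch_size]

# One slicer per output/task, as stored in self.slicer_list above
X_tasks = [np.random.randn(100, 2), np.random.randn(80, 2)]
slicers = [draw_mini_slices_sketch(X.shape[0], 16) for X in X_tasks]
X_batch = [X[next(s)] for X, s in zip(X_tasks, slicers)]   # roughly what new_batch() returns per task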
Example #4
    def calculate_gradients(self, q_U, S_u, Su_add_Kuu_chol, p_U, q_F, VE_dm,
                            VE_dv, Ntask, M, Q, D, f_index, d_index, q):
        """
        Calculates gradients of the Log-marginal distribution p(Y) wrt variational
        parameters mu_q, S_q
        """
        # Algebra for q(u) and p(u):
        m_u = q_U.mu_u.copy()
        #L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        #S_u = np.empty((Q, M, M))
        #[np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
        Kuu = p_U.Kuu.copy()
        Luu = p_U.Luu.copy()
        Kuui = p_U.Kuui.copy()
        S_qi, _ = linalg.dpotri(np.asfortranarray(Su_add_Kuu_chol[q, :, :]))

        if np.any(np.isinf(S_qi)):
            raise ValueError("Sqi: Cholesky representation unstable")

        # KL Terms
        dKL_dmu_q = []
        dKL_dKqq = 0
        for d in range(D):
            dKL_dmu_q.append(np.dot(Kuui[q, :, :], m_u[d][:, q, None]))  #same
            dKL_dKqq += -0.5 * S_qi + 0.5 * Kuui[q, :, :] - 0.5 * Kuui[q, :, :].dot(S_u[q, :, :]).dot(Kuui[q, :, :]) \
                       - 0.5 * np.dot(Kuui[q, :, :], np.dot(m_u[d][:, q, None], m_u[d][:, q, None].T)).dot(Kuui[q, :, :].T)  # same
        #dKL_dS_q = 0.5 * (Kuui[q,:,:] - S_qi)             #old
        dKL_dS_q = 0.5 * (Kuui[q, :, :] - S_qi) * D

        # VE Terms
        #dVE_dmu_q = np.zeros((M, 1))
        dVE_dmu_q = []
        dVE_dS_q = np.zeros((M, M))
        dVE_dKqq = np.zeros((M, M))
        dVE_dKqd = []
        dVE_dKdiag = []
        dL_dmu_q = []

        for d, q_fd in enumerate(q_F):
            Nt = Ntask[f_index[d]]
            dVE_dmu_q.append(
                np.dot(q_fd.Afdu[q, :, :].T,
                       VE_dm[f_index[d]][:, d_index[d]])[:, None])
            dL_dmu_q.append(dVE_dmu_q[d] - dKL_dmu_q[d])
            Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d],
                                                           None].T
            Adv = np.ascontiguousarray(Adv)
            AdvA = np.dot(Adv.reshape(-1, Nt),
                          q_fd.Afdu[q, :, :]).reshape(M, M)
            dVE_dS_q += AdvA

            # Derivatives dKuquq
            tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui[q, :, :])
            dVE_dKqq += -tmp_dv - tmp_dv.T  #+ AdvA last term not included in the derivative
            Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d],
                                                                 None])
            dVE_dKqq += -np.dot(Adm,
                                np.dot(Kuui[q, :, :], m_u[d][:, q, None]).T)

            # Derivatives dKuqfd
            tmp = np.dot(S_u[q, :, :], Kuui[q, :, :])
            tmp = 2. * tmp  #2. * (tmp - np.eye(M))  # the term -2Adv not included
            dve_kqd = np.dot(np.dot(Kuui[q, :, :], m_u[d][:, q, None]),
                             VE_dm[f_index[d]][:, d_index[d], None].T)
            dve_kqd += np.dot(tmp.T, Adv)
            dVE_dKqd.append(dve_kqd)

            # Derivatives dKdiag
            dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

        dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)
        # Sum of VE and KL terms
        #dL_dmu_q = dVE_dmu_q - dKL_dmu_q
        dL_dS_q = dVE_dS_q - dKL_dS_q
        dL_dKqq = dVE_dKqq - dKL_dKqq
        dL_dKdq = dVE_dKqd
        dL_dKdiag = dVE_dKdiag

        # Pass S_q gradients to its low-triangular representation L_q
        chol_u = q_U.chols_u.copy()
        L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
        dL_dL_q = 2. * np.array(
            [np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
        dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

        # Posterior
        posterior_q = []
        for d in range(D):
            posterior_q.append(
                Posterior(mean=m_u[d][:, q, None],
                          cov=S_u[q, :, :] + Kuu[q, :, :],
                          K=Kuu[q, :, :],
                          prior_mean=np.zeros(m_u[d][:, q, None].shape)))

        return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag
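The step from dL_dS_q to dL_dL_q above uses the chain rule for S = L L^T with a symmetric gradient G = dF/dS, which gives dF/dL = 2 G L (lower triangle only, matching the flat storage). A small numpy check of that identity against finite differences, using an illustrative F(S) = trace(A S):

import numpy as np

M = 4
rng = np.random.default_rng(1)
L = np.tril(rng.standard_normal((M, M))) + M * np.eye(M)   # lower-triangular factor
A = rng.standard_normal((M, M)); A = 0.5 * (A + A.T)       # fixed symmetric matrix

F = lambda Lmat: np.sum(A * (Lmat @ Lmat.T))               # F(S) = trace(A S), so dF/dS = A
analytic = np.tril(2.0 * A @ L)                            # chain rule: dF/dL = 2 (dF/dS) L

eps, numeric = 1e-6, np.zeros_like(L)
for i in range(M):
    for j in range(i + 1):
        E = np.zeros_like(L); E[i, j] = eps
        numeric[i, j] = (F(L + E) - F(L - E)) / (2 * eps)
assert np.allclose(analytic, numeric, atol=1e-4)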
Example #5
    def calculate_gradients(self, q_U, p_U, q_F, VE_dm, VE_dv, Ntask, M, Q, D,
                            f_index, d_index, j):
        """
        Calculates gradients of the Log-marginal distribution p(Y) wrt variational
        parameters mu_q, S_q
        """
        # Algebra for q(u) and p(u):
        m_u = q_U.mu_u.copy()
        L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        #S_u = np.empty((Q, M, M))
        S_u = np.dot(
            L_u[j, :, :], L_u[j, :, :].T
        )  # This could be computed outside and passed in to reduce computation
        #[np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
        Kuu = p_U.Kuu.copy()
        Luu = p_U.Luu.copy()
        Kuui = p_U.Kuui.copy()
        S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[j, :, :]))

        if np.any(np.isinf(S_qi)):
            raise ValueError("Sqi: Cholesky representation unstable")

        # KL Terms
        dKL_dmu_j = np.dot(Kuui[j, :, :], m_u[:, j, None])
        dKL_dS_j = 0.5 * (Kuui[j, :, :] - S_qi)
        dKL_dKjj = 0.5 * Kuui[j,:,:] - 0.5 * Kuui[j,:,:].dot(S_u).dot(Kuui[j,:,:]) \
                   - 0.5 * np.dot(Kuui[j,:,:],np.dot(m_u[:, j, None],m_u[:, j, None].T)).dot(Kuui[j,:,:].T)

        # VE Terms
        dVE_dmu_j = np.zeros((M, 1))
        dVE_dS_j = np.zeros((M, M))
        dVE_dKjj = np.zeros((M, M))
        dVE_dKjd = []
        dVE_dKdiag = []

        Nt = Ntask[f_index[j]]
        dVE_dmu_j += np.dot(q_F[j].Afdu.T,
                            VE_dm[f_index[j]][:, d_index[j]])[:, None]
        Adv = q_F[j].Afdu.T * VE_dv[f_index[j]][:, d_index[j], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_F[j].Afdu).reshape(M, M)
        dVE_dS_j += AdvA

        # Derivatives dKuquq
        tmp_dv = np.dot(AdvA, S_u).dot(Kuui[j, :, :])
        dVE_dKjj += AdvA - tmp_dv - tmp_dv.T
        Adm = np.dot(q_F[j].Afdu.T, VE_dm[f_index[j]][:, d_index[j], None])
        dVE_dKjj += -np.dot(Adm, np.dot(Kuui[j, :, :], m_u[:, j, None]).T)

        # Derivatives dKuqfd
        tmp = np.dot(S_u, Kuui[j, :, :])
        tmp = 2. * (tmp - np.eye(M))
        dve_kjd = np.dot(np.dot(Kuui[j, :, :], m_u[:, j, None]),
                         VE_dm[f_index[j]][:, d_index[j], None].T)
        dve_kjd += np.dot(tmp.T, Adv)
        dVE_dKjd.append(dve_kjd)

        # Derivatives dKdiag
        dVE_dKdiag.append(VE_dv[f_index[j]][:, d_index[j]])

        dVE_dKjj = 0.5 * (dVE_dKjj + dVE_dKjj.T)
        # Sum of VE and KL terms
        dL_dmu_j = dVE_dmu_j - dKL_dmu_j
        dL_dS_j = dVE_dS_j - dKL_dS_j
        dL_dKjj = dVE_dKjj - dKL_dKjj
        dL_dKdj = dVE_dKjd[0].copy()  # Here we just pass the unique position
        dL_dKdiag = dVE_dKdiag[0].copy()  # Here we just pass the unique position

        # Pass S_q gradients to its low-triangular representation L_q
        chol_u = q_U.chols_u.copy()
        L_j = choleskies.flat_to_triang(chol_u[:, j:j + 1])
        dL_dL_j = 2. * np.array(
            [np.dot(a, b) for a, b in zip(dL_dS_j[None, :, :], L_j)])
        dL_dL_j = choleskies.triang_to_flat(dL_dL_j)

        # Posterior
        posterior_j = Posterior(mean=m_u[:, j, None],
                                cov=S_u,
                                K=Kuu[j, :, :],
                                prior_mean=np.zeros(m_u[:, j, None].shape))

        return dL_dmu_j, dL_dL_j, dL_dS_j, posterior_j, dL_dKjj, dL_dKdj, dL_dKdiag
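The KL gradients used above are the standard closed forms for KL( N(m, S) || N(0, Kuu) ): dKL/dm = Kuu^{-1} m and dKL/dS = 0.5 (Kuu^{-1} - S^{-1}). A quick finite-difference check of the mean gradient with made-up positive-definite matrices:

import numpy as np

M = 3
rng = np.random.default_rng(2)
A = rng.standard_normal((M, M)); K = A @ A.T + M * np.eye(M)   # prior covariance Kuu
B = rng.standard_normal((M, M)); S = B @ B.T + M * np.eye(M)   # variational covariance
m = rng.standard_normal((M, 1))
Ki = np.linalg.inv(K)

def kl(m_, S_):
    return 0.5 * (np.trace(Ki @ S_) + float(m_.T @ Ki @ m_) - M
                  + np.linalg.slogdet(K)[1] - np.linalg.slogdet(S_)[1])

eps = 1e-6
num_dm = np.array([(kl(m + eps * np.eye(M)[:, [i]], S) -
                    kl(m - eps * np.eye(M)[:, [i]], S)) / (2 * eps) for i in range(M)])
assert np.allclose(num_dm, (Ki @ m).ravel(), atol=1e-5)    # matches dKL_dmu_j above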
Example #6
    def __init__(self,
                 X,
                 Y,
                 Z,
                 kern_list,
                 likelihood,
                 mean_functions=None,
                 name='SVGPMulti',
                 Y_metadata=None,
                 batchsize=None):
        """
        Extension to the SVGP to allow multiple latent function,
        where the latent functions are assumed independant (have one kernel per latent function)
        """
        # super(SVGPMulti, self).__init__(name)  # Parameterized.__init__(self)

        assert X.ndim == 2
        self.Y_metadata = Y_metadata
        _, self.output_dim = Y.shape

        # self.Z = Param('inducing inputs', Z)
        # self.num_inducing = Z.shape[0]
        # self.likelihood = likelihood

        self.kern_list = kern_list
        self.batchsize = batchsize

        #Batch the data
        self.X_all, self.Y_all = X, Y
        if batchsize is None:
            X_batch, Y_batch = X, Y
        else:
            import climin.util
            #Make a climin slicer to make drawing minibatches much quicker
            self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0],
                                                       self.batchsize)
            X_batch, Y_batch = self.new_batch()

        # if isinstance(X_batch, (ObsAr, VariationalPosterior)):
        # self.X = X_batch.copy()
        # else:
        # self.X = ObsAr(X_batch)
        # self.Y = Y_batch

        #create the SVI inference method
        # self.inference_method = svgp_inf()
        inference_method = svgp_inf()

        #Initialize base model
        super(SVGPMulti, self).__init__(X=X_batch,
                                        Y=Y_batch,
                                        Z=Z,
                                        kernel=kern_list[0],
                                        likelihood=likelihood,
                                        mean_function=None,
                                        X_variance=None,
                                        inference_method=inference_method,
                                        name=name,
                                        Y_metadata=Y_metadata,
                                        normalizer=False)
        self.unlink_parameter(self.kern)  # We don't want a single kern

        # self.num_data, self.input_dim = self.X.shape
        self.num_outputs = self.Y.shape[1]

        self.num_latent_funcs = self.likelihood.request_num_latent_functions(
            self.Y_all)

        #Make a latent function per dimension
        self.q_u_means = Param(
            'q_u_means', np.zeros((self.num_inducing, self.num_latent_funcs)))
        chols = choleskies.triang_to_flat(
            np.tile(
                np.eye(self.num_inducing)[None, :, :],
                (self.num_latent_funcs, 1, 1)))
        self.q_u_chols = Param('qf_u_chols', chols)

        self.link_parameter(self.Z, index=0)
        self.link_parameter(self.q_u_means)
        self.link_parameter(self.q_u_chols)
        # self.link_parameter(self.likelihood)

        #Must pass a list of kernels that work on each latent function for now
        assert len(kern_list) == self.num_latent_funcs
        #Add the rest of the kernels, one kernel per latent function
        [self.link_parameter(kern) for kern in kern_list]
        #self.latent_f_list = [self.mf, self.mg]
        #self.latent_fchol_list = [self.cholf, self.cholg]

        if mean_functions is None:
            self.mean_functions = [None] * self.num_latent_funcs
        elif len(mean_functions) != len(kern_list):
            raise ValueError("Must provide a mean function for all latent\n\
                             functions as a list, provide None if no latent\n\
                             function is needed for a specific latent function"
                             )
        else:
            self.mean_functions = []
            for m_f in mean_functions:
                if m_f is not None:
                    self.link_parameter(m_f)
                self.mean_functions.append(m_f)
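q_u_chols above stores all Q Cholesky factors in GPy's flat format. A minimal round-trip sketch using GPy.util.choleskies (the module imported as `choleskies` in this code), with small illustrative sizes:

import numpy as np
from GPy.util import choleskies

Q, M = 3, 5
L_stack = np.tile(np.eye(M)[None, :, :], (Q, 1, 1))   # Q identity Cholesky factors, shape (Q, M, M)
flat = choleskies.triang_to_flat(L_stack)             # flat storage handed to Param('qf_u_chols', ...)
L_back = choleskies.flat_to_triang(flat)              # recover the stacked lower-triangular factors
assert np.allclose(L_back, L_stack)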
Example #7
    def calculate_gradients(self, log_marginal, latent_info, dF_dmu, dF_dv, num_inducing, num_outputs, num_data):
        """
        Given a named tuple for lots of parameters of the latent function, calculate the
        gradients wrt to its latent functions and kernel
        """
        l = latent_info
        #derivatives of expected likelihood, assuming zero mean function
        #Adv = l.A.T[:, :, None]*dF_dv[None, :, :] # As if dF_Dv is diagonal
        Adv = l.A[None,:,:]*dF_dv.T[:,None,:] # As if dF_Dv is diagonal, D, M, N
        #Admu = l.A.T.dot(dF_dmu)
        Admu = l.A.dot(dF_dmu)
        #AdvA = np.dstack([np.dot(l.A.T, Adv[:,:,i].T) for i in range(num_outputs)])
        Adv = np.ascontiguousarray(Adv) # makes for faster operations later...(inc dsymm)
        AdvA = np.dot(Adv.reshape(-1, num_data),l.A.T).reshape(num_outputs, num_inducing, num_inducing )
        #tmp = linalg.ijk_jlk_to_il(AdvA, l.S).dot(l.Kmmi)
        tmp = np.sum([np.dot(a,s) for a, s in zip(AdvA, l.S)],0).dot(l.Kmmi)
        #dF_dKmm = -Admu.dot(l.Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
        dF_dKmm = -Admu.dot(l.Kmmim.T) + AdvA.sum(0) - tmp - tmp.T
        dF_dKmm = 0.5*(dF_dKmm + dF_dKmm.T) # necessary? GPy bug?
        #tmp = 2.*(linalg.ij_jlk_to_ilk(l.Kmmi, l.S) - np.eye(num_inducing)[:,:,None])
        tmp = l.S.reshape(-1, num_inducing).dot(l.Kmmi).reshape(num_outputs, num_inducing , num_inducing )
        tmp = 2.*(tmp - np.eye(num_inducing)[None, :,:])
        #dF_dKmn = linalg.ijk_jlk_to_il(tmp, Adv) + l.Kmmim.dot(dF_dmu.T)
        dF_dKmn = l.Kmmim.dot(dF_dmu.T)
        for a,b in zip(tmp, Adv):
            dF_dKmn += np.dot(a.T, b)
        dF_dm = Admu
        dF_dS = AdvA

        #gradient of the KL term (assuming zero mean function)
        #Si,_ = linalg.dpotri(np.asfortranarray(L), lower=1)
        Si = choleskies.multiple_dpotri(l.L)

        if np.any(np.isinf(Si)):
            raise ValueError("Cholesky representation unstable")
            #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
            #Si, Lnew, _,_ = linalg.pdinv(S)

        dKL_dm = l.Kmmim.copy()
        #dKL_dS = 0.5*(l.Kmmi[:,:,None] - Si)
        dKL_dS = 0.5*(l.Kmmi[None,:,:] - Si)
        #dKL_dKmm = 0.5*num_outputs*l.Kmmi - 0.5*l.Kmmi.dot(l.S.sum(-1)).dot(l.Kmmi) - 0.5*l.Kmmim.dot(l.Kmmim.T)
        dKL_dKmm = 0.5*num_outputs*l.Kmmi - 0.5*l.Kmmi.dot(l.S.sum(0)).dot(l.Kmmi) - 0.5*l.Kmmim.dot(l.Kmmim.T)

        #adjust gradient to account for mean function
        dL_dmfZ = None
        dL_dmfX = None
        KL = l.KL
        if l.mean_function is not None:
            #adjust KL term for mean function
            Kmmi_mfZ = np.dot(l.Kmmi, l.prior_mean_u)
            KL += -np.sum(l.q_u_mean*Kmmi_mfZ)
            KL += 0.5*np.sum(Kmmi_mfZ*l.prior_mean_u)

            #adjust gradient for mean function
            dKL_dm -= Kmmi_mfZ
            dKL_dKmm += l.Kmmim.dot(Kmmi_mfZ.T)
            dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)

            #compute gradients for mean_function
            dKL_dmfZ = Kmmi_mfZ - l.Kmmim

            dF_dmfX = dF_dmu.copy()
            dF_dmfZ = -Admu
            dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
            dF_dKmm += Admu.dot(Kmmi_mfZ.T)

            dL_dmfZ = dF_dmfZ - dKL_dmfZ
            dL_dmfX = dF_dmfX

        dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS - dKL_dS, dF_dKmm - dKL_dKmm, dF_dKmn

        #dL_dchol = np.dstack([2.*np.dot(dL_dS[:, :, i], l.L[: , :,i]) for i in range(num_outputs)])
        dL_dchol = 2.*np.array([np.dot(a,b) for a, b in zip(dL_dS, l.L) ])
        dL_dchol = choleskies.triang_to_flat(dL_dchol)

        log_marginal -= KL

        dL_dKdiag = dF_dv.sum(1)
        return log_marginal, dL_dKmm, dL_dKmn, dL_dKdiag, dL_dm, dL_dchol, dL_dmfZ, dL_dmfX
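The mean-function branch above shifts the KL term from a zero-mean prior to a prior with mean m0 = prior_mean_u by adding -m^T K^{-1} m0 + 0.5 m0^T K^{-1} m0, which is exactly what the two np.sum lines on KL compute. A small numpy check of that adjustment with made-up matrices:

import numpy as np

M = 3
rng = np.random.default_rng(3)
A = rng.standard_normal((M, M)); K = A @ A.T + M * np.eye(M)   # prior covariance Kmm
B = rng.standard_normal((M, M)); S = B @ B.T + M * np.eye(M)   # variational covariance
m, m0 = rng.standard_normal((M, 1)), rng.standard_normal((M, 1))
Ki = np.linalg.inv(K)

def kl(mean_diff):
    # KL( N(m, S) || N(prior_mean, K) ) written in terms of mean_diff = m - prior_mean
    return 0.5 * (np.trace(Ki @ S) + float(mean_diff.T @ Ki @ mean_diff) - M
                  + np.linalg.slogdet(K)[1] - np.linalg.slogdet(S)[1])

adjustment = float(-m.T @ Ki @ m0 + 0.5 * m0.T @ Ki @ m0)
assert np.isclose(kl(m - m0), kl(m) + adjustment)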
Example #8
    def elbo_derivatives(self, q_U, p_U, q_F, VE_dm, VE_dv, Ntask, dims,
                         f_index, d_index, q):
        """
        Description:  Returns ELBO derivatives w.r.t. variational parameters and hyperparameters
        Equation:     gradients = {dL/dmu, dL/dS, dL/dKmm, dL/Kmn, dL/dKdiag}
        Paper:        In Appendix 4 and 5
        Extra_Info:   Gradients w.r.t. hyperparameters use chain-rule and GPy. Note that Kmm, Kmn, Kdiag are matrices
        """
        Q = dims['Q']
        M = dims['M']

        #------------------------------------#      ALGEBRA FOR DERIVATIVES       #------------------------------------#
        #######  Algebra for q(u) and p(u)  #######
        m_u = q_U.mu_u.copy()
        L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        S_u = np.empty((Q, M, M))
        [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
        Kuu = p_U.Kuu.copy()
        Kuui = p_U.Kuui.copy()
        S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[q, :, :]))

        if np.any(np.isinf(S_qi)):
            raise ValueError("Sqi: Cholesky representation unstable")

        #-------------------------------------#      DERIVATIVES OF ELBO TERMS      #----------------------------------#
        #######  KL Terms  #######
        dKL_dmu_q = np.dot(Kuui[q, :, :], m_u[:, q, None])
        dKL_dS_q = 0.5 * (Kuui[q, :, :] - S_qi)
        dKL_dKqq = 0.5 * Kuui[q, :, :] - 0.5 * Kuui[q, :, :].dot(S_u[q, :, :]).dot(Kuui[q, :, :]) \
                   - 0.5 * np.dot(Kuui[q, :, :], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui[q, :, :].T)

        ####### Variational Expectation (VE) Terms #######
        dVE_dmu_q = np.zeros((M, 1))
        dVE_dS_q = np.zeros((M, M))
        dVE_dKqq = np.zeros((M, M))
        dVE_dKqd = []
        dVE_dKdiag = []

        for d, q_fd in enumerate(q_F):
            Nt = Ntask[f_index[d]]
            dVE_dmu_q += np.dot(q_fd.Afdu[q, :, :].T,
                                VE_dm[f_index[d]][:, d_index[d]])[:, None]
            Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d],
                                                           None].T
            Adv = np.ascontiguousarray(Adv)
            AdvA = np.dot(Adv.reshape(-1, Nt),
                          q_fd.Afdu[q, :, :]).reshape(M, M)
            dVE_dS_q += AdvA

            #######  Derivatives dKuquq #######
            tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui[q, :, :])
            dVE_dKqq += AdvA - tmp_dv - tmp_dv.T
            Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d],
                                                                 None])
            dVE_dKqq += -np.dot(Adm, np.dot(Kuui[q, :, :], m_u[:, q, None]).T)

            #######  Derivatives dKuqfd  #######
            tmp = np.dot(S_u[q, :, :], Kuui[q, :, :])
            tmp = 2. * (tmp - np.eye(M))
            dve_kqd = np.dot(np.dot(Kuui[q, :, :], m_u[:, q, None]),
                             VE_dm[f_index[d]][:, d_index[d], None].T)
            dve_kqd += np.dot(tmp.T, Adv)
            dVE_dKqd.append(dve_kqd)

            #######  Derivatives dKdiag  #######
            dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

        dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

        #--------------------------------------#      FINAL ELBO DERIVATIVES      #------------------------------------#
        #######  ELBO derivatives ---> sum of VE and KL terms  #######
        dL_dmu_q = dVE_dmu_q - dKL_dmu_q
        dL_dS_q = dVE_dS_q - dKL_dS_q
        dL_dKqq = dVE_dKqq - dKL_dKqq
        dL_dKdq = dVE_dKqd
        dL_dKdiag = dVE_dKdiag

        ####### Pass S_q gradients to its low-triangular representation L_q  #######
        chol_u = q_U.chols_u.copy()
        L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
        dL_dL_q = 2. * np.array(
            [np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
        dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

        return dL_dmu_q, dL_dL_q, dL_dS_q, dL_dKqq, dL_dKdq, dL_dKdiag
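The Adv / AdvA lines above compute A^T diag(v) A for each output without forming the diagonal matrix: the columns of A^T are scaled by v and then multiplied back by A. A small numpy check of that identity, with shapes matching Afdu of shape (N, M) and a gradient column v of length N:

import numpy as np

N, M = 20, 5
rng = np.random.default_rng(4)
A = rng.standard_normal((N, M))      # plays the role of q_fd.Afdu[q]
v = rng.standard_normal(N)           # plays the role of VE_dv[f_index[d]][:, d_index[d]]

Adv = A.T * v[None, :]               # (M, N): column j of A^T scaled by v_j
AdvA = Adv @ A                       # equals A^T @ diag(v) @ A, shape (M, M)
assert np.allclose(AdvA, A.T @ np.diag(v) @ A)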
Example #9
    def __init__(self, X, Y, Z, kern_list, likelihood, mean_functions=None, name='SVGPMulti', Y_metadata=None, batchsize=None):
        """
        Extension to the SVGP to allow multiple latent function,
        where the latent functions are assumed independant (have one kernel per latent function)
        """
        # super(SVGPMulti, self).__init__(name)  # Parameterized.__init__(self)

        assert X.ndim == 2
        self.Y_metadata = Y_metadata
        _, self.output_dim = Y.shape

        # self.Z = Param('inducing inputs', Z)
        # self.num_inducing = Z.shape[0]
        # self.likelihood = likelihood

        self.kern_list = kern_list
        self.batchsize = batchsize

        #Batch the data
        self.X_all, self.Y_all = X, Y
        if batchsize is None:
            X_batch, Y_batch = X, Y
        else:
            import climin.util
            #Make a climin slicer to make drawing minibatches much quicker
            self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
            X_batch, Y_batch = self.new_batch()

        # if isinstance(X_batch, (ObsAr, VariationalPosterior)):
            # self.X = X_batch.copy()
        # else:
            # self.X = ObsAr(X_batch)
        # self.Y = Y_batch

        #create the SVI inference method
        # self.inference_method = svgp_inf()
        inference_method = svgp_inf()

        #Initialize base model
        super(SVGPMulti, self).__init__(X=X_batch, Y=Y_batch, Z=Z, kernel=kern_list[0], likelihood=likelihood, mean_function=None, X_variance=None, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=False)
        self.unlink_parameter(self.kern)  # We don't want a single kern

        # self.num_data, self.input_dim = self.X.shape
        self.num_outputs = self.Y.shape[1]

        self.num_latent_funcs = self.likelihood.request_num_latent_functions(self.Y_all)

        #Make a latent function per dimension
        self.q_u_means = Param('q_u_means', np.zeros((self.num_inducing, self.num_latent_funcs)))
        chols = choleskies.triang_to_flat(np.tile(np.eye(self.num_inducing)[None,:,:], (self.num_latent_funcs,1,1)))
        self.q_u_chols = Param('qf_u_chols', chols)

        self.link_parameter(self.Z, index=0)
        self.link_parameter(self.q_u_means)
        self.link_parameter(self.q_u_chols)
        # self.link_parameter(self.likelihood)

        #Must pass a list of kernels that work on each latent function for now
        assert len(kern_list) == self.num_latent_funcs
        #Add the rest of the kernels, one kernel per latent function
        [self.link_parameter(kern) for kern in kern_list]
        #self.latent_f_list = [self.mf, self.mg]
        #self.latent_fchol_list = [self.cholf, self.cholg]

        if mean_functions is None:
            self.mean_functions = [None]*self.num_latent_funcs
        elif len(mean_functions) != len(kern_list):
            raise ValueError("Must provide a mean function for all latent\n\
                             functions as a list, provide None if no latent\n\
                             function is needed for a specific latent function")
        else:
            self.mean_functions = []
            for m_f in mean_functions:
                if m_f is not None:
                    self.link_parameter(m_f)
                self.mean_functions.append(m_f)
Example #10
    def __init__(self,
                 X,
                 Y,
                 Z,
                 kern_list,
                 likelihood,
                 Y_metadata,
                 name='HetMOGP',
                 batch_size=None):
        """
        :param X:           Input data
        :param Y:           (Heterogeneous) Output data
        :param Z:           Inducing inputs
        :param kern_list:   Kernel functions of GP priors
        :param likelihood:  (Heterogeneous) Likelihoods
        :param Y_metadata:  Linking info between F->likelihoods
        :param name:        Model name
        :param batch_size:  Size of batch for stochastic optimization

        Description: Initialization method for the model class
        """

        #---------------------------------------#     INITIALIZATIONS     #--------------------------------------------#
        #######   Initialization of class variables  #######
        self.batch_size = batch_size
        self.kern_list = kern_list
        self.likelihood = likelihood
        self.Y_metadata = Y_metadata

        #######   Heterogeneous Data  #######
        self.Xmulti = X
        self.Ymulti = Y

        #######  Batches of Data for Stochastic Mode   #######
        self.Xmulti_all, self.Ymulti_all = X, Y
        if batch_size is None:
            self.stochastic = False
            Xmulti_batch, Ymulti_batch = X, Y
        else:
            #######   Makes a climin slicer to make drawing minibatches much quicker   #######
            self.stochastic = True
            self.slicer_list = [
                draw_mini_slices(Xmulti_task.shape[0], self.batch_size)
                for Xmulti_task in self.Xmulti
            ]
            Xmulti_batch, Ymulti_batch = self.new_batch()
            self.Xmulti, self.Ymulti = Xmulti_batch, Ymulti_batch

        #######   Model dimensions {M, Q, D}  #######
        self.num_inducing = Z.shape[0]  # M
        self.num_latent_funcs = len(kern_list)  # Q
        self.num_output_funcs = likelihood.num_output_functions(
            self.Y_metadata)

        ####### Inducing points Z #######
        self.Xdim = Z.shape[1]
        Z = np.tile(Z, (1, self.num_latent_funcs))

        #######   Inference   #######
        inference_method = Inference()

        #######  Model class (and inherited classes) super-initialization  #######
        super(HetMOGP, self).__init__(X=Xmulti_batch[0][1:10],
                                      Y=Ymulti_batch[0][1:10],
                                      Z=Z,
                                      kernel=kern_list[0],
                                      likelihood=likelihood,
                                      mean_function=None,
                                      X_variance=None,
                                      inference_method=inference_method,
                                      Y_metadata=Y_metadata,
                                      name=name,
                                      normalizer=False)

        #######  Initialization of the Multi-output GP mixing  #######
        self.W_list, self.kappa_list = multi_output.random_W_kappas(
            self.num_latent_funcs, self.num_output_funcs, rank=1)
        _, self.B_list = multi_output.LCM(input_dim=self.Xdim,
                                          output_dim=self.num_output_funcs,
                                          rank=1,
                                          kernels_list=self.kern_list,
                                          W_list=self.W_list,
                                          kappa_list=self.kappa_list)

        ####### Initialization of Variational Parameters (q_u_means = \mu, q_u_chols = lower_triang(S))  #######
        self.q_u_means = Param(
            'm_u',
            0 * np.random.randn(self.num_inducing, self.num_latent_funcs) +
            0 * np.tile(np.random.randn(1, self.num_latent_funcs),
                        (self.num_inducing, 1)))
        chols = choleskies.triang_to_flat(
            np.tile(
                np.eye(self.num_inducing)[None, :, :],
                (self.num_latent_funcs, 1, 1)))
        self.q_u_chols = Param('L_u', chols)

        #-----------------------------#   LINKS FOR OPTIMIZABLE PARAMETERS     #---------------------------------------#

        ####### Linking and Un-linking of parameters and hyperaparameters (for ParamZ optimizer)  #######
        self.unlink_parameter(
            self.kern)  # Unlink SparseGP default param kernel
        self.link_parameter(self.Z, index=0)
        self.link_parameter(self.q_u_means)
        self.link_parameters(self.q_u_chols)
        [self.link_parameter(kern_q)
         for kern_q in kern_list]  # link all kernels
        [self.link_parameter(B_q) for B_q in self.B_list]

        ####### EXTRA. Auxiliary variables  #######
        self.vem_step = True  # [True=VE-step, False=VM-step]
        self.ve_count = 0
        self.elbo = np.zeros((1, 1))
Example #11
    def __init__(self,
                 X,
                 Y,
                 Z,
                 kern_list,
                 likelihood,
                 Y_metadata,
                 name='SVMOGP',
                 batch_size=None,
                 non_chained=True):

        self.batch_size = batch_size
        self.kern_list = kern_list
        self.likelihood = likelihood
        self.Y_metadata = Y_metadata

        self.num_inducing = Z.shape[0]  # M
        self.num_latent_funcs = len(kern_list)  # Q
        self.num_output_funcs = likelihood.num_output_functions(Y_metadata)

        if (not non_chained):
            assert self.num_output_funcs == self.num_latent_funcs, "we need a latent function per likelihood parameter"

        if non_chained:
            self.W_list, self.kappa_list = util.random_W_kappas(
                self.num_latent_funcs, self.num_output_funcs, rank=1)
        else:
            self.W_list, self.kappa_list = util.Chained_W_kappas(
                self.num_latent_funcs, self.num_output_funcs, rank=1)

        self.Xmulti = X
        self.Ymulti = Y
        self.iAnnMulti = Y_metadata['iAnn']

        # Batch the data
        self.Xmulti_all, self.Ymulti_all, self.iAnn_all = X, Y, Y_metadata[
            'iAnn']
        if batch_size is None:
            #self.stochastic = False
            Xmulti_batch, Ymulti_batch, iAnnmulti_batch = X, Y, Y_metadata[
                'iAnn']
        else:
            # Makes a climin slicer to make drawing minibatches much quicker
            #self.stochastic = False   #"This was True as Pablo had it"
            self.slicer_list = [
                draw_mini_slices(Xmulti_task.shape[0], self.batch_size)
                for Xmulti_task in self.Xmulti
            ]
            Xmulti_batch, Ymulti_batch, iAnnmulti_batch = self.new_batch()
            self.Xmulti, self.Ymulti, self.iAnnMulti = Xmulti_batch, Ymulti_batch, iAnnmulti_batch
            self.Y_metadata.update(iAnn=iAnnmulti_batch)

        # Initialize inducing points Z
        #Z = kmm_init(self.X_all, self.num_inducing)
        self.Xdim = Z.shape[1]
        Z = np.tile(Z, (1, self.num_latent_funcs))

        inference_method = SVMOGPInf()

        super(SVMOGP, self).__init__(X=Xmulti_batch[0][1:10],
                                     Y=Ymulti_batch[0][1:10],
                                     Z=Z,
                                     kernel=kern_list[0],
                                     likelihood=likelihood,
                                     mean_function=None,
                                     X_variance=None,
                                     inference_method=inference_method,
                                     Y_metadata=Y_metadata,
                                     name=name,
                                     normalizer=False)

        self.unlink_parameter(
            self.kern)  # Unlink SparseGP default param kernel

        _, self.B_list = util.LCM(input_dim=self.Xdim,
                                  output_dim=self.num_output_funcs,
                                  rank=1,
                                  kernels_list=self.kern_list,
                                  W_list=self.W_list,
                                  kappa_list=self.kappa_list)

        # Set-up optimization parameters: [Z, m_u, L_u]
        self.q_u_means = Param(
            'm_u',
            0.0 * np.random.randn(self.num_inducing, self.num_latent_funcs) +
            0.0 * np.tile(np.random.randn(1, self.num_latent_funcs),
                          (self.num_inducing, 1)))
        chols = choleskies.triang_to_flat(
            np.tile(
                np.eye(self.num_inducing)[None, :, :],
                (self.num_latent_funcs, 1, 1)))
        self.q_u_chols = Param('L_u', chols)

        self.link_parameter(self.Z, index=0)
        self.link_parameter(self.q_u_means)
        self.link_parameters(self.q_u_chols)
        [self.link_parameter(kern_q)
         for kern_q in kern_list]  # link all kernels
        [self.link_parameter(B_q) for B_q in self.B_list]

        self.vem_step = True  # [True=VE-step, False=VM-step]
        self.ve_count = 0
        self.elbo = np.zeros((1, 1))
        self.index_VEM = 0  #this is a variable to index correctly the self.elbo when using VEM
        self.Gauss_Newton = False  #This is a flag for using the Gauss-Newton approximation when dL_dV is needed
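util.LCM and random_W_kappas above set up a linear model of coregionalisation: each latent function q mixes into the output functions through B_q = W_q W_q^T + diag(kappa_q), with W_q of rank 1 here. A sketch of that construction with made-up sizes; only the arithmetic is shown, the helper names belong to the library.

import numpy as np

Q, J = 3, 4                                                 # latent functions and output functions
rng = np.random.default_rng(5)
W_list = [rng.standard_normal((J, 1)) for _ in range(Q)]    # rank-1 mixing weights per latent function
kappa_list = [0.1 * np.ones(J) for _ in range(Q)]           # per-output independent variances
B_list = [W @ W.T + np.diag(kappa) for W, kappa in zip(W_list, kappa_list)]
assert all(np.all(np.linalg.eigvalsh(B) > 0) for B in B_list)   # each B_q is positive definite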
Example #12
    def calculate_gradients(self, q_U, p_U_new, p_U_old, p_U_var, q_F, VE_dm, VE_dv, Ntask, M, Q, D, f_index, d_index,q):
        """
        Calculates gradients of the Log-marginal distribution p(Y) wrt variational
        parameters mu_q, S_q
        """
        # Algebra for q(u):
        m_u = q_U.mu_u.copy()
        L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
        S_u = np.empty((Q, M, M))
        [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

        S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[q, :, :]))
        if np.any(np.isinf(S_qi)):
            raise ValueError("Sqi: Cholesky representation unstable")

        # Algebra for p(u)
        Kuu_new = p_U_new.Kuu.copy()
        Luu_new = p_U_new.Luu.copy()
        Kuui_new = p_U_new.Kuui.copy()

        Kuu_old = p_U_old.Kuu.copy()
        Luu_old = p_U_old.Luu.copy()
        Kuui_old = p_U_old.Kuui.copy()

        Mu_var = p_U_var.Mu.copy()
        Kuu_var = p_U_var.Kuu.copy()
        Luu_var = p_U_var.Luu.copy()
        Kuui_var = p_U_var.Kuui.copy()


        # KL Terms
        dKLnew_dmu_q = np.dot(Kuui_new[q,:,:], m_u[:, q, None])
        dKLnew_dS_q = 0.5 * (Kuui_new[q,:,:] - S_qi)

        dKLold_dmu_q = np.dot(Kuui_old[q,:,:], m_u[:, q, None])
        dKLold_dS_q = 0.5 * (Kuui_old[q,:,:] - S_qi)

        dKLvar_dmu_q = np.dot(Kuui_var[q,:,:], (m_u[:, q, None] - Mu_var[q, :, :])) # important!! (Eq. 69 MCB)
        dKLvar_dS_q = 0.5 * (Kuui_var[q,:,:] - S_qi)

        dKLnew_dKqq = 0.5 * Kuui_new[q,:,:] - 0.5 * Kuui_new[q,:,:].dot(S_u[q, :, :]).dot(Kuui_new[q,:,:]) \
                   - 0.5 * np.dot(Kuui_new[q,:,:],np.dot(m_u[:, q, None],m_u[:, q, None].T)).dot(Kuui_new[q,:,:].T)

        dKLold_dKqq = 0.5 * Kuui_old[q,:,:] - 0.5 * Kuui_old[q,:,:].dot(S_u[q, :, :]).dot(Kuui_old[q,:,:]) \
                   - 0.5 * np.dot(Kuui_old[q,:,:],np.dot(m_u[:, q, None],m_u[:, q, None].T)).dot(Kuui_old[q,:,:].T)

        #dKLvar_dKqq = 0.5 * Kuui_var[q,:,:] - 0.5 * Kuui_var[q,:,:].dot(S_u[q, :, :]).dot(Kuui_var[q,:,:]) \
        #           - 0.5 * np.dot(Kuui_var[q,:,:],np.dot(m_u[:, q, None],m_u[:, q, None].T)).dot(Kuui_var[q,:,:].T) \
        #            + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(m_u[:,q,None], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T) \
        #            + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(Mu_var[q,:,:], m_u[:,q,None].T)).dot(Kuui_var[q,:,:].T) \
        #              - 0.5 * np.dot(Kuui_var[q,:,:],np.dot(Mu_var[q,:,:], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T)


        #KLvar += 0.5 * np.sum(Kuui_var[q, :, :] * S_u[q, :, :]) \
        #             + 0.5 * np.dot((Mu_var[q, :, :] - m_u[:, q, None]).T, np.dot(Kuui_var[q, :, :], (Mu_var[q, :, :] - m_u[:, q, None]))) \
        #             - 0.5 * M \
        #             + 0.5 * 2. * np.sum(np.log(np.abs(np.diag(Luu_var[q, :, :])))) \
        #             - 0.5 * 2. * np.sum(np.log(np.abs(np.diag(L_u[q, :, :]))))

        #

        # VE Terms
        dVE_dmu_q = np.zeros((M, 1))
        dVE_dS_q = np.zeros((M, M))
        dVE_dKqq = np.zeros((M, M))
        dVE_dKqd = []
        dVE_dKdiag = []

        for d, q_fd in enumerate(q_F):
            Nt = Ntask[f_index[d]]
            dVE_dmu_q += np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:,d_index[d]])[:, None]
            Adv = q_fd.Afdu[q,:,:].T * VE_dv[f_index[d]][:,d_index[d],None].T
            Adv = np.ascontiguousarray(Adv)
            AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
            dVE_dS_q += AdvA

            # Derivatives dKuquq
            tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui_new[q,:,:])
            dVE_dKqq += AdvA - tmp_dv - tmp_dv.T
            Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:,d_index[d],None])
            dVE_dKqq += - np.dot(Adm, np.dot(Kuui_new[q,:,:], m_u[:, q, None]).T)

            # Derivatives dKuqfd
            tmp = np.dot(S_u[q, :, :], Kuui_new[q,:,:])
            tmp = 2. * (tmp - np.eye(M))
            dve_kqd = np.dot(np.dot(Kuui_new[q,:,:], m_u[:, q, None]), VE_dm[f_index[d]][:,d_index[d],None].T)
            dve_kqd += np.dot(tmp.T, Adv)
            dVE_dKqd.append(dve_kqd)

            # Derivatives dKdiag
            dVE_dKdiag.append(VE_dv[f_index[d]][:,d_index[d]])

        dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

        # Derivatives of variational parameters
        dL_dmu_q = dVE_dmu_q - dKLnew_dmu_q + dKLold_dmu_q - dKLvar_dmu_q
        dL_dS_q = dVE_dS_q - dKLnew_dS_q + dKLold_dS_q - dKLvar_dS_q

        # Derivatives of prior hyperparameters
        # if using Zgrad, dL_dKqq = dVE_dKqq - dKLnew_dKqq + dKLold_dKqq - dKLvar_dKqq
        # otherwise for hyperparameters: dL_dKqq = dVE_dKqq - dKLnew_dKqq
        dL_dKqq = dVE_dKqq - dKLnew_dKqq #+ dKLold_dKqq - dKLvar_dKqq # dKLold_dKqq only for Zgrad; dKLvar_dKqq to be done (for Zgrad)
        dL_dKdq = dVE_dKqd
        dL_dKdiag = dVE_dKdiag

        # Pass S_q gradients to its low-triangular representation L_q
        chol_u = q_U.chols_u.copy()
        L_q = choleskies.flat_to_triang(chol_u[:,q:q+1])
        dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None,:,:], L_q)])
        dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

        # Posterior
        posterior_q = Posterior(mean=m_u[:, q, None], cov=S_u[q, :, :], K=Kuu_new[q,:,:], prior_mean=np.zeros(m_u[:, q, None].shape))

        return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag