Example #1
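# NOTE: this snippet assumes the surrounding pyGPs-style module context, roughly
# (the exact import paths are an assumption, not shown in the snippet):
#   import numpy as np
#   import src.Tools.general
# plus package-internal helpers such as solve_chol, brentmin, postStruct,
# dnlzStruct, _Psi_line and _logdetA defined elsewhere in the same package.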
def infLaplace(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """ Laplace approximation to the posterior Gaussian process.
     The function takes a specified covariance function (see kernels.py) and
     likelihood function (see likelihoods.py).
    """

    tol = 1e-6  # tolerance for when to stop the Newton iterations
    smax = 2  # line search parameters (smax, Nline, thr)
    Nline = 20
    thr = 1e-4
    maxit = 20  # max number of Newton steps in f

    inffunc = "inferences.infLaplace"

    K = src.Tools.general.feval(covfunc, hyp.cov, x)  # evaluate the covariance
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate the mean vector

    n, D = x.shape

    Psi_old = np.inf  # make sure while loop starts by the largest old objective val
    if "last_alpha" not in infLaplace.__dict__:  # find a good starting point for alpha and f
        alpha = np.zeros((n, 1))
        f = np.dot(K, alpha) + m  # start at the mean if sizes do not match
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = -lp.sum()
    else:
        alpha = infLaplace.last_alpha  # alpha remembered from the previous call
        f = np.dot(K, alpha) + m  # try last one
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum()  # objective for last alpha
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, m, None, inffunc, None, 1)
        Psi_def = -vargout[0]  # objective for default init f==m
        if Psi_def < Psi_new:  # if default is better, we use it
            alpha = np.zeros((n, 1))
            f = np.dot(K, alpha) + m
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
            lp = vargout[0]
            dlp = vargout[1]
            d2lp = vargout[2]
            W = -d2lp
            Psi_new = -lp.sum()

    isWneg = np.any(W < 0)  # flag for negative values of W; this happens e.g. for the Student's t likelihood
    it = 0

    while (Psi_old - Psi_new > tol) and it < maxit:  # begin Newton
        Psi_old = Psi_new
        it += 1
        if isWneg:  # stabilise the Newton direction in case W has negative values
            W = np.maximum(W, 0)  # stabilise the Hessian to guarantee positive definiteness
            tol = 1e-10  # increase accuracy to also get the derivatives right
            # Vanhatalo et al., "GPR with Student's t likelihood", NIPS 2009, use a more
            # conservative strategy than we do, equivalent to the two lines below:
            # nu = np.exp(hyp.lik[0])       # degrees-of-freedom hyperparameter
            # W = W + 2.0/(nu + 1)*dlp**2   # add a ridge according to Vanhatalo

        sW = np.sqrt(W)
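        # Newton target: alpha* = (I + W*K)^{-1} * b with b = W*(f-m) + dlp; using
        # B = I + sW*K*sW with Cholesky factor L, the matrix inversion lemma gives
        # (I + W*K)^{-1}*b = b - sW*(B^{-1}*(sW*(K*b))), so dalpha = alpha* - alpha.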
        L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T
        b = W * (f - m) + dlp
        dalpha = b - sW * solve_chol(L, sW * np.dot(K, b)) - alpha
        vargout = brentmin(0, smax, Nline, thr, _Psi_line, 4, dalpha, alpha, hyp, K, m, likfunc, y, inffunc)
        s = vargout[0]
        Psi_new = vargout[1]
        Nfun = vargout[2]
        alpha = vargout[3]
        f = vargout[4]
        dlp = vargout[5]
        W = vargout[6]
        isWneg = np.any(W < 0)

    infLaplace.last_alpha = alpha  # remember for next call (stored as a function attribute)
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 4)
    lp = vargout[0]
    dlp = vargout[1]
    d2lp = vargout[2]
    d3lp = vargout[3]

    W = -d2lp
    isWneg = np.any(W < 0)
    post = postStruct()
    post.alpha = alpha  # return the posterior parameters
    post.sW = np.sqrt(np.abs(W)) * np.sign(W)  # preserve sign in case of negative
    if isWneg:
        [ldA, iA, post.L] = _logdetA(K, W, 3)
        nlZ = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum() + ldA / 2.0
        nlZ = nlZ[0]
    else:
        sW = post.sW
        post.L = np.linalg.cholesky(np.eye(n) + np.dot(sW, sW.T) * K).T
        # nlZ = alpha'*(f-m)/2 - sum(lp) + log|B|/2, where log|B|/2 = sum(log(diag(L)))
        nlZ = np.dot(alpha.T, (f - m)) / 2.0 + (np.log(np.diag(post.L)) - np.reshape(lp, (lp.shape[0],))).sum()
        nlZ = nlZ[0]

    if nargout > 2:  # do we want derivatives?
        dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
        if isWneg:  # switch between Cholesky and LU decomposition mode
            Z = -post.L  # inv(K+inv(W))
            g = np.atleast_2d((iA * K).sum(axis=1)).T / 2  # deriv. of ln|B| wrt W; g = diag(inv(inv(K)+diag(W)))/2
        else:
            Z = np.tile(sW, (1, n)) * solve_chol(
                post.L, np.diag(np.reshape(sW, (sW.shape[0],)))
            )  # sW*inv(B)*sW=inv(K+inv(W))
            C = np.linalg.solve(post.L.T, np.tile(sW, (1, n)) * K)  # deriv. of ln|B| wrt W
            g = np.atleast_2d((np.diag(K) - (C ** 2).sum(axis=0).T)).T / 2.0  # g = diag(inv(inv(K)+W))/2

        dfhat = g * d3lp  # deriv. of nlZ wrt. fhat
        for ii in range(len(hyp.cov)):  # covariance hypers
            dK = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)
            dnlZ.cov[ii] = (Z * dK).sum() / 2.0 - np.dot(alpha.T, np.dot(dK, alpha)) / 2.0  # explicit part
            b = np.dot(dK, dlp)
            dnlZ.cov[ii] -= np.dot(dfhat.T, b - np.dot(K, np.dot(Z, b)))[0, 0]  # implicit part

        for ii in range(len(hyp.lik)):  # likelihood hypers
            [lp_dhyp, dlp_dhyp, d2lp_dhyp] = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, ii, 3)
            dnlZ.lik[ii] = -np.dot(g.T, d2lp_dhyp) - lp_dhyp.sum()  # explicit part
            b = np.dot(K, dlp_dhyp)
            dnlZ.lik[ii] -= np.dot(dfhat.T, b - np.dot(K, np.dot(Z, b)))[0, 0]  # implicit part

        for ii in range(len(hyp.mean)):  # mean hypers
            dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
            dnlZ.mean[ii] = -np.dot(alpha.T, dm)  # explicit part
            dnlZ.mean[ii] -= np.dot(dfhat.T, dm - np.dot(K, np.dot(Z, dm)))[0, 0]  # implicit part

        vargout = [post, nlZ, dnlZ]
    else:
        vargout = [post, nlZ]

    return vargout
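A minimal usage sketch for infLaplace (the call shape comes from the signature
above; the hyp container and the mean/cov/lik function handles are assumptions
following the conventions visible in the code):

# hyp must expose .mean, .cov and .lik hyperparameter lists
# post, nlZ, dnlZ = infLaplace(hyp, meanfunc, covfunc, likfunc, x, y, nargout=3)
# post.alpha, post.sW and post.L parametrise the approximate Gaussian posterior;
# nlZ is the approximate negative log marginal likelihood, and dnlZ holds its
# derivatives w.r.t. hyp.cov, hyp.lik and hyp.mean.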
Example #2
def infFITC_Laplace(hyp, meanfunc, covfunc, likfunc, x, y, nargout=1):
    """ infFITC_Laplace - FITC-Laplace approximation to the posterior Gaussian process. The function is
     equivalent to infLaplace with the covariance function:
    
       Kt = Q + G; G = diag(g); g = diag(K-Q);  Q = Ku' * inv(Kuu + snu2 * eye(nu)) * Ku
    
     where Ku and Kuu are covariances w.r.t. the inducing inputs xu, and
     snu2 = sn2/1e6 is the noise of the inducing inputs. We fix the standard
     deviation of the inducing inputs snu to one per mil of the measurement
     noise's standard deviation sn. In case of a likelihood without a noise
     parameter sn2, we simply use snu2 = 1e-6.
    
     The implementation exploits the Woodbury matrix identity
     inv(Kt) = inv(G) - inv(G) * Ku' * inv(Kuu+Ku * inv(G) * Ku') * Ku * inv(G)
     in order to be applicable to large datasets. The computational complexity
     is O(n nu^2) where n is the number of data points x and nu the number of
     inducing inputs in xu.
     The posterior N(f|h,Sigma) is given by h = m+mu with mu = nn + P' * gg and
     Sigma = inv(inv(K)+diag(W)) = diag(d) + P' * R0' * R' * R * R0 * P.
                 
     The function takes a specified covariance function (see kernels.py) and
     likelihood function (see likelihoods.py), and is designed to be used with
     gp.py in conjunction with covFITC.
    """

    cov1 = covfunc[0]
    if cov1 != ["kernels.covFITC"]:  # check that the covariance is covFITC
        raise Exception("Only covFITC supported.")

    tol = 1e-6  # tolerance for when to stop the Newton iterations
    smax = 2  # line search parameters (smax, Nline, thr)
    Nline = 100
    thr = 1e-4
    maxit = 20  # max number of Newton steps in f

    inffunc = "inferences.infLaplace"

    diagK, Kuu, Ku = src.Tools.general.feval(covfunc, hyp.cov, x)  # evaluate the covariance
    m = src.Tools.general.feval(meanfunc, hyp.mean, x)  # evaluate the mean vector

    if hyp.lik:  # hard coded inducing inputs noise
        sn2 = np.exp(2.0 * hyp.lik[-1])
        snu2 = 1.0e-6 * sn2  # similar to infFITC
    else:
        snu2 = 1.0e-6

    n, D = x.shape
    nu = Kuu.shape[0]

    rot180 = lambda A: np.rot90(np.rot90(A))  # little helper function
    chol_inv = lambda A: np.linalg.solve(rot180(np.linalg.cholesky(rot180(A))), np.eye(nu))  # chol(inv(A))
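    # chol_inv(A) returns an upper-triangular R with R.T*R = inv(A): rotating A by
    # 180 degrees turns numpy's lower Cholesky factor of the rotated matrix into an
    # upper-triangular U with U*U.T = A after rotating back, and R = inv(U).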

    R0 = chol_inv(Kuu + snu2 * np.eye(nu))  # initial R, used for refresh O(nu^3)
    V = np.dot(R0, Ku)
    d0 = diagK - np.array([(V * V).sum(axis=0)]).T  # initial d, needed

    Psi_old = np.inf  # make sure while loop starts by the largest old objective val
    if "last_alpha" not in infFITC_Laplace.__dict__:  # find a good starting point for alpha and f
        alpha = np.zeros((n, 1))
        f = _mvmK(alpha, V, d0) + m  # start at the mean if sizes do not match
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = -lp.sum()
    else:
        alpha = infFITC_Laplace.last_alpha  # alpha remembered from the previous call
        f = _mvmK(alpha, V, d0) + m  # try last one
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
        lp = vargout[0]
        dlp = vargout[1]
        d2lp = vargout[2]
        W = -d2lp
        Psi_new = np.dot(alpha.T, (f - m)) / 2.0 - lp.sum()  # objective for last alpha
        vargout = src.Tools.general.feval(likfunc, hyp.lik, y, m, None, inffunc, None, 1)
        Psi_def = -vargout[0]  # objective for default init f==m
        if Psi_def < Psi_new:  # if default is better, we use it
            alpha = np.zeros((n, 1))
            f = _mvmK(alpha, V, d0) + m
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 3)
            lp = vargout[0]
            dlp = vargout[1]
            d2lp = vargout[2]
            W = -d2lp
            Psi_new = -lp.sum()

    isWneg = np.any(W < 0)  # flag for negative values of W; this happens e.g. for the Student's t likelihood
    it = 0

    while (Psi_old - Psi_new > tol) and it < maxit:  # begin Newton
        Psi_old = Psi_new
        it += 1
        if isWneg:  # stabilise the Newton direction in case W has negative values
            W = np.maximum(W, 0)  # stabilise the Hessian to guarantee positive definiteness
            tol = 1e-8  # increase accuracy to also get the derivatives right
            # Vanhatalo et al., "GPR with Student's t likelihood", NIPS 2009, use a more
            # conservative strategy than we do, equivalent to the two lines below:
            # nu = np.exp(hyp.lik[0])       # degrees-of-freedom hyperparameter
            # W = W + 2.0/(nu + 1)*dlp**2   # add a ridge according to Vanhatalo

        b = W * (f - m) + dlp
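        # Newton target in O(n*nu^2): alpha* = (I + W*Kt)^{-1}*b with Kt = V'*V + diag(d0);
        # with inv(diag(1 + W*d0)) = diag(dd), the matrix inversion lemma gives
        # alpha* = dd*b - (W*dd) * V'*inv(eye(nu) + V*diag(W*dd)*V')*(V*(dd*b)),
        # where the inner nu-by-nu inverse is applied through its factor RV.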
        dd = 1 / (1 + W * d0)
        RV = np.dot(chol_inv(np.eye(nu) + np.dot(V * np.tile((W * dd).T, (nu, 1)), V.T)), V)
        dalpha = dd * b - (W * dd) * np.dot(RV.T, np.dot(RV, (dd * b))) - alpha  # Newt dir + line search
        vargout = brentmin(0, smax, Nline, thr, _Psi_lineFITC, 4, dalpha, alpha, hyp, V, d0, m, likfunc, y, inffunc)
        s = vargout[0]
        Psi_new = vargout[1]
        Nfun = vargout[2]
        alpha = vargout[3]
        f = vargout[4]
        dlp = vargout[5]
        W = vargout[6]

        isWneg = np.any(W < 0)

    infFITC_Laplace.last_alpha = alpha  # remember for next call (stored as a function attribute)
    vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, None, 4)
    lp = vargout[0]
    dlp = vargout[1]
    d2lp = vargout[2]
    d3lp = vargout[3]

    W = -d2lp
    isWneg = np.any(W < 0)
    post = postStruct()
    post.alpha = np.dot(R0.T, np.dot(V, alpha))  # return the posterior parameters
    post.sW = np.sqrt(np.abs(W)) * np.sign(W)  # preserve sign in case of negative
    dd = 1 / (1 + d0 * W)  # temporary variable O(n)
    A = np.eye(nu) + np.dot(V * np.tile((W * dd).T, (nu, 1)), V.T)  # temporary variable O(n*nu^2)
    R0tV = np.dot(R0.T, V)
    B = R0tV * np.tile((W * dd).T, (nu, 1))  # temporary variables O(n*nu^2)
    post.L = -np.dot(B, R0tV.T)  # L = -R0'*V*inv(Kt+diag(1./ttau))*V'*R0, first part
    if np.any(1 + d0 * W < 0):
        # An alternative would be to complete post.L and return nlZ = np.nan with
        # zeroed derivatives instead of raising, i.e. roughly:
        # B = np.dot(B, V.T); post.L += np.dot(np.dot(B, np.linalg.inv(A)), B.T)
        # nlZ = np.nan, dnlZ with all derivative entries set to zero
        raise Exception("W is too negative; nlZ and dnlZ cannot be computed.")

    nlZ = (
        np.dot(alpha.T, (f - m)) / 2.0
        - lp.sum()
        - np.log(dd).sum() / 2.0
        + np.log(np.diag(np.linalg.cholesky(A).T)).sum()
    )
    RV = np.dot(chol_inv(A), V)
    RVdd = RV * np.tile((W * dd).T, (nu, 1))  # RVdd needed for dnlZ
    B = np.dot(B, RV.T)
    post.L += np.dot(B, B.T)

    if nargout > 2:  # do we want derivatives?
        dnlZ = dnlzStruct(hyp)  # allocate space for derivatives
        [d, P, R] = _fitcRefresh(d0, Ku, R0, V, W)  # g = diag(inv(inv(K)+W))/2
        g = d / 2 + 0.5 * np.atleast_2d((np.dot(np.dot(R, R0), P) ** 2).sum(axis=0)).T
        t = W / (1 + W * d0)

        dfhat = g * d3lp  # deriv. of nlZ wrt. fhat: dfhat=diag(inv(inv(K)+W)).*d3lp/2
        for ii in range(len(hyp.cov)):  # covariance hypers
            ddiagK, dKuu, dKu = src.Tools.general.feval(covfunc, hyp.cov, x, None, ii)  # eval cov derivatives
            dA = 2.0 * dKu.T - np.dot(R0tV.T, dKuu)  # dQ = dA*R0tV
            w = np.atleast_2d((dA * R0tV.T).sum(axis=1)).T
            v = ddiagK - w  # w = diag(dQ); v = diag(dK)-diag(dQ);
            dnlZ.cov[ii] = np.dot(ddiagK.T, t) - np.dot((RVdd * RVdd).sum(axis=0), v)  # explicit part
            dnlZ.cov[ii] -= (np.dot(RVdd, dA) * np.dot(RVdd, R0tV.T)).sum()  # explicit part
            dnlZ.cov[ii] = (
                0.5 * dnlZ.cov[ii] - np.dot(alpha.T, np.dot(dA, np.dot(R0tV, alpha)) + v * alpha) / 2.0
            )  # explicit
            b = np.dot(dA, np.dot(R0tV, dlp)) + v * dlp  # b-K*(Z*b) = inv(eye(n)+K*diag(W))*b
            KZb = _mvmK(_mvmZ(b, RVdd, t), V, d0)
            dnlZ.cov[ii] -= np.dot(dfhat.T, (b - KZb))  # implicit part

        for ii in range(len(hyp.lik)):  # likelihood hypers
            vargout = src.Tools.general.feval(likfunc, hyp.lik, y, f, None, inffunc, ii, 3)
            lp_dhyp = vargout[0]
            dlp_dhyp = vargout[1]
            d2lp_dhyp = vargout[2]
            dnlZ.lik[ii] = -np.dot(g.T, d2lp_dhyp) - lp_dhyp.sum()  # explicit part
            b = _mvmK(dlp_dhyp, V, d0)  # implicit part
            dnlZ.lik[ii] -= np.dot(dfhat.T, b - _mvmK(_mvmZ(b, RVdd, t), V, d0))
            if ii == len(hyp.lik) - 1:
                # since snu2 is a fixed fraction of sn2, there is a covariance-like term in the derivative as well
                snu = np.sqrt(snu2)
                T = chol_inv(Kuu + snu2 * np.eye(nu))
                T = np.dot(T.T, np.dot(T, snu * Ku))
                st2 = np.array([(T * T).sum(axis=0)]).T  # do not overwrite t = W/(1+W*d0); it is reused below
                z = np.dot(alpha.T, np.dot(T.T, np.dot(T, alpha)) - st2 * alpha) - np.dot(
                    np.array([(RVdd * RVdd).sum(axis=0)]), st2
                )
                z += (np.dot(RVdd, T.T) ** 2).sum()
                b = (st2 * dlp - np.dot(T.T, np.dot(T, dlp))) / 2.0
                KZb = _mvmK(_mvmZ(b, RVdd, t), V, d0)
                z -= np.dot(dfhat.T, b - KZb)
                dnlZ.lik[ii] += z

        for ii in range(len(hyp.mean)):  # mean hypers
            dm = src.Tools.general.feval(meanfunc, hyp.mean, x, ii)
            dnlZ.mean[ii] = -np.dot(alpha.T, dm)  # explicit part
            Zdm = _mvmZ(dm, RVdd, t)
            dnlZ.mean[ii] -= np.dot(dfhat.T, (dm - _mvmK(Zdm, V, d0)))  # implicit part

        vargout = [post, nlZ[0, 0], dnlZ]
    else:
        vargout = [post, nlZ[0, 0]]

    return vargout
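The Woodbury identity quoted in the infFITC_Laplace docstring can be checked
numerically. The sketch below is standalone and assumes only numpy plus a
hypothetical unit-hyperparameter squared-exponential kernel; the snu2 ridge is
made explicit in the inner matrix:

import numpy as np

rng = np.random.RandomState(0)
n, nu = 8, 3
X, Xu = rng.randn(n, 2), rng.randn(nu, 2)  # data and inducing inputs

def se_kernel(A, B):
    # squared-exponential kernel with unit signal variance and lengthscale
    return np.exp(-0.5 * ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1))

Kuu, Ku, K = se_kernel(Xu, Xu), se_kernel(Xu, X), se_kernel(X, X)
snu2 = 1e-6  # inducing-input noise, as in the docstring
Q = np.dot(Ku.T, np.linalg.solve(Kuu + snu2 * np.eye(nu), Ku))
G = np.diag(np.diag(K - Q)) + 1e-3 * np.eye(n)  # jitter keeps G invertible
Kt = Q + G

iG = np.linalg.inv(G)
inner = Kuu + snu2 * np.eye(nu) + np.dot(Ku, np.dot(iG, Ku.T))
iKt = iG - np.dot(iG, np.dot(Ku.T, np.linalg.solve(inner, np.dot(Ku, iG))))
assert np.allclose(iKt, np.linalg.inv(Kt))  # Woodbury identity holds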