Example #1
def var_bound(data, model, query, topicDistOverride=None):
    '''
    Determines the variational bounds.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, corpusTopicDist, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.corpusTopicDist, model.dtype

    tops = topicDistOverride \
        if topicDistOverride is not None \
        else topicDists(query)

    # Initialize z matrix if necessary
    W = data.words
    D, T = W.shape

    wordLikely = sparseScalarProductOfSafeLnDot(data.words, tops, wordDists(model)).sum()
    topicLikely = topicMeans.dot(fns.digamma(corpusTopicDist) - fns.digamma(corpusTopicDist.sum()))


    # Expected joint
    like = W.dot(safe_log(wordDists).T) # D*K
    like += corpusTopicDist[np.newaxis,:]
    like *= safe_log(topicMeans)

    # Entropy
    ent = (-topicMeans * safe_log(topicMeans)).sum()

    return like.sum() + ent
Example #2
def var_bound(data, model, query):
    '''
    Total nonsense in this case, retained just so all the other functions
    continue to work.
    '''
    # Unpack the structs, for ease of access and efficiency
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    # Initialize z matrix if necessary
    W = data.words
    D, T = W.shape

    #  ln p(x) >= sum_k q(z=k) * (ln p(x|z=k, phi) + ln p(z=k)) + H[q]

    # Expected joint
    like = W.dot(safe_log(wordDists).T)  # D*K
    like *= safe_log(topicMeans)

    # Entropy
    ent = (-topicMeans * safe_log(topicMeans)).sum()

    return like.sum() + ent
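For reference, the comment above is the standard single-topic ELBO obtained from Jensen's inequality (the docstring's "nonsense" caveat refers to the code, not to this bound):

\[
\ln p(x) \;\ge\; \sum_{k=1}^{K} q(z{=}k)\,\bigl[\ln p(x \mid z{=}k, \phi_k) + \ln p(z{=}k)\bigr] \;+\; \mathrm{H}[q]
\]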
Example #3
def _sparseScalarProductOfSafeLnDot_py(A, B, C, out=None):
    '''
    Calculates A * log(B.dot(C)) element-wise, where A is a sparse matrix and
    B.dot(C) is clamped away from zero before the log is taken.
    
    Retains sparsity in the result, unlike the built-in operator
    
    Note the type of the return-value is the same as the type of
    the sparse matrix A. If this has an integral type, this will
    only provide integer-based multiplication.
    '''
    if WarnIfSlow:
        sys.stderr.write(
            "WARNING: Slow code path triggered (_sparseScalarProductOfSafeLnDot_py)\n"
        )

    if not (A.dtype == B.dtype and B.dtype == C.dtype and
            (out is None or C.dtype == out.dtype)):
        raise ValueError(
            "Inconsistent dtypes in the three matrices and possibly the out-param"
        )

    if out is None:
        out = A.copy()
    else:
        out.data[:] = A.data

    rhs = B.dot(C)
    rhs[rhs < sys.float_info.min] = sys.float_info.min
    out.data *= safe_log(rhs)[csr_indices(out.indptr, out.indices)]

    return out
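A minimal, self-contained illustration of the same sparsity-preserving pattern, written directly against scipy.sparse (the shapes and values below are made up for the example and are not part of the library above):

import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[1., 0., 2.],
                         [0., 3., 0.]]))          # sparse D x T counts
B = np.random.rand(2, 4)                          # dense D x K
C = np.random.rand(4, 3)                          # dense K x T

rhs = B.dot(C)                                    # dense D x T
rhs = np.maximum(rhs, np.finfo(rhs.dtype).tiny)   # clamp before taking the log

coo = A.tocoo()                                   # visit only the non-zero cells of A
out = csr_matrix((coo.data * np.log(rhs[coo.row, coo.col]),
                  (coo.row, coo.col)), shape=A.shape)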
Example #4
def log_likelihood_point(data: DataSet, model: ModelState, query: QueryState = None) -> float:
    wordDist = wordDists(model)

    # ln p(x|topic=k, word_dist) + ln p(topic=k) for all documents, for all k
    lls = data.words @ np.log(wordDist.T)
    if query is not None:
        topicDist = topicDists(query)
        lls += safe_log(topicDist)
    else:
        lls += safe_log(corpusTopicDist(model))[np.newaxis, :]

    # Safe Log-sum-exp (of topic-specific log likelihoods)
    max_lls = lls.max(axis=1)
    lls -= max_lls[:, np.newaxis]
    np.exp(lls, out=lls)

    lls = max_lls + np.log(lls.sum(axis=1))

    # Return corpus-total log likelihood
    return lls.sum()
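The max-shift manoeuvre above is the usual numerically safe log-sum-exp; here it is in isolation (an illustrative sketch, not part of the library):

import numpy as np

def log_sum_exp_rows(lls):
    # Row-wise log(sum(exp(lls))): shifting by the row maximum means the largest
    # term becomes exp(0) = 1, so nothing overflows.
    max_lls = lls.max(axis=1, keepdims=True)
    return (max_lls + np.log(np.exp(lls - max_lls).sum(axis=1, keepdims=True))).ravel()

# e.g. log_sum_exp_rows(np.array([[1000.0, 1001.0]])) ~= 1001.313 with no overflow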
Example #5
def iterate (iterations, D, K, T, \
             W_list, docLens, \
             topicPrior, vocabPrior, \
             z_dnk, topicDists, wordDists):

    raise ValueError("This implementation is no longer supported")
    totalItrs = 0
    epsilon = 0.01 / K
    oldWordDists = np.empty(wordDists.shape, wordDists.dtype)
    newWordDists = wordDists

    for _ in range(iterations):
        oldWordDists, newWordDists = newWordDists, oldWordDists
        lnWordDists = safe_log(oldWordDists, out=oldWordDists)
        newWordDists.fill(vocabPrior)

        for d in range(D):
            oldTopics = topicDists[d, :].copy()
            topicDists[d, :] = 1. / K
            lnWordProbs = lnWordDists[:, W_list[d, 0:docLens[d]]]

            innerItrs = 0
            while ((innerItrs < MaxInnerItrs) or (np.sum(np.abs(oldTopics - topicDists[d,:])) > epsilon)) \
            and (innerItrs < MaxInnerItrs):
                diTopic = fns.digamma(topicDists[d, :])
                z_dnk[:docLens[d], :] = lnWordProbs.T + diTopic[np.newaxis, :]

                # We've been working in log-space till now, before we go to true
                # probability space rescale so we don't underflow everywhere
                maxes = z_dnk.max(axis=1)
                z_dnk -= maxes[:, np.newaxis]
                np.exp(z_dnk, out=z_dnk)

                # Now normalize so probabilities sum to one
                sums = z_dnk.sum(axis=1)
                z_dnk /= sums[:, np.newaxis]

                # Now use it to infer the topic distribution
                topicDists[d, :] = topicPrior + np.sum(z_dnk[:docLens[d], :],
                                                       axis=0)
                topicDists[d, :] /= np.sum(topicDists[d, :])

                innerItrs += 1

            totalItrs += innerItrs
            # Update the vocabulary counts (hard to do efficiently with a list representation)
            for k in range(K):
                for n in range(docLens[d]):
                    newWordDists[k, W_list[d, n]] += z_dnk[n, k]
            newWordDists /= newWordDists.sum(axis=1)[:, np.newaxis]

    return totalItrs
Example #6
def sample_memberships(W, alpha, wordDists, memberships):
    _, K = memberships.shape

    priorNum = memberships.sum(axis=0) + alpha - 1
    prior = priorNum.copy()
    sample_dists = W.dot(safe_log(wordDists).T)  # d x k

    for d in range(W.shape[0]):
        priorNum -= memberships[d, :]
        prior[:] = priorNum
        prior /= priorNum.sum()

        sample_dists[d, :] += safe_log(prior)
        sample_dists[d, :] -= sample_dists[d, :].max()
        sample_dists[d, :] -= fns.logsumexp(sample_dists[d, :])

        np.exp(sample_dists[d, :], out=sample_dists[d, :])
        memberships[d, :] = rd.multinomial(1, sample_dists[d, :], size=1)

        priorNum += memberships[d, :]

    return memberships
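The same normalise-in-log-space-then-sample pattern in isolation (an illustrative sketch; it assumes, as the code above suggests, that rd is numpy.random):

import numpy as np

def sample_one_hot(log_scores, rng=np.random):
    # Shift by the maximum so exp() cannot overflow, renormalise, then draw once.
    log_scores = log_scores - log_scores.max()
    probs = np.exp(log_scores)
    probs /= probs.sum()
    return rng.multinomial(1, probs)              # one-hot vector of length K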
Example #7
def var_bound(data, modelState, queryState, z_dnk=None):
    '''
    Determines the variational bounds.
    '''
    # Unpack the structs, for ease of access and efficiency
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, _, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W = data.words
    D, T = W.shape
    maxN = docLens.max()
    if z_dnk is None:
        z_dnk = np.empty(shape=(maxN, K), dtype=dtype)

    wordDistsMatrix = wordDists(modelState)

    diWordDists = fns.digamma(wordDistsMatrix.copy()) - fns.digamma(
        wordDistsMatrix.sum(axis=1))[:, np.newaxis]
    lnWordDists = np.log(wordDistsMatrix)

    bound = 0

    # Expected Probability
    #

    # P(topics|topicPrior)
    diTopicDists = fns.digamma(topicDists) - fns.digamma(
        topicDists.sum(axis=1))[:, np.newaxis]
    ln_b_topic = fns.gammaln(topicPrior.sum()) - fns.gammaln(topicPrior).sum()
    bound += D * ln_b_topic \
           + np.sum((topicPrior - 1) * diTopicDists)

    # and its entropy
    ent = fns.gammaln(topicDists.sum(axis=1)).sum() - fns.gammaln(topicDists).sum() \
        + np.sum ((topicDists - 1) * diTopicDists)

    bound -= ent

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    # NOTE COPY AND PASTED FROM iterate_f32  / iterate_f64 (-ish)
    for d in range(D):
        lnWordProbs = lnWordDists[:, W_list[d, 0:docLens[d]]]
        diTopic = fns.digamma(topicDists[d, :])
        z_dnk[0:docLens[d], :] = lnWordProbs.T + diTopic[np.newaxis, :]

        # We've been working in log-space till now, before we go to true
        # probability space rescale so we don't underflow everywhere
        maxes = z_dnk.max(axis=1)
        z_dnk -= maxes[:, np.newaxis]
        np.exp(z_dnk, out=z_dnk)

        # Now normalize so probabilities sum to one
        sums = z_dnk.sum(axis=1)
        z_dnk /= sums[:, np.newaxis]
        #        z_dnk[docLens[d]:maxN,:] = 0 # zero probablities for words that don't exist

        # Now use to calculate  E[ln p(Z|topics), E[ln p(W|Z) and H[Z] in that order
        diTopic -= fns.digamma(np.sum(topicDists[d, :]))
        bound += np.sum(z_dnk * diTopic[np.newaxis, :])
        bound += np.sum(z_dnk[0:docLens[d], :].T *
                        diWordDists[:, W_list[d, 0:docLens[d]]])
        bound -= np.sum(z_dnk[0:docLens[d], :] *
                        safe_log(z_dnk[0:docLens[d], :]))

    # p(vocabDists|vocabPrior)

    ln_b_vocab = fns.gammaln(T * vocabPrior) - T * fns.gammaln(vocabPrior)
    bound += K * ln_b_vocab \
           + (vocabPrior - 1) * np.sum(diWordDists)

    # and its entropy
    ent = fns.gammaln(wordDistsMatrix.sum(axis=1)).sum() - fns.gammaln(wordDistsMatrix).sum() \
        + np.sum ((wordDistsMatrix - 1) * diWordDists)

    bound -= ent

    return bound
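The digamma terms above rely on the Dirichlet identity E[ln theta_k] = psi(alpha_k) - psi(sum_j alpha_j). A quick, self-contained Monte-Carlo check of that identity (illustrative only, not part of the library):

import numpy as np
from scipy.special import digamma

alpha = np.array([2.0, 3.0, 5.0])
theta = np.random.dirichlet(alpha, size=200000)
print(np.log(theta).mean(axis=0))               # Monte-Carlo estimate of E[ln theta_k]
print(digamma(alpha) - digamma(alpha.sum()))    # the closed form used in var_bound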
Example #8
def var_bound(data, model, query, z_dnk=None):
    '''
    Determines the variational bounds.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    K, topicPrior, wordPrior, wordDists, weights, negCount, reg, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.weights, model.pseudoNegCount, model.regularizer, model.dtype
    docLens, topicDists = \
        query.docLens, query.topicDists

    W, X = data.words, data.links
    D, T = W.shape
    minNonZero = 1E-300 if dtype is np.float64 else 1E-30

    # Perform the digamma transform for E[ln \theta] etc.
    topicDists = topicDists.copy()
    diTopicDists = fns.digamma(topicDists[:, :K])
    diSumTopicDists = fns.digamma(topicDists[:, :K].sum(axis=1))
    diWordDists = fns.digamma(model.wordDists)
    diSumWordDists = fns.digamma(model.wordDists.sum(axis=1))

    # E[ln p(topics|topicPrior)] according to q(topics)
    #
    prob_topics = D * (fns.gammaln(topicPrior[:K].sum()) - fns.gammaln(topicPrior[:K]).sum()) \
        + np.sum((topicPrior[:K] - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis]))

    bound += prob_topics

    # and its entropy
    ent_topics = _dirichletEntropy(topicDists[:, :K])
    bound += ent_topics

    # E[ln p(vocabs|vocabPrior)]
    #
    if type(model.vocabPrior) is float or type(model.vocabPrior) is int:
        prob_vocabs  = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \
               + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:,np.newaxis] ))
    else:
        prob_vocabs  = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \
               + np.sum((wordPrior - 1)[np.newaxis,:] * (diWordDists - diSumWordDists[:,np.newaxis] ))

    bound += prob_vocabs

    # and its entropy
    ent_vocabs = _dirichletEntropy(wordDists)
    bound += ent_vocabs

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior)

    prob_words = 0
    prob_z = 0
    ent_z = 0
    for d in range(D):
        wordIdx, z = _infer_topics_at_d(d, data, weights, docLens, topicMeans,
                                        topicPrior, diWordDists,
                                        diSumWordDists)

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        exLnTopic = diTopicDists[d, :K] - diSumTopicDists[d]
        prob_z += np.dot(z * exLnTopic[:, np.newaxis], W[d, :].data).sum()

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(
            W[d, :].data[np.newaxis, :] * z *
            (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

    bound += (prob_z + ent_z + prob_words)

    # Next, the distribution over links - we just focus on the positives in this case
    for d in range(D):
        links = _links_up_to(d, X)
        if len(links) == 0:
            continue

        scores = topicMeans[links, :].dot(weights * topicMeans[d])
        probs = _probit_inplace(scores) + minNonZero
        lnProbs = np.log(probs, out=probs)

        # expected probability of all links from d to p < d such that y_dp = 1
        bound += lnProbs.sum()

    _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)
    return bound
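_dirichletEntropy is referenced above but not shown in this listing. A plausible row-wise implementation of the standard Dirichlet entropy it presumably computes (an assumption about its behaviour, not the library's own code):

import numpy as np
from scipy.special import gammaln, digamma

def dirichlet_entropy_rows(alpha):
    # Sum over rows d of H[Dir(alpha_d)] =
    #   ln B(alpha_d) + (a0_d - K) * psi(a0_d) - sum_k (alpha_dk - 1) * psi(alpha_dk)
    K = alpha.shape[1]
    a0 = alpha.sum(axis=1)
    lnB = gammaln(alpha).sum(axis=1) - gammaln(a0)
    return np.sum(lnB + (a0 - K) * digamma(a0)
                  - ((alpha - 1) * digamma(alpha)).sum(axis=1))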
Example #9
def var_bound(data, model, query, z_dnk = None):
    '''
    Determines the variational bounds.
    '''
    bound = 0
    
    # Unpack the structs, for ease of access and efficiency
    K, topicPrior, wordPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype
    docLens, topicDists = \
        query.docLens, query.topicDists

    # Initialize z matrix if necessary
    W,X = data.words, data.links
    D,T = W.shape
        
    # Perform the digamma transform for E[ln \theta] etc.
    topicDists      = topicDists.copy()
    diTopicDists    = fns.digamma(topicDists[:, :K])
    diSumTopicDists = fns.digamma(topicDists[:, :K].sum(axis=1))
    diWordDists     = fns.digamma(model.wordDists)
    diSumWordDists  = fns.digamma(model.wordDists.sum(axis=1))

    # E[ln p(topics|topicPrior)] according to q(topics)
    #
    prob_topics = D * (fns.gammaln(topicPrior[:K].sum()) - fns.gammaln(topicPrior[:K]).sum()) \
        + np.sum((topicPrior[:K] - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis]))

    bound += prob_topics

    # and its entropy
    ent_topics = _dirichletEntropy(topicDists[:, :K])
    bound += ent_topics
        
    # E[ln p(vocabs|vocabPrior)]
    #
    if type(model.vocabPrior) is float or type(model.vocabPrior) is int:
        prob_vocabs = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \
               + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:, np.newaxis] ))
    else:
        prob_vocabs = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \
               + np.sum((wordPrior - 1)[np.newaxis,:] * (diWordDists - diSumWordDists[:, np.newaxis] ))

    bound += prob_vocabs

    # and its entropy
    ent_vocabs = _dirichletEntropy(wordDists)
    bound += ent_vocabs

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior)

    prob_words = 0
    prob_z     = 0
    ent_z      = 0
    for d in range(D):
        wordIdx, z = _infer_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diSumWordDists)

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        exLnTopic = diTopicDists[d, :K] - diSumTopicDists[d]
        prob_z += np.dot(z * exLnTopic[:, np.newaxis], W[d, :].data).sum()

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(W[d, :].data[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis]))
        
        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

    bound += (prob_z + ent_z + prob_words)

    _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)
    return bound
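_infer_topics_at_d is not shown either; a generic sketch of how such per-document responsibilities are usually formed from the digamma statistics (illustrative only, not necessarily the library's implementation):

import numpy as np

def responsibilities_for_doc(ex_ln_topic, ex_ln_vocab_at_words):
    # z_kn proportional to exp(E[ln theta_k] + E[ln phi_k,w_n]), normalised over k.
    # ex_ln_topic: length-K vector; ex_ln_vocab_at_words: K x N_d matrix for the
    # distinct words of one document.
    log_z = ex_ln_topic[:, np.newaxis] + ex_ln_vocab_at_words
    log_z -= log_z.max(axis=0, keepdims=True)     # guard against underflow
    z = np.exp(log_z)
    z /= z.sum(axis=0, keepdims=True)
    return z                                      # K x N_d, columns sum to one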
Example #10
def varBound (modelState, queryState, X, W, lnVocab = None, XAT=None, XTX = None, scaledWordCounts = None, UTU = None, VTV = None):
    '''
    For a current state of the model, and the query, for given inputs, outputs the variational
    lower-bound.
    
    Params
    
    modelState - the state of the model currently
    queryState - the state of the query currently
    X          - the DxF matrix of features we're querying on, where D is the number of documents
    W          - the DxT matrix of words ("terms") we're querying on
    lnVocab    - the KxT element-wise natural log of the vocabulary distribution
                 (accepted for compatibility; recalculated internally where needed)
    scaledWordCounts - the sparse DxT scaled word-count matrix (accepted for
                 compatibility; recalculated internally where needed)
    XAT        - DxK dot product of XA', recalculated if not provided, where X is DxF and A' is FxK
    XTX        - dot product of X-transpose and X, recalculated if not provided.
    UTU        - as above for U
    VTV        - as above for V
    
    Returns
        The (positive) variational lower bound
    '''
    
    # Unpack the model and query state tuples for ease of use and maybe speed improvements
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T, modelState.A, modelState.varA, modelState.Y, modelState.omY, modelState.sigY, modelState.sigT, modelState.U, modelState.V, modelState.vocab, modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar
    (expLmda, nu, lxi, s, docLen) = (queryState.expLmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen)
    
    lmda = np.log(expLmda)
    
    # Get the number of samples from the shape. Ensure that the shapes are consistent
    # with the model parameters.
    (D, Tcheck) = W.shape
    if Tcheck != T: raise ValueError ("The shape of the DxT document matrix W is invalid, T is %d but the matrix W has shape (%d, %d)" % (T, D, Tcheck))
    
    (Dcheck, Fcheck) = X.shape
    if Dcheck != D: raise ValueError ("Inconsistent sizes between the matrices X and W, X has %d rows but W has %d" % (Dcheck, D))
    if Fcheck != F: raise ValueError ("The shape of the DxF feature matrix X is invalid. F is %d but the matrix X has shape (%d, %d)" % (F, Dcheck, Fcheck)) 

    # We'll need the original xi for this and also Z, the 3D tensor of which for each document D 
    # and term T gives the strength of topic K. We'll also need the log of the vocab dist
    xi = deriveXi (lmda, nu, s)
    
    # If not already provided, we'll also need the following products
    #
    if XAT is None:
        XAT = X.dot(A.T)
    if XTX is None:
        XTX = X.T.dot(X)
    if V is not None and VTV is None:
        VTV = V.T.dot(V)
    if U is not None and UTU is None:
        UTU = U.T.dot(U)
        
    # also need one over the usual variances
    overSsq, overAsq, overKsq, overTsq = 1./sigmaSq, 1./alphaSq, 1./kappaSq, 1./tauSq
    overTkSq = overTsq * overKsq
    overAsSq = overAsq * overSsq
   
    # <ln p(Y)>
    #
    trSigY = 1 if sigY is None else np.trace(sigY)
    trOmY  = 1 if omY  is None else np.trace(omY)
    lnP_Y = -0.5 * (Q*P * LOG_2PI + overTkSq * trSigY * trOmY + overTkSq * np.trace(Y.dot(Y.T)))
    
    # <ln P(A|Y)>
    # TODO it looks like I should take the trace of omA \otimes I_K here.
    # TODO Need to check re-arranging sigY and omY is sensible.
    halfKF = 0.5 * K * F
    
    # Horrible, but varBound can be called by two implementations, one with Y as a matrix-variate
    # where sigY is QxQ and one with Y as a multivariate, where sigY is QPxQP.
    A_from_Y = Y.dot(U.T) if V is None else U.dot(Y).dot(V.T)
    A_diff = A - A_from_Y
    varFactorU = np.trace(sigY.dot(np.kron(VTV, UTU))) if sigY.shape[0] == Q*P else np.sum(sigY*UTU)
    varFactorV = 1 if V is None \
        else np.sum(omY * V.T.dot(V))
    lnP_A = -halfKF * LOG_2PI - halfKF * log (alphaSq) -halfKF * log(sigmaSq) \
            -0.5 * (overAsSq * varFactorV * varFactorU \
                      + np.trace(XTX.dot(varA)) * K \
                      + np.sum(np.square(A_diff)))
            
    # <ln p(Theta|A,X)
    # 
    lmdaDiff = lmda - XAT
    lnP_Theta = -0.5 * D * LOG_2PI -0.5 * D * K * log (sigmaSq) \
                -0.5 / sigmaSq * ( \
                    np.sum(nu) + D*K * np.sum(XTX * varA) + np.sum(np.square(lmdaDiff)))
    # Why is the order of sigT reversed? Because we've not been consistent: A is KxF but lmda is DxK, and
    # note that the distribution of lmda transpose has the same covariances, just in different positions
    # (i.e. row is col and vice-versa)
    
    # <ln p(Z|Theta)
    # 
    docLenLmdaLxi = docLen[:, np.newaxis] * lmda * lxi
    scaledWordCounts = sparseScalarQuotientOfDot(W, expLmda, vocab)

    lnP_Z = 0.0
    lnP_Z -= np.sum(docLenLmdaLxi * lmda)
    lnP_Z -= np.sum(docLen[:, np.newaxis] * nu * nu * lxi)
    lnP_Z += 2 * np.sum (s[:, np.newaxis] * docLenLmdaLxi)
    lnP_Z -= 0.5 * np.sum (docLen[:, np.newaxis] * lmda)
    lnP_Z += np.sum (lmda * expLmda * (scaledWordCounts.dot(vocab.T))) # n(d,k) = expLmda * (scaledWordCounts.dot(vocab.T))
    lnP_Z -= np.sum(docLen[:,np.newaxis] * lxi * ((s**2)[:,np.newaxis] - xi**2))
    lnP_Z += 0.5 * np.sum(docLen[:,np.newaxis] * (s[:,np.newaxis] + xi))
    lnP_Z -= np.sum(docLen[:,np.newaxis] * safe_log_one_plus_exp_of(xi))
    lnP_Z -= np.sum (docLen * s)
        
    # <ln p(W|Z, vocab)>
    # 
    lnP_w_dt = sparseScalarProductOfDot(scaledWordCounts, expLmda, vocab * safe_log(vocab))
    lnP_W = np.sum(lnP_w_dt.data)
    
    # H[q(Y)]
    lnDetOmY  = 0 if omY  is None else safe_log_det(omY)
    lnDetSigY = 0 if sigY is None else safe_log_det(sigY)
    ent_Y = 0.5 * (P * K * LOG_2PI_E + Q * lnDetOmY + P * lnDetSigY)
    
    # H[q(A|Y)]
    #
    # A few things - omA is fixed so long as tau and sigma are, so there's no benefit in
    # recalculating this every time.
    #
    # However in a recent test, la.det(omA) = 0
    # this is very strange as omA is the inverse of (s*I + t*XTX)
    #
    ent_A = 0.5 * (F * K * LOG_2PI_E + K * safe_log_det(varA) + F * K * log (tauSq))
    
    # H[q(Theta|A)]
    ent_Theta = 0.5 * (K * LOG_2PI_E + np.sum (np.log(nu * nu)))
    
    # H[q(Z|\Theta)
    #
    # So Z_dtk \propto expLmda_dk * vocab_kt. We let N here be the normalizer (which is
    # \sum_j expLmda_dj * vocab_jt), which implies N is DxT. We need to evaluate
    # Z_dtk * log Z_dtk. We can pull out the normalizer of the first term, but it has
    # to stay in the log Z_dtk expression, hence the third term in the sum. We can however
    # take advantage of the ability to mix dot and element-wise products for the different
    # components of Z_dtk in that three-term sum, which we denote as S
    #   Finally we use np.sum to sum over d and t
    #
    ent_Z = 0 #entropyOfDot(expLmda, vocab)
    
    result = lnP_Y + lnP_A + lnP_Theta + lnP_Z + lnP_W + ent_Y + ent_A + ent_Theta + ent_Z
    
    return result
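entropyOfDot is stubbed out to zero above; the quantity the closing comment describes can be written directly for small inputs (a dense, illustrative version, not the library's sparse implementation):

import numpy as np

def entropy_of_dot_dense(expLmda, vocab):
    # Z_dtk proportional to expLmda_dk * vocab_kt, normalised over topics k;
    # returns -sum_dtk Z_dtk * ln Z_dtk.
    Z = expLmda[:, :, np.newaxis] * vocab[np.newaxis, :, :]   # D x K x T
    Z /= Z.sum(axis=1, keepdims=True)
    return -np.sum(Z * np.log(np.where(Z > 0, Z, 1.0)))       # treat 0*log(0) as 0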
Example #11
def var_bound(data, model, query, z_dnk=None):
    '''
    Determines the variational bounds.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    docLens, topics, postTopicCov, U, V, tsums_bydoc, tsums_bytop, exp_tsums_bydoc, exp_tsums_bytop, lse_at_k, out_counts, in_counts = \
        query.docLens, query.topics, query.postTopicCov, query.U, query.V, query.tsums_bydoc, query.tsums_bytop, query.exp_tsums_bydoc, query.exp_tsums_bytop, query.lse_at_k, query.out_counts, query.in_counts
    K, Q, topicPrior, vocabPrior, wordDists, topicCov, dtype, name = \
     model.K, model.Q, model.topicPrior, model.vocabPrior, model.wordDists, model.topicCov, model.dtype, model.name

    W, L = data.words, data.links
    D, T = W.shape
    bound = 0

    # Pre-calculate some repeated expressions
    logVagueness = log(Vagueness)
    halfDQ, halfQK, halfDK = 0.5 * D * Q, 0.5 * Q * K, 0.5 * D * K
    logTwoPi = log(2 * pi)
    logTwoPiE = log(2 * pi * e)

    # # E[ln p(U)]
    # bound += -halfDQ * logTwoPi - D * Q * logVagueness - 0.5 * np.sum(U * U) # trace of U U'
    #
    # # H[q(U)]
    # bound += -halfDQ * logTwoPiE - D * Q * logVagueness
    #
    # # E[ln p(V)]
    # bound += -halfQK * logTwoPi - Q * K * logVagueness - 0.5 * np.sum(V * V) # trace of U U'
    #
    # # H[q(V)]
    # bound += -halfQK * logTwoPiE - D * Q * logVagueness

    # ln p(Topics|U, V)
    logDetCov = log(la.det(topicCov))
    kernel = topics.copy()
    kernel -= U.dot(V)
    kernel **= 2
    kernel[:] = kernel.dot(topicCov)
    kernel /= (2 * Vagueness)
    bound += -halfDK * logTwoPi - halfDK * logVagueness \
             -D * 0.5 * logDetCov \
             -np.sum(kernel) \
             -np.sum(postTopicCov)
    # FIXME bound here is squiffy

    # H[q(topics)]
    bound += -halfDK * logTwoPiE - halfDK * logVagueness - D * 0.5 * logDetCov

    # We'll need these for the next steps
    diWordDists = fns.digamma(wordDists)
    diWordDistSums = fns.digamma(wordDists.sum(axis=1))

    # P(z|topic) and P(y|topic) are not stored explicitly, so we need to
    # recalculate here to calculate their expected log-probs and entropies.
    prob_words, prob_links = 0, 0
    prob_z, ent_z = 0, 0
    prob_y, ent_y = 0, 0
    for d in range(D):
        # First the word-topic assignments, note this is a KxV matrix
        wordIdx, z = _infer_word_topics_at_d(d, W, topics, diWordDists,
                                             diWordDistSums)

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        prob_z += topics[d, :].dot(z * W[d, :].data[np.newaxis, :]).sum()
        prob_z -= docLens[d] * lse(topics[d, :])

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(
            W[d, :].data[np.newaxis, :] * z *
            (diWordDists[:, wordIdx] - diWordDistSums[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

        # Next the link-topic assignments, note this is a PxK matrix
        linkIdx, y = _infer_link_topics_at_d(d, L, topics, lse_at_k)

        # Here we _start_ with the entropy of y
        ent_y -= np.dot(L[d, :].data, y * safe_log(y)).sum()

        # E[ln p(Y|topics) = sum_d sum_m sum_k E[y_dmk] E[ln topicDist_dk]
        y *= L[d, :].data[:, np.newaxis]
        prob_y += y.dot(topics[d, :].T).sum()
        prob_y -= out_counts[d] * lse(topics[d, :])

        # E[ln p(L|Y)] = sum_d sum_m sum_k sum_t E[y_dmk] l_dmp E[ln topics_pk]
        prob_links += y.dot(topics[linkIdx, :].T).sum()
        prob_links -= y.dot(lse_at_k).sum()

    bound += (prob_z + ent_z + prob_words)
    bound += (prob_y + ent_y + prob_links)

    return bound
Example #12
def varBound (modelState, queryState, X, W, Z = None, lnVocab = None, varA_U = None, XA = None, XTX = None):
    '''
    For a current state of the model, and the query, for given inputs, outputs the variational
    lower-bound.
    
    Params
    
    modelState - the state of the model currently
    queryState - the state of the query currently
    X          - the DxF matrix of features we're querying on, where D is the number of documents
    W          - the DxT matrix of words ("terms") we're querying on
    Z          - if this has already been calculated, it can be passed in. If not, we
                 recalculate it from the model and query states. Z is the DxKxT tensor which
                 for each document D and term T gives the proportion of those terms assigned
                 to topic K
    lnVocab    - the KxV matrix of the natural log applied to the vocabulary. Recalculated if
                 not provided
    varA_U     - the product of the column variance matrix and the matrix U. Recalculated if
                 not provided
    XA         - dot product of X and A, recalculated if not provided
    XTX        - dot product of X-transpose and X, recalculated if not provided.
    
    Returns
    The (positive) variational lower bound
    '''
    
    # Unpack the model and query state tuples for ease of use and maybe speed improvements
    (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab)
    (lmda, nu, lxi, s, docLen) = (queryState.lmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen)
    
    # Get the number of samples from the shape. Ensure that the shapes are consistent
    # with the model parameters.
    (D, Tcheck) = W.shape
    if Tcheck != T: raise ValueError ("The shape of the document matrix W is invalid, T is %d but the matrix W has shape (%d, %d)" % (T, D, Tcheck))
    
    (Dcheck, Fcheck) = X.shape
    if Dcheck != D: raise ValueError ("Inconsistent sizes between the matrices X and W, X has %d rows but W has %d" % (Dcheck, D))
    if Fcheck != F: raise ValueError ("The shape of the feature matrix X is invalid. F is %d but the matrix X has shape (%d, %d)" % (F, Dcheck, Fcheck)) 

    # We'll need the original xi for this and also Z, the 3D tensor of which for each document D 
    # and term T gives the strength of topic K. We'll also need the log of the vocab dist
    xi = deriveXi (lmda, nu, s)
    
    if lnVocab is None:
        lnVocab = safe_log(vocab)
    if Z is None:
        Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxV
   
    
    # lnProb1 is the bound on E[p(W|Theta)]. This is a bound, not an equality as we're using
    # Bouchard's softmax bound (NIPS 2007) here. That said, most of the subsequent terms
    # will discard additive constants, so strictly speaking none of them are equalities
    docLenLmdaLxi = docLen[:, np.newaxis] * lmda * lxi
    
    lnProb1 = 0.0
    lnProb1 -= np.sum(docLenLmdaLxi * lmda)
    lnProb1 -= np.sum(docLen[:, np.newaxis] * nu * nu * lxi)
    lnProb1 += 2 * np.sum (s[:, np.newaxis] * docLenLmdaLxi)
    lnProb1 -= 0.5 * np.sum (docLen[:, np.newaxis] * lmda)
    lnProb1 += np.sum (lmda * np.einsum ('dt,dkt->dk', W, Z))
    
    lnProb1 += np.sum(lnVocab * np.einsum('dt,dkt->kt', W, Z))
    lnProb1 -= np.sum(W * np.einsum('dkt->dt', safe_x_log_x(Z)))
    
    lnProb1 -= np.sum(docLen[:,np.newaxis] * lxi * ((s**2)[:,np.newaxis] - xi**2))
    lnProb1 += 0.5 * np.sum(docLen[:,np.newaxis] * (s[:,np.newaxis] + xi))
    lnProb1 -= np.sum(docLen[:,np.newaxis] * safe_log_one_plus_exp_of(xi))
        
    # lnProb2 is E[p(Theta|A)]
    if XA is None:
        XA = X.dot(A)
    if XTX is None:
        XTX = X.T.dot(X)
    sig2  = sigma * sigma
    tau2  = tau * tau
    
    lnProb2 = -0.5 * D * K * log (sig2) \
            -  0.5 / sig2 * (np.sum(nu) + D*K * tau2 * np.sum(XTX * varA) + np.sum((lmda - XA)**2))
    
    # lnProb3 is E[p(A|V)]
    if varA_U is None:
        varA_U = varA.dot(U)
        
    lnProb3 = -0.5 * K * F * log (2 * pi) \
          -0.5 * K * F * log(tau2) \
          -0.5 / tau2 * \
          ( \
          np.trace(varA)*K*tau2 \
          + np.sum(varA_U * U) * K * tau2  \
          + np.sum((A - U.dot(V)) ** 2) \
          )
          
    # lnProb4 is E[p(V)]
    lnProb4 = -0.5 * (np.trace(varV) * K * tau2 + np.sum(V*V))
    
    # ent1 is H[q(Theta)]
    ent1 = 0.5 * np.sum (np.log(nu * nu))
    
    # ent2 is H[q(A|V)]
    ent2 = 0.5 * F * K * log(2 * pi * e) + 0.5 * K * log (la.det(varA)) + 0.5 * F * K * log (tau2)
    
    # ent3 is H[q(V)]
    ent3 = 0.5 * P * K * log (2 * pi * e) + 0.5 * K * log (la.det(varV)) + 0.5 * P * K * log (tau2)
    
    result = lnProb1 + lnProb2 + lnProb3 + lnProb4 + ent1 + ent2 + ent3
#    if (lnProb1 > 0) or (lnProb2 > 0) or (lnProb3 > 0) or (lnProb4 > 0):
#        print ("Whoopsie - lnProb > 0")
    
#    if result > 100:
#        print ("Well this is just ridiculous")
    
    return result
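For reference, the quadratic bound behind xi, lxi and safe_log_one_plus_exp_of is the Jaakkola-Jordan bound that Bouchard's softmax bound builds on:

\[
\ln\!\bigl(1 + e^{x}\bigr) \;\le\; \frac{x - \xi}{2} \;+\; \lambda(\xi)\,\bigl(x^{2} - \xi^{2}\bigr) \;+\; \ln\!\bigl(1 + e^{\xi}\bigr),
\qquad
\lambda(\xi) \;=\; \frac{1}{2\xi}\Bigl(\sigma(\xi) - \tfrac{1}{2}\Bigr) \;=\; \frac{\tanh(\xi/2)}{4\xi},
\]

where \(\sigma\) is the logistic sigmoid. The bound is tight at \(x = \pm\xi\); the lxi terms in lnProb1 are, up to the sign convention implied by the name negJakkola, instances of \(\lambda(\xi)\).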
Example #13
def train(modelState, X, W, iterations=10000, epsilon=0.001, logInterval = 0):
    '''
    Creates a new query state object for a topic model based on side-information. 
    This contains all those estimated parameters that are specific to the actual
    data being queried - this must be used in conjunction with a model state.
    
    The parameters are
    
    modelState - the model state with all the model parameters
    X - the D x F matrix of side information vectors
    W - the D x V matrix of word **count** vectors.
    iterations - how long to iterate for
    epsilon - currently ignored; in future it will allow us to stop early.
    
    This returns a tuple of new model-state and query-state. The latter object will
    contain X and W and also
    
    s      - A D-dimensional vector describing the offset in our bound on the true value of ln sum_k e^theta_dk 
    lxi    - A DxK matrix used in the above bound, containing the negative Jaakkola function applied to the
             quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of topics we've inferred (independent)
    '''
    # Unpack the model state tuple for ease of use and maybe speed improvements
    (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab)
       
    # Get ready to plot the evolution of the likelihood
    if logInterval > 0:
        elbos = np.zeros((iterations // logInterval,))
        iters = np.zeros((iterations // logInterval,))
    
    # We'll need the total word count per doc, and total count of docs
    docLen = W.sum(axis=1)
    D      = len(docLen)
    
    # No need to recompute this every time
    XTX = X.T.dot(X)
    
    # Assign initial values to the query parameters
    lmda = rd.random((D, K))
    nu   = np.ones((D,K), np.float64)
    s    = np.zeros((D,))
    lxi  = negJakkola (np.ones((D, K), np.float64))
    
    XA = X.dot(A)
    for iteration in range(iterations):
        
        # Save repeated computation
        tsq      = tau * tau;
        tsqIP    = tsq * np.eye(P)
        trTsqIK  = K * tsq # trace of the matrix tau * tau * np.eye(K)
        halfSig2 = 1./(sigma*sigma)
        tau2sig2 = (tau * tau) / (sigma * sigma)
        
        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|V) q(V)
        #   Where lambda is the posterior mean of theta.
        # =============================================================
        
        #
        # V, varV
        varV = la.inv (tsqIP + U.T.dot(U))
        V    = varV.dot(U.T).dot(A)
        _quickPrintElbo ("E-Step: q(V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # A, varA
        # TODO, since only tau2sig2 changes at each step, would it be possible just to
        # amend the old inverse?
        # TODO Use sparse inverse
        varA = la.inv (tau2sig2 * XTX + np.eye(F))
        A    = varA.dot (U.dot(V) + X.T.dot(lmda))
        XA   = X.dot(A)
        _quickPrintElbo ("E-Step: q(A|V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
       
        #
        # lmda_dk
        lnVocab = safe_log (vocab)
        Z   = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxT
        rho = 2 * s[:,np.newaxis] * lxi - 0.5 \
            + np.einsum('dt,dkt->dk', W, Z) / docLen[:,np.newaxis]
        
        rhs  = docLen[:,np.newaxis] * rho + halfSig2 * X.dot(A)
        lmda = rhs / (docLen[:,np.newaxis] * 2 * lxi + halfSig2)
        
        _quickPrintElbo ("E-Step: q(Theta|A;lamda)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
              
        
        #
        # nu_dk
        # TODO Double check this again...
        nu = 1./ np.sqrt(2. * docLen[:, np.newaxis] * lxi + halfSig2)

        _quickPrintElbo ("E-Step: q(Theta|A;nu)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        # =============================================================
        # M-Step
        #    Parameters for the softmax bound: lxi and s
        #    The projection used for A: U
        #    The vocabulary : vocab
        #    The variances: tau, sigma
        # =============================================================
        
        #
        # s_d
#         s = (K/4. + (lxi * lmda).sum(axis = 1)) / lxi.sum(axis=1)
#         _quickPrintElbo ("M-Step: max s", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        

        #
        # xi_dk
        lxi = negJakkolaOfDerivedXi(lmda, nu, s)
        _quickPrintElbo ("M-Step: max xi", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # vocab
        #
        # TODO, since vocab is in the RHS, is there any way to optimize this?
        Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxV
        vocab = normalizerows_ip (np.einsum('dt,dkt->kt', W, Z))
        _quickPrintElbo ("M-Step: max vocab", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # U
        U = A.dot(V.T).dot (la.inv(trTsqIK * varV + V.dot(V.T)))
        _quickPrintElbo ("M-Step: max U", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen)
        
        #
        # sigma
        #    Equivalent to \frac{1}{DK} \left( \sum_d (\sum_k nu_{dk}) + tr(\Omega_A) x_d^{T} \Sigma_A x_d + (\lambda - A^{T} x_d)^{T}(\lambda - A^{T} x_d) \right)
        #
#        sigma = 1./(D*K) * (np.sum(nu) + D*K * tsq * np.sum(XTX * varA) + np.sum((lmda - XA)**2))
        
        #
        # tau
        #    Equivalent to \frac{1}{KF} \left( tr(\Sigma_A)tr(\Omega_A) + tr(\Sigma_V U U^{T})tr(\Omega_V) + tr ((M_A - U M_V)^{T} (M_A - U M_V)) \right)
        #
        varA_U = varA.dot(U)
#        tau_term1 = np.trace(varA)*K*tsq
#        tau_term2 = sum(varA_U[p,:].dot(U[p,:]) for p in xrange(P)) * K * tsq
#        tau_term3 = np.sum((A - U.dot(V)) ** 2)
#        
#        tau = 1./(K*F) * (tau_term1 + tau_term2 + tau_term3)
        
        if (logInterval > 0) and (iteration % logInterval == 0):
            elbo = varBound ( \
                VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \
                VbSideTopicQueryState(lmda, nu, lxi, s, docLen),
                X, W, Z, lnVocab, varA_U, XA, XTX)
                
            elbos[iteration // logInterval] = elbo
            iters[iteration // logInterval] = iteration
            print ("Iteration %5d  ELBO %f" % (iteration, elbo))
        
    if logInterval > 0:
        plot_bound(iters, elbos)
    
    return (VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \
            VbSideTopicQueryState (lmda, nu, lxi, s, docLen))
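negJakkola itself is not shown in this listing. Given the lambda(xi) written out after Example #12, a plausible sketch of that helper (an assumption about its definition, including its sign convention, rather than the library's code):

import numpy as np

def neg_jaakkola(xi):
    # lambda(xi) = (sigma(xi) - 1/2) / (2*xi), extended by continuity to lambda(0) = 1/8.
    xi = np.asarray(xi, dtype=float)
    safe = np.where(xi == 0.0, 1.0, xi)               # avoid 0/0; patched on the next line
    lam = (1.0 / (1.0 + np.exp(-safe)) - 0.5) / (2.0 * safe)
    return np.where(xi == 0.0, 0.125, lam)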