def log_likelihood(data, modelState, queryState):
    '''
    Return the log-likelihood of the given data W according to the model
    and the parameters inferred for the entries in W stored in the
    queryState object.
    '''
    probs    = rowwise_softmax(queryState.means)
    doc_dist = colwise_softmax(queryState.means)

    word_likely = np.sum(
        sparseScalarProductOfSafeLnDot(data.words, probs, modelState.vocab).data
    )
    link_likely = np.sum(
        sparseScalarProductOfSafeLnDot(data.links, probs, doc_dist).data
    )

    return word_likely + link_likely
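# --------------------------------------------------------------------------
# A minimal sketch (not part of the original module) of the two softmax
# helpers the function above assumes. The names rowwise_softmax and
# colwise_softmax are taken from the call-sites; the max-subtraction is just
# the usual overflow guard, and the real helpers may differ in detail.
# --------------------------------------------------------------------------
import numpy as np

def rowwise_softmax_sketch(means):
    # one distribution per row (document): each row sums to one
    shifted = means - means.max(axis=1)[:, np.newaxis]
    expd = np.exp(shifted)
    return expd / expd.sum(axis=1)[:, np.newaxis]

def colwise_softmax_sketch(means):
    # one distribution per column (topic): each column sums to one
    shifted = means - means.max(axis=0)[np.newaxis, :]
    expd = np.exp(shifted)
    return expd / expd.sum(axis=0)[np.newaxis, :]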
def log_likelihood(data, modelState, queryState):
    '''
    Return the log-likelihood of the given data according to the model
    and the parameters inferred for the datapoints in the query-state
    object. Returns the sum of the word and link log-likelihoods over
    all documents.
    '''
    topicProbs = topicDists(queryState)
    wordLikely = sparseScalarProductOfSafeLnDot(
        data.words, topicProbs, wordDists(modelState)).sum()

    # One distribution-column per document, for the link term
    docProbs = np.empty((modelState.K, data.doc_count), dtype=modelState.dtype)
    docProbs[:, :] = topicProbs.T

    linkLikely = sparseScalarProductOfSafeLnDot(
        data.links, topicProbs, docProbs).sum()

    return wordLikely + linkLikely
def log_likelihood(modelState, X, W, queryState):
    '''
    Returns the log likelihood of the given features and words according to
    the given model.

    modelState - the model, provided by #train() - to use to evaluate the data
    X          - the DxF matrix of features
    W          - the DxT matrix of words

    Return:
    The marginal likelihood of the data
    '''
    if W.dtype.kind == 'i':   # for the sparseScalarProductOfSafeLnDot() method to work
        W = W.astype(DTYPE)

    F, T, vocab = modelState.F, modelState.T, modelState.vocab
    assert X.shape[1] == F, "Model is trained to expect " + str(F) + " features but feature-matrix has " + str(X.shape[1]) + " features"
    assert W.shape[1] == T, "Model is trained to expect " + str(T) + " words, but word-matrix has " + str(W.shape[1]) + " words"

    expLmda = queryState.expLmda
    row_sums = expLmda.sum(axis=1)
    expLmda /= row_sums[:, np.newaxis]   # converts it to a true distribution

    likely = np.sum(sparseScalarProductOfSafeLnDot(W, expLmda, vocab).data)

    # Revert expLmda to its original value, as this is a reference to, not a
    # copy of, the original matrix
    expLmda *= row_sums[:, np.newaxis]

    return likely
def var_bound(data, model, query, topicDistOverride=None):
    '''
    Determines the variational bounds.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDistsMat, corpusTopicDist, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.corpusTopicDist, model.dtype

    tops = topicDistOverride \
        if topicDistOverride is not None \
        else topicDists(query)

    # Initialize z matrix if necessary
    W = data.words
    D, T = W.shape

    wordLikely  = sparseScalarProductOfSafeLnDot(data.words, tops, wordDists(model)).sum()
    topicLikely = topicMeans.dot(fns.digamma(corpusTopicDist) - fns.digamma(corpusTopicDist.sum()))

    # Expected joint
    like  = W.dot(safe_log(wordDistsMat).T)   # D x K
    like += corpusTopicDist[np.newaxis, :]
    like *= safe_log(topicMeans)

    # Entropy
    ent = (-topicMeans * safe_log(topicMeans)).sum()

    return like.sum() + ent
def log_likelihood(data, modelState, queryState):
    '''
    Return the log-likelihood of the given data W according to the model
    and the parameters inferred for the entries in W stored in the
    queryState object. Returns the sum over all documents of the
    per-document log likelihoods.
    '''
    return sparseScalarProductOfSafeLnDot(
        data.words, topicDists(queryState), wordDists(modelState)).sum()
def log_likelihood(data, model, query):
    '''
    Return the log-likelihood of the given data W according to the model
    and the parameters inferred for the entries in W stored in the
    queryState object.
    '''
    W = data.words if data.words.dtype is model.dtype \
        else data.words.astype(model.dtype)

    return sparseScalarProductOfSafeLnDot(W, topicDists(query), wordDists(model)).sum()
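# --------------------------------------------------------------------------
# A minimal sketch (not part of the original module) of what
# sparseScalarProductOfSafeLnDot is assumed to compute: the element-wise
# product of the sparse count matrix W with ln(topics.dot(vocab)), evaluated
# only at W's non-zero cells, with a floor inside the log so that ln(0)
# never occurs. Summing the result's .data gives the log-likelihood terms
# used throughout this file. The real helper is a compiled routine; this is
# an illustration only.
# --------------------------------------------------------------------------
import numpy as np
import scipy.sparse as ssp

def sparse_scalar_product_of_safe_ln_dot_sketch(W, topics, vocab, eps=1e-300):
    W = W.tocoo()
    # reconstructed probability of each observed (doc, word) cell only
    recons = np.einsum('ij,ji->i', topics[W.row, :], vocab[:, W.col])
    data = W.data * np.log(np.maximum(recons, eps))
    return ssp.coo_matrix((data, (W.row, W.col)), shape=W.shape)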
def log_likelihood(data, modelState, queryState):
    '''
    Return the log-likelihood of the given data W according to the model
    and the parameters inferred for the entries in W stored in the
    queryState object.
    '''
    return np.sum(
        sparseScalarProductOfSafeLnDot(
            data.words,
            rowwise_softmax(queryState.means),
            modelState.vocab
        ).data
    )
def log_likelihood_point(data, model, query, topicDistOverride=None):
    '''
    Return the log-likelihood of the given data according to the model and
    the parameters inferred for the datapoints in the query-state object.
    Returns the sum over all documents of the per-document log likelihoods.
    '''
    tops = topicDistOverride \
        if topicDistOverride is not None \
        else topicDists(query)

    wordLikely = sparseScalarProductOfSafeLnDot(data.words, tops, wordDists(model)).sum()

    return wordLikely
def log_likelihood_point(data, modelState, queryState):
    '''
    Return the log-likelihood of the given data W according to the model
    and the parameters inferred for the entries in W stored in the
    queryState object. Returns the sum over all documents of the
    per-document log likelihoods.
    '''
    n_dk, n_kt = queryState.n_dk, modelState.n_kt
    a, b = modelState.topicPrior, modelState.vocabPrior

    if type(a) is float or np.isscalar(a):
        a = constantArray((modelState.K,), a, modelState.dtype)

    W = data.words if data.words.dtype is modelState.dtype \
        else data.words.astype(modelState.dtype)

    # Smooth the counts with their Dirichlet priors
    n_dk += a[np.newaxis, :]
    n_kt += b

    # Scale to create distributions over doc-topics and topic-vocabs
    doc_norm = n_dk.sum(axis=1)
    voc_norm = n_kt.sum(axis=1)

    n_dk /= doc_norm[:, np.newaxis]
    n_kt /= voc_norm[:, np.newaxis]

    # Use distributions to create log-likelihood. This could be made
    # faster still by not materializing the (admittedly sparse) matrix
    ln_likely = sparseScalarProductOfSafeLnDot(W, n_dk, n_kt).sum()

    # Rescale back to word-counts
    n_dk *= doc_norm[:, np.newaxis]
    n_kt *= voc_norm[:, np.newaxis]

    n_dk -= a[np.newaxis, :]
    n_kt -= b

    return ln_likely
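# --------------------------------------------------------------------------
# A non-mutating sketch of the same computation as above, assuming the same
# sparseScalarProductOfSafeLnDot helper: smooth the document-topic and
# topic-word counts with their Dirichlet priors, normalise them into
# distributions, and score the observed words, without touching the caller's
# count matrices. Illustration only.
# --------------------------------------------------------------------------
def log_likelihood_point_sketch(W, n_dk, n_kt, topicPrior, vocabPrior):
    doc_top = n_dk + topicPrior[np.newaxis, :]      # D x K smoothed counts
    top_voc = n_kt + vocabPrior                     # K x T smoothed counts
    doc_top /= doc_top.sum(axis=1)[:, np.newaxis]   # rows sum to one
    top_voc /= top_voc.sum(axis=1)[:, np.newaxis]   # rows sum to one
    return sparseScalarProductOfSafeLnDot(W, doc_top, top_voc).sum()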
def log_likelihood(data, modelState, queryState):
    '''
    Return the log-likelihood of the given data W and X according to the
    model and the parameters inferred for the entries in W and X stored in
    the queryState object.
    '''
    wordLikely = sparseScalarProductOfSafeLnDot(data.words, topicDists(queryState), wordDists(modelState)).sum()

    # For the link likelihood it's a bit tricky. In theory, given d =/= p, and
    # letting c_d = 1/n_d, where n_d is the word count of document d, it's
    #
    #   ln p(y_dp|weights) = E[\sum_k weights[k] * (c_d \sum_n z_dnk) * (c_p \sum_n z_pnk)]
    #                      = \sum_k weights[k] * c_d * E[\sum_n z_dnk] * c_p * E[\sum_n z_pnk]
    #                      = \sum_k weights[k] * topicDistsMean[d,k] * topicDistsMean[p,k]
    #
    # where topicDistsMean[d,k] is the mean of the k-th element of the Dirichlet
    # parameterised by topicDist[d,:]
    #
    # However the related paper on Supervised LDA, which uses this trick of
    # averaging z_dnk, explicitly says that in the likelihood calculation it
    # uses the expectation according to the _variational_ approximate posterior
    # distribution q(z_dn) instead of the actual distribution p(z_dn|topicDist),
    # and thus
    #
    #   E[\sum_n z_dnk] = \sum_n E_q[z_dnk]
    #
    # There's no detail of the likelihood in either of the RTM papers, so we
    # use the variational approach
    linkLikely = 0

    return wordLikely + linkLikely
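# --------------------------------------------------------------------------
# A sketch (not part of the original module) of the link term described in
# the comment above, under the variational expectation E_q. The names
# `weights` (length-K regression weights) and `topMeans` (D x K expected
# topic proportions under q) are hypothetical; each observed link (d, p)
# contributes \sum_k weights[k] * topMeans[d,k] * topMeans[p,k].
# --------------------------------------------------------------------------
import numpy as np

def link_log_likelihood_sketch(links, weights, topMeans):
    # `links` is assumed to be a sparse D x D matrix whose non-zeros mark
    # the observed links between documents
    src, dst = links.nonzero()
    return np.sum((topMeans[src, :] * topMeans[dst, :]).dot(weights))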
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bounds. Values are mutated in place, but
    are reset afterwards to their initial values. So it's safe to call
    in a serial manner.
    '''
    # Unpack the structs, for ease of access and efficiency
    W = data.words
    D, _ = W.shape
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A

    # Calculate some implicit variables
    isigT = la.inv(sigT)

    bound = 0

    if USE_NIW_PRIOR:
        pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
        pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR

        # distribution over topic covariance
        bound -= 0.5 * K * pseudoObsVar * log(NIW_PSI)
        bound -= 0.5 * K * pseudoObsVar * log(2)
        bound -= fns.multigammaln(pseudoObsVar / 2., K)
        bound -= 0.5 * (pseudoObsVar + K - 1) * safe_log_det(sigT)
        bound += 0.5 * NIW_PSI * np.trace(isigT)

        # and its entropy
        # is a constant which we skip

        # distribution over means
        bound -= 0.5 * K * log(1. / pseudoObsMeans) * safe_log_det(sigT)
        bound -= 0.5 / pseudoObsMeans * (topicMean).T.dot(isigT).dot(topicMean)

        # and its entropy
        bound += 0.5 * safe_log_det(sigT)  # + a constant

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * safe_log_det(sigT)
    diff   = means - topicMean[np.newaxis, :]
    bound -= 0.5 * np.sum(diff.dot(isigT) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(isigT)[np.newaxis, :])  # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.

    # And its entropy
    # bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs))

    # Distribution over word-topic assignments and words and the former's
    # entropy. This is somewhat jumbled to avoid repeatedly taking the
    # exp and log of the means
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)  # D x V   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    V = expMeans * (R.dot(vocab.T))                    # D x K

    bound += np.sum(docLens * np.log(np.sum(expMeans, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeans, vocab).data)

    bound += np.sum(means * V)
    bound += np.sum(2 * ssp.diags(docLens, 0) * means.dot(A) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:, np.newaxis] * V * (np.diag(A))[np.newaxis, :])

    bound -= np.sum(means * V)

    return bound
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bounds. Values are mutated in place, but
    are reset afterwards to their initial values. So it's safe to call
    repeatedly.
    '''
    # Unpack the structs, for ease of access and efficiency
    W, X = data.words, data.feats
    D, T, F = W.shape[0], W.shape[1], X.shape[1]
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \
        modelState.K, modelState.A, modelState.U, modelState.Y, modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)
    Log2Pi = log(2 * pi)

    bound = 0

    # U and V are parameters with no distribution
    #
    # Y has a normal distribution; its covariance is unfortunately an
    # expensive computation
    #
    P, Q = U.shape[1], V.shape[1]
    covY = np.eye(P * Q) * (lfv * ltv)
    covY += np.kron(V.T.dot(V), U.T.dot(U))
    covY = la.inv(covY, overwrite_a=True)

    # The expected likelihood of Y
    bound -= 0.5 * P * Q * Log2Pi
    bound -= 0.5 * P * Q * log(ltv * lfv)
    bound -= 0.5 / (lfv * ltv) * np.sum(Y * Y)  # 5x faster than np.trace(Y.dot(Y.T))
    bound -= 0.5 * np.trace(covY) * (lfv * ltv)  # the traces of the posterior+prior covariance products cancel out across likelihoods

    # The entropy of Y
    bound += 0.5 * P * Q * (Log2Pi + 1) + 0.5 * safe_log_det(covY)

    #
    # A has a normal distribution
    #
    F, K = A.shape[0], A.shape[1]
    diff = A - U.dot(Y).dot(V.T)
    diff *= diff

    # The expected likelihood of A
    bound -= 0.5 * K * F * Log2Pi
    bound -= 0.5 * K * F * log(tv * fv)
    bound -= 0.5 / (fv * tv) * np.sum(diff)

    # The entropy of A
    bound += 0.5 * F * K * (Log2Pi + 1) + 0.5 * K * safe_log_det(covA)

    #
    # Theta, the matrix of means, has a normal distribution. Its row-covariance
    # is diagonal (i.e. it's several independent multi-var normal distros).
    # The posterior is made up of D K-dimensional normals with diagonal
    # covariances.
    #
    # We iterate through the documents in batches, to control memory use
    batchSize = min(BatchSize, D)
    batchCount = ceil(D / batchSize)

    feats = np.ndarray(shape=(batchSize, F), dtype=dtype)
    tops  = np.ndarray(shape=(batchSize, K), dtype=dtype)
    trace = 0
    for b in range(0, batchCount):
        start = b * batchSize
        end = min(start + batchSize, D)
        batchSize = min(batchSize, end - start)

        feats[:batchSize, :] = X[start:end, :].toarray()
        np.dot(feats[:batchSize, :], A, out=tops[:batchSize, :])
        tops[:batchSize, :] -= means[start:end, :]
        tops[:batchSize, :] *= tops[:batchSize, :]
        trace += np.sum(tops[:batchSize, :])
    feats = None

    # The expected likelihood of the topic-assignments
    bound -= 0.5 * D * K * Log2Pi
    bound -= 0.5 * D * K * log(tv)
    bound -= 0.5 / tv * trace
    bound -= 0.5 * tv * np.sum(covA)  # this trace doesn't cancel as we
                                      # don't have a posterior on tv

    # The entropy of the topic-assignments
    bound += 0.5 * D * K * (Log2Pi + 1) + 0.5 * np.sum(covA)

    # Distribution over word-topic assignments and words and the former's
    # entropy. This is somewhat jumbled to avoid repeatedly taking the
    # exp and log of the means
    #
    # Again we batch this for safety
    batchSize = min(BatchSize, D)
    batchCount = ceil(D / batchSize)

    V = np.ndarray(shape=(batchSize, K), dtype=dtype)
    for b in range(0, batchCount):
        start = b * batchSize
        end = min(start + batchSize, D)
        batchSize = min(batchSize, end - start)

        meansBatch   = means[start:end, :]
        docLensBatch = docLens[start:end]

        np.exp(meansBatch - meansBatch.max(axis=1)[:, np.newaxis], out=tops[:batchSize, :])
        expMeansBatch = tops[:batchSize, :]

        R = sparseScalarQuotientOfDot(W, expMeansBatch, vocab, start=start, end=end)  # BatchSize x V: [W / TB] is the quotient of the original over the reconstructed doc-term matrix
        V[:batchSize, :] = expMeansBatch * (R[:batchSize, :].dot(vocab.T))            # BatchSize x K
        VBatch = V[:batchSize, :]

        bound += np.sum(docLensBatch * np.log(np.sum(expMeansBatch, axis=1)))
        bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeansBatch, vocab, start=start, end=end).data)

        bound += np.sum(meansBatch * VBatch)
        bound += np.sum(2 * ssp.diags(docLensBatch, 0) * meansBatch.dot(H) * meansBatch)
        bound -= 2. * scaledSelfSoftDot(meansBatch, docLensBatch)
        bound -= 0.5 * np.sum(docLensBatch[:, np.newaxis] * VBatch * (np.diag(H))[np.newaxis, :])

        bound -= np.sum(meansBatch * VBatch)

    return bound
def var_bound(data, modelState, queryState, XTX=None):
    '''
    Determines the variational bounds. Values are mutated in place, but
    are reset afterwards to their initial values. So it's safe to call
    in a serial manner.
    '''
    # Unpack the structs, for ease of access and efficiency
    W, X = data.words, data.feats
    D, _ = W.shape
    means, varcs, lxi, s, docLens = queryState.means, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.dtype

    # Calculate some implicit variables
    xi = ctm._deriveXi(means, varcs, s)
    isigT = la.inv(sigT)
    # lnDetSigT = np.log(la.det(sigT))
    lnDetSigT = lnDetOfDiagMat(sigT)
    verifyProper(lnDetSigT, "lnDetSigT")

    if XTX is None:
        XTX = X.T.dot(X)

    bound = 0

    # Distribution over latent space
    bound -= (P * K) / 2. * LN_OF_2_PI
    bound -= P * lnDetSigT
    bound -= K * P * log(lfv)
    bound -= 0.5 * np.sum(1. / lfv * isigT.dot(Y) * Y)
    bound -= 0.5 * K * np.trace(R_Y)

    # And its entropy
    detR_Y = safeDet(R_Y, "R_Y")
    bound += 0.5 * LN_OF_2_PI_E + P / 2. * lnDetSigT + K / 2. * log(detR_Y)

    # Distribution over mapping from features to topics
    diff = (A - Y.dot(V))
    bound -= (F * K) / 2. * LN_OF_2_PI
    bound -= F * lnDetSigT
    bound -= K * P * log(fv)
    bound -= 0.5 * np.sum(1. / lfv * isigT.dot(diff) * diff)
    bound -= 0.5 * K * np.trace(R_A)

    # And its entropy
    detR_A = safeDet(R_A, "R_A")
    bound += 0.5 * LN_OF_2_PI_E + F / 2. * lnDetSigT + K / 2. * log(detR_A)

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * lnDetSigT
    diff   = means - X.dot(A.T)
    bound -= 0.5 * np.sum(diff.dot(isigT) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(isigT)[np.newaxis, :])  # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.
    bound -= 0.5 * K * np.trace(XTX.dot(R_A))

    # And its entropy
    bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs))

    # Distribution over word-topic assignments
    # This also takes into account all the variables that
    # constitute the bound on log(sum_j exp(mean_j)) and
    # also incorporates the implicit entropy of Z_dvk
    bound -= np.sum((means * means + varcs) * docLens[:, np.newaxis] * lxi)
    bound += np.sum(means * 2 * docLens[:, np.newaxis] * s[:, np.newaxis] * lxi)
    bound += np.sum(means * -0.5 * docLens[:, np.newaxis])
    # The last term of line 1 gets cancelled out by part of the first term in line 2
    # so neither are included here.

    row_maxes = means.max(axis=1)
    means -= row_maxes[:, np.newaxis]
    expMeans = np.exp(means, out=means)
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeans, vocab).data)

    bound -= np.sum(docLens[:, np.newaxis] * lxi * ((s * s)[:, np.newaxis] - (xi * xi)))
    bound += np.sum(0.5 * docLens[:, np.newaxis] * (s[:, np.newaxis] + xi))
    # bound -= np.sum(docLens[:,np.newaxis] * safe_log_one_plus_exp_of(xi))
    bound -= scaledSumOfLnOnePlusExp(docLens, xi)

    bound -= np.dot(s, docLens)

    means = np.log(expMeans, out=expMeans)
    means += row_maxes[:, np.newaxis]

    return bound
def var_bound(data, modelState, queryState, XTX=None):
    '''
    Determines the variational bounds. Values are mutated in place, but
    are reset afterwards to their initial values. So it's safe to call
    in a serial manner.
    '''
    # Unpack the structs, for ease of access and efficiency
    W, X = data.words, data.feats
    D, _ = W.shape
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, Ab, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.Ab, modelState.dtype

    # Calculate some implicit variables
    isigT = la.inv(sigT)
    lnDetSigT = lnDetOfDiagMat(sigT)
    verifyProper(lnDetSigT, "lnDetSigT")

    if XTX is None:
        XTX = X.T.dot(X)

    bound = 0

    # Distribution over latent space
    bound -= (P * K) / 2. * LN_OF_2_PI
    bound -= P * lnDetSigT
    bound -= K * P * log(lfv)
    bound -= 0.5 * np.sum(1. / lfv * isigT.dot(Y) * Y)
    bound -= 0.5 * K * np.trace(R_Y)

    # And its entropy
    detR_Y = safeDet(R_Y, "R_Y")
    bound += 0.5 * LN_OF_2_PI_E + P / 2. * lnDetSigT + K / 2. * log(detR_Y)

    # Distribution over mapping from features to topics
    diff = (A - Y.dot(V))
    bound -= (F * K) / 2. * LN_OF_2_PI
    bound -= F * lnDetSigT
    bound -= K * P * log(fv)
    bound -= 0.5 * np.sum(1. / lfv * isigT.dot(diff) * diff)
    bound -= 0.5 * K * np.trace(R_A)

    # And its entropy
    detR_A = safeDet(R_A, "R_A")
    bound += 0.5 * LN_OF_2_PI_E + F / 2. * lnDetSigT + K / 2. * log(detR_A)

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * lnDetSigT
    diff = means - X.dot(A.T)
    bound -= 0.5 * np.sum(diff.dot(isigT) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(isigT)[np.newaxis, :])  # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.
    bound -= 0.5 * K * np.trace(XTX.dot(R_A))

    # And its entropy
    bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs))

    # Distribution over word-topic assignments, and their entropy,
    # and distribution over words. This is re-arranged as we need
    # means for some parts, and exp(means) for other parts
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)  # D x V   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    S = expMeans * (R.dot(vocab.T))                    # D x K

    bound += np.sum(docLens * np.log(np.sum(expMeans, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeans, vocab).data)

    bound += np.sum(means * S)
    bound += np.sum(2 * ssp.diags(docLens, 0) * means.dot(Ab) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:, np.newaxis] * S * (np.diag(Ab))[np.newaxis, :])

    bound -= np.sum(means * S)

    return bound
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bounds. Values are mutated in place, but
    are reset afterwards to their initial values. So it's safe to call
    in a serial manner.
    '''
    # Unpack the structs, for ease of access and efficiency
    W, L, X = data.words, data.links, data.feats
    D, _ = W.shape
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A

    # Calculate some implicit variables
    itopicCov = la.inv(topicCov)

    bound = 0

    expMeansOut = np.exp(means - means.max(axis=1)[:, np.newaxis])
    expMeansIn  = np.exp(means - means.max(axis=0)[np.newaxis, :])
    lse_at_k = expMeansIn.sum(axis=0)

    if USE_NIW_PRIOR:
        pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
        pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR

        # distribution over topic covariance
        bound -= 0.5 * K * pseudoObsVar * log(NIW_PSI)
        bound -= 0.5 * K * pseudoObsVar * log(2)
        bound -= fns.multigammaln(pseudoObsVar / 2., K)
        bound -= 0.5 * (pseudoObsVar + K - 1) * safe_log_det(topicCov)
        bound += 0.5 * NIW_PSI * np.trace(itopicCov)

        # and its entropy
        # is a constant which we skip

        # distribution over means
        bound -= 0.5 * K * log(1. / pseudoObsMeans) * safe_log_det(topicCov)
        bound -= 0.5 / pseudoObsMeans * (topicMean).T.dot(itopicCov).dot(topicMean)

        # and its entropy
        bound += 0.5 * safe_log_det(topicCov)  # + a constant

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * safe_log_det(topicCov)
    diff = means - topicMean[np.newaxis, :]
    bound -= 0.5 * np.sum(diff.dot(itopicCov) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(itopicCov)[np.newaxis, :])  # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.

    # And its entropy
    # bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs))

    # Distribution over word-topic assignments and words and the former's
    # entropy, and similarly for out-links. This is somewhat jumbled to
    # avoid repeatedly taking the exp and log of the means
    W_weights  = sparseScalarQuotientOfDot(W, expMeansOut, vocab)  # D x V   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    w_top_sums = expMeansOut * (W_weights.dot(vocab.T))            # D x K

    L_weights  = sparseScalarQuotientOfNormedDot(L, expMeansOut, expMeansIn, lse_at_k)
    l_top_sums = L_weights.dot(expMeansIn) / lse_at_k[np.newaxis, :] * expMeansOut

    bound += np.sum(docLens * np.log(np.sum(expMeansOut, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeansOut, vocab).data)

    # means = np.log(expMeans, out=expMeans)
    # means = safe_log(expMeansOut, out=means)

    bound += np.sum(means * w_top_sums)
    bound += np.sum(2 * ssp.diags(docLens, 0) * means.dot(A) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:, np.newaxis] * w_top_sums * (np.diag(A))[np.newaxis, :])

    bound -= np.sum(means * w_top_sums)

    return bound