def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
    data       - the dataset of words, features and links, of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. The model state is
    unchanged; the query state is updated in place.
    '''
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = \
        queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = \
        modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype

    debugFn = _debug_with_bound if debug else _debug_with_nothing
    W = data.words
    D = W.shape[0]

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)

    # Update the Variances
    varcs = 1. / ((n * (K - 1.) / K)[:, np.newaxis] + isigT.flat[::K + 1])
    debugFn(0, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    R = W.copy()
    for itr in range(iterations):
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        # Update the Means
        rhs = V.copy()
        rhs += n[:, np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + n[d] * A).dot(rhs[d, :])

        debugFn(itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState
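# A hedged aside on the exp-normalize idiom used in the loop above (and
# throughout this file): subtracting the row-wise max before exponentiating
# cannot overflow, yet leaves every ratio of the unnormalised weights, and
# hence the resulting softmax, unchanged. A minimal self-contained sketch:
def _sketch_exp_normalize():
    import numpy as np
    means = np.array([[1000.0, 1001.0], [-5.0, 3.0]])
    shifted = np.exp(means - means.max(axis=1)[:, np.newaxis])  # stays finite
    softmax = shifted / shifted.sum(axis=1)[:, np.newaxis]      # same as the naive softmax
    return softmax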
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
    data       - the dataset of words, features and links, of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. The model state is
    unchanged; the query state is updated in place.
    '''
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = \
        queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, varcs, n = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = \
        modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    debugFn = _debug_with_bound if debug else _debug_with_nothing
    W = data.words
    D = W.shape[0]

    # NOTE: expMeansIn and lse_at_k are computed but never used below; they
    # are retained from the original code. expMeansOut doubles as the output
    # buffer for the per-iteration re-exponentiation in the loop.
    expMeansOut = np.exp(means - means.max(axis=1)[:, np.newaxis])
    expMeansIn = np.exp(means - means.max(axis=0)[np.newaxis, :])
    lse_at_k = expMeansIn.sum(axis=0)

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    itopicCov = la.inv(topicCov)

    # Update the Variances
    varcs = 1. / ((n * (K - 1.) / K)[:, np.newaxis] + itopicCov.flat[::K + 1])
    debugFn(0, varcs, "varcs", W, K, topicMean, topicCov, vocab, dtype, means, varcs, A, n)

    R = W.copy()
    for itr in range(iterations):
        # Re-exponentiate the means each iteration so R and V reflect the
        # latest mean update rather than the values from before the loop
        expMeansOut = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeansOut)
        R = sparseScalarQuotientOfDot(W, expMeansOut, vocab, out=R)
        V = expMeansOut * R.dot(vocab.T)

        # Update the Means
        rhs = V.copy()
        rhs += n[:, np.newaxis] * means.dot(A) + itopicCov.dot(topicMean)
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(itopicCov + n[d] * A).dot(rhs[d, :])

        debugFn(itr, means, "means", W, K, topicMean, topicCov, vocab, dtype, means, varcs, A, n)

    return modelState, queryState
def testscaleProductOfQuotient(self):
    rd.seed(0xC0FFEE)
    D = 100
    T = 200
    K = 16

    W_d = np.floor(rd.random((D, T)) * 1.4)
    W_s = ssp.csr_matrix(W_d)
    topics = rd.random((D, K))
    vocab = rd.random((K, T))

    expected = W_d / topics.dot(vocab)
    received = sparseScalarQuotientOfDot(W_s, topics, vocab)

    diff = np.asarray(expected - received.todense())
    trNorm = np.sum(diff * diff)
    print(str(trNorm))
    print(str(diff))
    # Actually assert, rather than just printing, so the test can fail
    self.assertAlmostEqual(0.0, trNorm, places=5)
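# A hedged, dense-equivalent reference for the operation the test above
# checks. The project's sparseScalarQuotientOfDot is presumably an optimised
# implementation of exactly this: the element-wise quotient
# W / (topics @ vocab), evaluated only at W's stored non-zeros so the result
# keeps W's sparsity pattern. This sketch is illustrative, not the real code.
def _sparse_scalar_quotient_of_dot_ref(W, topics, vocab):
    import numpy as np
    import scipy.sparse as ssp
    W = ssp.csr_matrix(W).astype(np.float64)
    out = W.copy()
    recon = topics.dot(vocab)  # D x T dense reconstruction of the doc-term matrix
    for d in range(W.shape[0]):
        lo, hi = W.indptr[d], W.indptr[d + 1]
        out.data[lo:hi] /= recon[d, W.indices[lo:hi]]
    return out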
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    W          - the DxT document-term matrix
    X          - the DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are updated
    in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W = data.words
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = \
        modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill(NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis=0) / (D + pseudoObsMeans) \
            if USE_NIW_PRIOR \
            else means.mean(axis=0)
        debugFn(itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis, :]
            sigT = diff.T.dot(diff) \
                + pseudoObsVar * np.outer(topicMean, topicMean)
            sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            sigT /= (D + pseudoObsVar - K)
        else:
            sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype)
            sigT += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1. / diag)
        else:
            isigT = la.inv(sigT)

        # FIXME debug override, now disabled: it replaced the learned
        # covariance with the identity on every iteration
        # sigT  = np.eye(K)
        # isigT = la.inv(sigT)

        debugFn(itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        # print("     sigT.det = " + str(la.det(sigT)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        # Reset the means to their original form, and log effect of vocab update
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)
        debugFn(itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + isigT)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (K - 1.) / K + np.diagonal(isigT))
        debugFn(itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        # Update the Means
        rhs = V.copy()
        rhs += docLens[:, np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :])

        # means -= (means[:,0])[:,np.newaxis]

        debugFn(itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                        and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
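# A hedged sketch of the perplexity check used for early stopping above,
# assuming the usual definition perplexity = exp(-log_likelihood / N); the
# project's perplexity_from_like presumably computes the same quantity.
def _perplexity_from_like_sketch(log_likelihood, word_count):
    import numpy as np
    # exp of the negative per-token log-likelihood
    return np.exp(-log_likelihood / word_count)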
def varBound(modelState, queryState, X, W, lnVocab=None,
             XAT=None, XTX=None, scaledWordCounts=None, UTU=None, VTV=None):
    '''
    For the current state of the model and the query, and the given inputs,
    computes the variational lower-bound.

    Params
    modelState - the state of the model currently
    queryState - the state of the query currently
    X          - the DxF matrix of features we're querying on, where D is the number of documents
    W          - the DxT matrix of words ("terms") we're querying on
    Z          - if this has already been calculated, it can be passed in. If not, we
                 recalculate it from the model and query states. Z is the DxKxT tensor which
                 for each document D and term T gives the proportion of those terms assigned
                 to topic K
    vocab      - the KxV matrix of the vocabulary distribution
    XAT        - DxK dot product of XA', recalculated if not provided, where X is DxF and A' is FxK
    XTX        - dot product of X-transpose and X, recalculated if not provided.
    UTU        - as above for U
    VTV        - as above for V

    Returns
    The (positive) variational lower bound
    '''
    # Unpack the model and query state tuples for ease of use and maybe speed improvements
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = \
        modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T, modelState.A, modelState.varA, \
        modelState.Y, modelState.omY, modelState.sigY, modelState.sigT, modelState.U, modelState.V, \
        modelState.vocab, modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar
    (expLmda, nu, lxi, s, docLen) = \
        (queryState.expLmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen)

    lmda = np.log(expLmda)

    # Get the number of samples from the shape. Ensure that the shapes are consistent
    # with the model parameters.
    (D, Tcheck) = W.shape
    if Tcheck != T:
        raise ValueError("The shape of the DxT document matrix W is invalid, T is %d but the matrix W has shape (%d, %d)" % (T, D, Tcheck))

    (Dcheck, Fcheck) = X.shape
    if Dcheck != D:
        raise ValueError("Inconsistent sizes between the matrices X and W, X has %d rows but W has %d" % (Dcheck, D))
    if Fcheck != F:
        raise ValueError("The shape of the DxF feature matrix X is invalid. F is %d but the matrix X has shape (%d, %d)" % (F, Dcheck, Fcheck))

    # We'll need the original xi for this and also Z, the 3D tensor of which for each document D
    # and term T gives the strength of topic K. We'll also need the log of the vocab dist
    xi = deriveXi(lmda, nu, s)

    # If not already provided, we'll also need the following products
    if XAT is None:
        XAT = X.dot(A.T)
    if XTX is None:
        XTX = X.T.dot(X)
    if V is not None and VTV is None:
        VTV = V.T.dot(V)
    if U is not None and UTU is None:
        UTU = U.T.dot(U)

    # also need one over the usual variances
    overSsq, overAsq, overKsq, overTsq = 1. / sigmaSq, 1. / alphaSq, 1. / kappaSq, 1. / tauSq
    overTkSq = overTsq * overKsq
    overAsSq = overAsq * overSsq

    # <ln p(Y)>
    #
    trSigY = 1 if sigY is None else np.trace(sigY)
    trOmY = 1 if omY is None else np.trace(omY)
    lnP_Y = -0.5 * (Q * P * LOG_2PI + overTkSq * trSigY * trOmY + overTkSq * np.trace(Y.dot(Y.T)))

    # <ln P(A|Y)>
    #
    # TODO it looks like I should take the trace of omA \otimes I_K here.
    # TODO Need to check re-arranging sigY and omY is sensible.
    halfKF = 0.5 * K * F

    # Horrible, but varBound can be called by two implementations, one with Y as a matrix-variate
    # where sigY is QxQ, and one with Y as a multi-variate, where sigY is QPxQP.
    A_from_Y = Y.dot(U.T) if V is None else U.dot(Y).dot(V.T)
    A_diff = A - A_from_Y
    varFactorU = np.trace(sigY.dot(np.kron(VTV, UTU))) if sigY.shape[0] == Q * P else np.sum(sigY * UTU)
    varFactorV = 1 if V is None \
        else np.sum(omY * V.T.dot(V))
    lnP_A = -halfKF * LOG_2PI - halfKF * log(alphaSq) - halfKF * log(sigmaSq) \
            - 0.5 * (overAsSq * varFactorV * varFactorU
                     + np.trace(XTX.dot(varA)) * K
                     + np.sum(np.square(A_diff)))

    # <ln p(Theta|A,X)>
    #
    lmdaDiff = lmda - XAT
    lnP_Theta = -0.5 * D * LOG_2PI - 0.5 * D * K * log(sigmaSq) \
                - 0.5 / sigmaSq * (
                    np.sum(nu) + D * K * np.sum(XTX * varA) + np.sum(np.square(lmdaDiff)))
    # Why is the order of sigT reversed? It's because we've not been consistent. A is KxF but
    # lmda is DxK, and note that the distribution of lmda transpose has the same covariances,
    # just in different positions (i.e. row is col and vice-versa)

    # <ln p(Z|Theta)>
    #
    docLenLmdaLxi = docLen[:, np.newaxis] * lmda * lxi
    scaledWordCounts = sparseScalarQuotientOfDot(W, expLmda, vocab)

    lnP_Z = 0.0
    lnP_Z -= np.sum(docLenLmdaLxi * lmda)
    lnP_Z -= np.sum(docLen[:, np.newaxis] * nu * nu * lxi)
    lnP_Z += 2 * np.sum(s[:, np.newaxis] * docLenLmdaLxi)
    lnP_Z -= 0.5 * np.sum(docLen[:, np.newaxis] * lmda)
    lnP_Z += np.sum(lmda * expLmda * (scaledWordCounts.dot(vocab.T)))  # n(d,k) = expLmda * (scaledWordCounts.dot(vocab.T))
    lnP_Z -= np.sum(docLen[:, np.newaxis] * lxi * ((s ** 2)[:, np.newaxis] - xi ** 2))
    lnP_Z += 0.5 * np.sum(docLen[:, np.newaxis] * (s[:, np.newaxis] + xi))
    lnP_Z -= np.sum(docLen[:, np.newaxis] * safe_log_one_plus_exp_of(xi))
    lnP_Z -= np.sum(docLen * s)

    # <ln p(W|Z, vocab)>
    #
    lnP_w_dt = sparseScalarProductOfDot(scaledWordCounts, expLmda, vocab * safe_log(vocab))
    lnP_W = np.sum(lnP_w_dt.data)

    # H[q(Y)]
    lnDetOmY = 0 if omY is None else safe_log_det(omY)
    lnDetSigY = 0 if sigY is None else safe_log_det(sigY)
    ent_Y = 0.5 * (P * K * LOG_2PI_E + Q * lnDetOmY + P * lnDetSigY)

    # H[q(A|Y)]
    #
    # A few things - omA is fixed so long as tau and sigma are, so there's no benefit in
    # recalculating this every time.
    #
    # However in a recent test, la.det(omA) = 0
    # this is very strange as omA is the inverse of (s*I + t*XTX)
    ent_A = 0.5 * (F * K * LOG_2PI_E + K * safe_log_det(varA) + F * K * log(tauSq))

    # H[q(Theta|A)]
    ent_Theta = 0.5 * (K * LOG_2PI_E + np.sum(np.log(nu * nu)))

    # H[q(Z|Theta)]
    #
    # So Z_dtk \propto expLmda_dt * vocab_tk. We let N here be the normalizer (which is
    # \sum_j expLmda_dj * vocab_tj, which implies N is DxT). We need to evaluate
    # Z_dtk * log Z_dtk. We can pull out the normalizer of the first term, but it has
    # to stay in the log Z_dtk expression, hence the third term in the sum. We can however
    # take advantage of the ability to mix dot and element-wise products for the different
    # components of Z_dtk in that three-term sum, which we denote as S.
    # Finally we use np.sum to sum over d and t
    ent_Z = 0  # entropyOfDot(expLmda, vocab)

    result = lnP_Y + lnP_A + lnP_Theta + lnP_Z + lnP_W + ent_Y + ent_A + ent_Theta + ent_Z

    return result
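# A hedged sketch of a safe log-determinant in the spirit of the
# safe_log_det calls above (relevant given the la.det(omA) = 0 issue noted
# in the comments): slogdet is far less prone to under/overflow than det,
# and a floor handles numerically singular inputs. The project's own helper
# may clamp differently; this is only an assumption.
def _safe_log_det_sketch(M, floor=1e-300):
    import numpy as np
    sign, logdet = np.linalg.slogdet(M)
    return logdet if sign > 0 else np.log(floor)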
def query(modelState, X, W, plan, queryState=None, scaledWordCounts=None, XAT=None):
    '''
    Determines the most likely topic memberships for the given documents as
    described by their feature and word matrices X and W. All elements of
    the model are kept fixed. The query state, if provided, will be mutated
    in-place, so one should make a defensive copy if this behaviour is
    undesirable.

    Parameters
    modelState       - the model used to assign topics to documents. This is kept fixed
    X                - the DxF matrix of feature-vectors associated with the documents
    W                - the DxT matrix of word-count vectors representing the documents
    plan             - how to execute the query
    queryState       - the query-state object, with initial topic assignments. The members
                       of this are directly mutated.
    scaledWordCounts - a DxT matrix with the same number of non-zero entries as W.
                       This is overwritten.
    XAT              - the product of X.dot(modelState.A.T)

    Returns
    The original query state, with the mutated in-place matrices
    '''
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = \
        modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T, modelState.A, modelState.varA, \
        modelState.Y, modelState.omY, modelState.sigY, modelState.sigT, modelState.U, modelState.V, \
        modelState.vocab, modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar
    iterations, epsilon, logCount, plot, plotFile, plotIncremental, fastButInaccurate = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.plot, plan.plotFile, plan.plotIncremental, plan.fastButInaccurate
    if queryState is None:
        queryState = newVbQueryState(W, K)
    expLmda, nu, lxi, s, docLen = \
        queryState.expLmda, queryState.nu, queryState.lxi, queryState.s, queryState.docLen

    overTsq, overSsq, overAsq, overKsq = 1. / tauSq, 1. / sigmaSq, 1. / alphaSq, 1. / kappaSq

    if W.dtype.kind == 'i':  # for the sparseScalarQuotientOfDot() method to work
        W = W.astype(DTYPE)

    if scaledWordCounts is None:
        scaledWordCounts = W.copy()
    if XAT is None:
        XAT = X.dot(A.T)

    # Set up a method to check at every update if we're going in the right
    # direction
    verify_and_log = _quickPrintElbo if DEBUG else _doNothing

    for iteration in range(iterations):
        # sc = W / lmda.dot(vocab)
        scaledWordCounts = sparseScalarQuotientOfDot(W, expLmda, vocab, out=scaledWordCounts)

        # expLmdaCopy = expLmda.copy()
        rho = 2 * s[:, np.newaxis] * lxi - 0.5 \
            + expLmda * (scaledWordCounts.dot(vocab.T)) / docLen[:, np.newaxis]
        rhs = docLen[:, np.newaxis] * rho + overSsq * XAT
        expLmda[:] = rhs / (docLen[:, np.newaxis] * 2 * lxi + overSsq)
        # Note we haven't applied np.exp() yet, we're holding off till we've evaluated the next few terms
        verify_and_log("E-Step: q(Theta) [Mean]", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, expLmda, None, nu, lxi, s, docLen)

        # xi_dk
        #
        lxi[:] = negJakkolaOfDerivedXi(expLmda, nu, s)
        verify_and_log("E-Step: A(xi_dk)", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, expLmda, None, nu, lxi, s, docLen)

        # s_d
        #
        s[:] = (K / 4. - 0.5 + (lxi * expLmda).sum(axis=1)) / lxi.sum(axis=1)
        verify_and_log("E-Step: s_d", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, expLmda, None, nu, lxi, s, docLen)

        # nu_dk
        #
        nu[:] = 1. / np.sqrt(2. * docLen[:, np.newaxis] * lxi + overSsq)
        verify_and_log("E-Step: q(Theta) [Var] ", iteration, X, W, K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, expLmda, None, nu, lxi, s, docLen)

    # Now finally we finish off the estimate of exp(lmda)
    np.exp(expLmda, out=expLmda)

    return VbSideTopicQueryState(expLmda, nu, lxi, s, docLen)
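# A hedged sketch of the two helpers this E-step leans on, written from the
# textbook form of Jaakkola's bound as used in Bouchard's log-sum-exp
# approximation. The project's negJakkolaOfDerivedXi/deriveXi may differ in
# sign or scaling conventions; treat these as assumptions, not the real code.
def _neg_jaakkola(xi):
    import numpy as np
    # lambda(xi) = (sigmoid(xi) - 1/2) / (2 xi)
    return (1. / (1. + np.exp(-xi)) - 0.5) / (2. * xi)

def _derive_xi(lmda, nu, s):
    import numpy as np
    # xi_dk^2 = E[(theta_dk - s_d)^2] = (lmda_dk - s_d)^2 + nu_dk^2
    return np.sqrt((lmda - s[:, np.newaxis]) ** 2 + nu ** 2)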
def train(modelState, X, W, plan):
    '''
    Creates a new query state object for a topic model based on side-information.
    This contains all those estimated parameters that are specific to the actual
    data being queried - this must be used in conjunction with a model state.

    The parameters are
    modelState - the model state with all the model parameters
    X          - the D x F matrix of side information vectors
    W          - the D x V matrix of word **count** vectors.

    This returns a tuple of new model-state and query-state. The latter object
    will contain X and W and also

    s      - A D-dimensional vector describing the offset in our bound on the
             true value of ln sum_k e^theta_dk
    lxi    - A DxK matrix used in the above bound, containing the negative
             Jakkola function applied to the quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of topics we've inferred (independent)
    '''
    # Unpack the model state tuple for ease of use and maybe speed improvements
    K, Q, F, P, T, A, _, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = \
        modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T, modelState.A, modelState.varA, \
        modelState.Y, modelState.omY, modelState.sigY, modelState.sigT, modelState.U, modelState.V, \
        modelState.vocab, modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar
    iterations, epsilon, logCount, plot, plotFile, plotIncremental, fastButInaccurate = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.plot, plan.plotFile, plan.plotIncremental, plan.fastButInaccurate
    queryPlan = newInferencePlan(1, epsilon, logFrequency=0, plot=False)

    if W.dtype.kind == 'i':  # for the sparseScalarQuotientOfDot() method to work
        W = W.astype(DTYPE)

    # Get ready to plot the evolution of the likelihood, with multiplicative updates (e.g. 1, 2, 4, 8, 16, 32, ...)
    if logCount > 0:
        multiStepSize = np.power(iterations, 1. / logCount)
        logIter = 1
        elbos = []
        likes = []
        iters = []
    else:
        logIter = iterations + 1
    lastVarBoundValue = -sys.float_info.max

    # We'll need the total word count per doc, and total count of docs
    docLen = np.squeeze(np.asarray(W.sum(axis=1)))  # Force to a one-dimensional array for np.newaxis trick to work
    D = len(docLen)

    print("Training %d topic model with %d x %d word-matrix W, %d x %d feature matrix X, and latent feature and topics spaces of size %d and %d respectively" % (K, D, T, D, F, P, Q))

    # No need to recompute this every time
    XTX = X.T.dot(X)

    # Identity matrices that occur
    I_P = ssp.eye(P, dtype=DTYPE)
    I_Q = ssp.eye(Q, dtype=DTYPE)
    I_QP = ssp.eye(Q * P, Q * P, dtype=DTYPE)

    # Assign initial values to the query parameters
    expLmda = np.exp(rd.random((D, K)).astype(DTYPE))
    nu = np.ones((D, K), DTYPE)
    s = np.zeros((D,), DTYPE)
    lxi = negJakkola(np.ones((D, K), DTYPE))

    # If we don't bother optimising either tau or sigma we can just do all this here once only
    overSsq = 1. / sigmaSq
    overAsq = 1. / alphaSq
    overKsq = 1. / kappaSq
    overTsq = 1. / tauSq
    varRatio = (alphaSq * sigmaSq) / (tauSq * kappaSq)

    # TODO the inverse being almost always dense means that it might
    # be faster to convert to dense and use the normal solver, despite
    # the size constraints.
    # varA = 1./K * sla.inv (overTsq * I_F + overSsq * XTX)
    print("Inverting gram matrix")
    aI_XTX = (overAsq * ssp.eye(F, dtype=DTYPE) + XTX).todense()
    omA = la.inv(aI_XTX)

    scaledWordCounts = W.copy()

    # Set up a method to check at every update if we're going in the right
    # direction
    verify_and_log = _quickPrintElbo if DEBUG else _doNothing

    print("Launching inference")
    for iteration in range(iterations):

        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|Y) q(Y) and q(Z)....
        #   Where lambda is the posterior mean of theta.
        # =============================================================

        # Y, sigY, omY
        #
        # If U'U is invertible, use the inverse to convert Y to a Sylvester eqn,
        # which has a much, much faster solver. Recall the update for Y is of the
        # form Y + AYB = C where A = U'U, B = V'V and C = U'AV
        #
        VTV = V.T.dot(V)
        UTU = U.T.dot(U)

        sigY = la.inv(overTsq * overKsq * I_Q + overAsq * overSsq * UTU)
        verify_and_log("E-Step: q(Y) [sigY]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        omY = la.inv(overTsq * overKsq * I_P + overAsq * overSsq * VTV)
        verify_and_log("E-Step: q(Y) [omY]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        try:
            invUTU = la.inv(UTU)
            Y = la.solve_sylvester(varRatio * invUTU, VTV, invUTU.dot(U.T).dot(A).dot(V))
        except np.linalg.LinAlgError:  # U'U seems to rapidly become singular (before 5 iters)
            if fastButInaccurate:
                invUTU = la.pinvh(UTU)  # Obviously unstable, inference stalls much earlier than the correct form
                Y = la.solve_sylvester(varRatio * invUTU, VTV, invUTU.dot(U.T).dot(A).dot(V))
            else:
                Y = np.reshape(la.solve(varRatio * I_QP + np.kron(VTV, UTU), vec(U.T.dot(A).dot(V))), (Q, P), 'F')

        verify_and_log("E-Step: q(Y) [Mean]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # A
        #
        # So it's normally A = (UYV' + L'X) omA with omA = inv(t*I_F + s*XTX)
        #   so A inv(omA)   = UYV' + L'X
        #   so inv(omA)' A' = VY'U' + X'L
        # at which point we can use a built-in solve
        #
        lmda = np.log(expLmda, out=expLmda)
        A = omA.dot(X.T.dot(lmda) + overAsq * V.dot(Y.T).dot(U.T)).T
        # A = la.solve(aI_XTX, X.T.dot(lmda) + overAsq * V.dot(Y.T).dot(U.T)).T
        np.exp(expLmda, out=expLmda)
        verify_and_log("E-Step: q(A)", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # lmda_dk, nu_dk, s_d, and xi_dk
        #
        XAT = X.dot(A.T)
        # query(VbSideTopicModelState(K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \
        #       X, W, \
        #       queryPlan, \
        #       VbSideTopicQueryState(expLmda, nu, lxi, s, docLen), \
        #       scaledWordCounts=scaledWordCounts, \
        #       XAT=XAT)

        # =============================================================
        # M-Step
        #   Parameters for the softmax bound: lxi and s
        #   The projection used for A: U and V
        #   The vocabulary : vocab
        #   The variances: tau, sigma
        # =============================================================

        # vocab
        #
        sparseScalarQuotientOfDot(W, expLmda, vocab, out=scaledWordCounts)
        factor = (scaledWordCounts.T.dot(expLmda)).T  # Gets materialized as a dense matrix...
        vocab *= factor
        normalizerows_ip(vocab)

        # A hack to work around the fact that we've got no prior, and thus no
        # pseudo counts, so some values will collapse to zero
        # vocab[vocab < sys.float_info.min] = sys.float_info.min

        verify_and_log("M-Step: vocab", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # U
        #
        U = A.dot(V).dot(Y.T).dot(la.inv(Y.dot(V.T).dot(V).dot(Y.T) + np.trace(omY.dot(V.T).dot(V)) * sigY))
        verify_and_log("M-Step: U", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # V
        #
        V = A.T.dot(U).dot(Y).dot(la.inv(Y.T.dot(U.T).dot(U).dot(Y) + np.trace(sigY.dot(U.T).dot(U)) * omY))
        verify_and_log("M-Step: V", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen)

        # =============================================================
        # Handle logging of variational bound, likelihood, etc.
        # =============================================================
        if iteration == logIter:
            modelState = VbSideTopicModelState(K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq)
            queryState = VbSideTopicQueryState(expLmda, nu, lxi, s, docLen)

            elbo = varBound(modelState, queryState, X, W, None, XAT, XTX)
            likely = log_likelihood(modelState, X, W, queryState)  # recons_error(modelState, X, W, queryState)

            elbos.append(elbo)
            iters.append(iteration)
            likes.append(likely)
            print("\nIteration %5d  ELBO %15f   Log-Likelihood %15f" % (iteration, elbo, likely))

            logIter = min(np.ceil(logIter * multiStepSize), iterations - 1)

            if elbo < lastVarBoundValue:
                sys.stderr.write('ELBO going in the wrong direction\n')
            elif abs(elbo - lastVarBoundValue) < epsilon:
                break

            lastVarBoundValue = elbo

            if plot and plotIncremental:
                plot_bound(plotFile + "-iter-" + str(iteration), np.array(iters), np.array(elbos), np.array(likes))
        else:
            print('.', end='')
            sys.stdout.flush()

    # Right before we end, plot the evolution of the bound and likelihood
    # if we've been asked to do so.
    if plot:
        plot_bound(plotFile, iters, elbos, likes)

    return VbSideTopicModelState(K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \
           VbSideTopicQueryState(expLmda, nu, lxi, s, docLen)
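# A small, self-contained check of the identity the q(Y) update above relies
# on: solving (a*I + kron(VTV, UTU)) vec(Y) = vec(C) is equivalent, when U'U
# is invertible, to the Sylvester equation a*inv(U'U) Y + Y V'V = inv(U'U) C,
# which scipy solves far faster. Shapes and values here are illustrative.
def _check_sylvester_equivalence():
    import numpy as np
    import scipy.linalg as la
    rng = np.random.default_rng(0)
    Q, P, a = 4, 3, 0.7
    M = rng.random((Q, Q)); UTU = M.dot(M.T) + Q * np.eye(Q)  # stands in for U'U
    N = rng.random((P, P)); VTV = N.dot(N.T) + P * np.eye(P)  # stands in for V'V
    C = rng.random((Q, P))

    # Kronecker route, with the Fortran-order vec used in the code above
    Y_kron = la.solve(a * np.eye(Q * P) + np.kron(VTV, UTU),
                      C.flatten('F')).reshape((Q, P), order='F')

    # Sylvester route: a*inv(U'U) Y + Y V'V = inv(U'U) C
    invUTU = la.inv(UTU)
    Y_syl = la.solve_sylvester(a * invUTU, VTV, invUTU.dot(C))
    assert np.allclose(Y_kron, Y_syl)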
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bounds. Values are mutated in place, but
    are reset afterwards to their initial values. So it's safe to call
    repeatedly.
    '''
    # Unpack the structs, for ease of access and efficiency
    W, X = data.words, data.feats
    D, T, F = W.shape[0], W.shape[1], X.shape[1]
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \
        modelState.K, modelState.A, modelState.U, modelState.Y, modelState.V, modelState.covA, modelState.tv, \
        modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)
    Log2Pi = log(2 * pi)

    bound = 0

    # U and V are parameters with no distribution
    #
    # Y has a normal distribution; its covariance is unfortunately an
    # expensive computation
    #
    P, Q = U.shape[1], V.shape[1]
    covY = np.eye(P * Q) * (lfv * ltv)
    covY += np.kron(V.T.dot(V), U.T.dot(U))
    covY = la.inv(covY, overwrite_a=True)

    # The expected likelihood of Y
    bound -= 0.5 * P * Q * Log2Pi
    bound -= 0.5 * P * Q * log(ltv * lfv)
    bound -= 0.5 / (lfv * ltv) * np.sum(Y * Y)   # 5x faster than np.trace(Y.dot(Y.T))
    bound -= 0.5 * np.trace(covY) * (lfv * ltv)  # the traces of the posterior+prior covariance products cancel out across likelihoods

    # The entropy of Y
    bound += 0.5 * P * Q * (Log2Pi + 1) + 0.5 * safe_log_det(covY)

    #
    # A has a normal distribution
    #
    F, K = A.shape[0], A.shape[1]
    diff = A - U.dot(Y).dot(V.T)
    diff *= diff

    # The expected likelihood of A
    bound -= 0.5 * K * F * Log2Pi
    bound -= 0.5 * K * F * log(tv * fv)
    bound -= 0.5 / (fv * tv) * np.sum(diff)

    # The entropy of A
    bound += 0.5 * F * K * (Log2Pi + 1) + 0.5 * K * safe_log_det(covA)

    #
    # Theta, the matrix of means, has a normal distribution. Its row-covariance is
    # diagonal (i.e. it's several independent multi-var normal distros). The posterior
    # is made up of D K-dimensional normals with diagonal covariances.
    #
    # We iterate through the topics in batches, to control memory use
    batchSize = min(BatchSize, D)
    batchCount = ceil(D / batchSize)
    feats = np.ndarray(shape=(batchSize, F), dtype=dtype)
    tops = np.ndarray(shape=(batchSize, K), dtype=dtype)
    trace = 0
    for b in range(0, batchCount):
        start = b * batchSize
        end = min(start + batchSize, D)
        batchSize = min(batchSize, end - start)

        feats[:batchSize, :] = X[start:end, :].toarray()
        np.dot(feats[:batchSize, :], A, out=tops[:batchSize, :])
        tops[:batchSize, :] -= means[start:end, :]
        tops[:batchSize, :] *= tops[:batchSize, :]
        trace += np.sum(tops[:batchSize, :])
    feats = None

    # The expected likelihood of the topic-assignments
    bound -= 0.5 * D * K * Log2Pi
    bound -= 0.5 * D * K * log(tv)
    bound -= 0.5 / tv * trace
    bound -= 0.5 * tv * np.sum(covA)  # this trace doesn't cancel as we
                                      # don't have a posterior on tv

    # The entropy of the topic-assignments
    bound += 0.5 * D * K * (Log2Pi + 1) + 0.5 * np.sum(covA)

    # Distribution over word-topic assignments, and words, and the former's
    # entropy. This is somewhat jumbled to avoid repeatedly taking the
    # exp and log of the means.
    # Again we batch this for safety
    batchSize = min(BatchSize, D)
    batchCount = ceil(D / batchSize)
    V = np.ndarray(shape=(batchSize, K), dtype=dtype)
    for b in range(0, batchCount):
        start = b * batchSize
        end = min(start + batchSize, D)
        batchSize = min(batchSize, end - start)

        meansBatch = means[start:end, :]
        docLensBatch = docLens[start:end]

        np.exp(meansBatch - meansBatch.max(axis=1)[:, np.newaxis], out=tops[:batchSize, :])
        expMeansBatch = tops[:batchSize, :]

        R = sparseScalarQuotientOfDot(W, expMeansBatch, vocab, start=start, end=end)  # BatchSize x V: [W / TB] is the quotient of the original over the reconstructed doc-term matrix
        V[:batchSize, :] = expMeansBatch * (R[:batchSize, :].dot(vocab.T))  # BatchSize x K
        VBatch = V[:batchSize, :]

        bound += np.sum(docLensBatch * np.log(np.sum(expMeansBatch, axis=1)))
        bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeansBatch, vocab, start=start, end=end).data)

        bound += np.sum(meansBatch * VBatch)
        bound += np.sum(2 * ssp.diags(docLensBatch, 0) * meansBatch.dot(H) * meansBatch)
        bound -= 2. * scaledSelfSoftDot(meansBatch, docLensBatch)
        bound -= 0.5 * np.sum(docLensBatch[:, np.newaxis] * VBatch * (np.diag(H))[np.newaxis, :])

        bound -= np.sum(meansBatch * VBatch)

    return bound
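# A hedged aside on the matrix H = 0.5 * (I - 11'/K) built above: it has the
# form of Bohning's fixed curvature matrix, which upper-bounds the softmax
# Hessian diag(p) - p p' for every probability vector p. A quick numeric
# check of that domination at a random point, under that reading of H:
def _check_bohning_curvature(K=8, seed=1):
    import numpy as np
    theta = np.random.default_rng(seed).normal(size=K)
    p = np.exp(theta - theta.max())
    p /= p.sum()
    H_true = np.diag(p) - np.outer(p, p)
    H_bohning = 0.5 * (np.eye(K) - np.ones((K, K)) / K)
    # H_bohning - H_true should be positive semi-definite
    return np.linalg.eigvalsh(H_bohning - H_true).min() > -1e-10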
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
    data       - the dataset of words, features and links, of which only words
                 and features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. The model state is
    unchanged; the query state is updated in place.
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, lxi, s, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, \
        modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)

    W, X = data.words, data.feats

    # Enable logging or not. If enabled, we need the inner product of the feat matrix
    if debug:
        XTX = X.T.dot(X)
        debugFn = _debug_with_bound
        _debug_with_bound.old_bound = 0
    else:
        XTX = None
        debugFn = _debug_with_nothing

    # Iterate over parameters
    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Estimate Z_dvk
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # Update the Means
        vMat = (2 * s[:, np.newaxis] * lxi - 0.5) * n[:, np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT)  # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis, :] + n[:, np.newaxis] * 2 * lxi)  # inverse of D diagonal matrices...
        means = lhsMat * rhsMat  # as LHS is a diagonal matrix for all d, it's equivalent
                                 # to doing a Hadamard product for all d
        debugFn(itr, means, "query-means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Variances
        varcs = 1. / (2 * n[:, np.newaxis] * lxi + isigT.flat[::K + 1])
        debugFn(itr, varcs, "query-varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the approximation parameters
        lxi = ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn(itr, lxi, "query-lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
        # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        # debugFn(itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, lxi, s, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, QueryState(means, expMeans, varcs, lxi, s, n)
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data       - the dataset of words, features and links, of which only words
                 and features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are updated
    in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    assert W.dtype == modelState.dtype
    assert X.dtype == modelState.dtype

    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, \
        modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Book-keeping for logs
    boundIters = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    likeValues = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0

    _debug_with_bound.old_bound = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()

    sigT_regularizer = 0.001

    aI_P = 1. / lfv * ssp.eye(P, dtype=dtype)
    tI_F = 1. / fv * ssp.eye(F, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    if ssp.issparse(R_A):
        R_A = R_A.todense()       # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F + 1] += 1. / fv  # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    s.fill(0)

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A - Y.dot(V))
        diff_m_xa = (means - X.dot(A.T))

        sigT = 1. / lfv * (Y.dot(Y.T))
        sigT += 1. / fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += diff_m_xa.T.dot(diff_m_xa)
        sigT.flat[::K + 1] += varcs.sum(axis=0)
        sigT /= (P + F + D)
        sigT.flat[::K + 1] += sigT_regularizer

        # Diagonalize it
        sigT = np.diag(sigT.flat[::K + 1])
        # and invert it.
        isigT = np.diag(np.reciprocal(sigT.flat[::K + 1]))
        debugFn(itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)

        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        # Reset the means to their original form, and log effect of vocab update
        debugFn(itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Finally update the parameter V
        V = la.inv(R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn(itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the distribution on the latent space
        R_Y_base = aI_P + 1 / fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn(itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        Y = 1. / fv * A.dot(V.T).dot(R_Y)
        debugFn(itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the mapping from the features to topics
        A = (1. / fv * (Y).dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn(itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Means
        vMat = (s[:, np.newaxis] * lxi - 0.5) * n[:, np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT)  # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis, :] + n[:, np.newaxis] * lxi)  # inverse of D diagonal matrices...
        means = lhsMat * rhsMat  # as LHS is a diagonal matrix for all d, it's equivalent
                                 # to doing a Hadamard product for all d
        debugFn(itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Variances
        varcs = 1. / (n[:, np.newaxis] * lxi + isigT.flat[::K + 1])
        debugFn(itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the approximation parameters
        lxi = 2 * ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn(itr, lxi, "lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
        # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        # debugFn(itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)

            boundValues[bvIdx] = var_bound(data, modelState, queryState, XTX)
            likeValues[bvIdx] = log_likelihood(data, modelState, queryState)
            boundIters[bvIdx] = itr
            perp = perplexity_from_like(likeValues[bvIdx], n.sum())
            print(time.strftime('%X') + " : Iteration %d: Perplexity %4.2f  bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
            # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 50:
                lastPerp = perplexity_from_like(likeValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likeValues = clamp(boundIters, boundValues, likeValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likeValues)
            bvIdx += 1

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likeValues)
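# An aside on the sigT.flat[::K+1] idiom that recurs above: for a
# C-contiguous K x K array, the flat view with stride K+1 is exactly the main
# diagonal, so the diagonal can be read or updated in place without the
# copies np.diag would make. A minimal demonstration:
def _demo_diag_stride(K=4):
    import numpy as np
    M = np.zeros((K, K))
    M.flat[::K + 1] += 2.0  # adds to the main diagonal in place
    return np.allclose(np.diag(M), 2.0)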
def var_bound(data, modelState, queryState, XTX=None):
    '''
    Determines the variational bounds. Values are mutated in place, but
    are reset afterwards to their initial values. So it's safe to call
    in a serial manner.
    '''
    # Unpack the structs, for ease of access and efficiency
    W, X = data.words, data.feats
    D, _ = W.shape
    means, expMeans, varcs, docLens = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, Ab, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, \
        modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.Ab, modelState.dtype

    # Calculate some implicit variables
    isigT = la.inv(sigT)
    lnDetSigT = lnDetOfDiagMat(sigT)
    verifyProper(lnDetSigT, "lnDetSigT")

    if XTX is None:
        XTX = X.T.dot(X)

    bound = 0

    # Distribution over latent space
    bound -= (P * K) / 2. * LN_OF_2_PI
    bound -= P * lnDetSigT
    bound -= K * P * log(lfv)
    bound -= 0.5 * np.sum(1. / lfv * isigT.dot(Y) * Y)
    bound -= 0.5 * K * np.trace(R_Y)

    # And its entropy
    detR_Y = safeDet(R_Y, "R_Y")
    bound += 0.5 * LN_OF_2_PI_E + P / 2. * lnDetSigT + K / 2. * log(detR_Y)

    # Distribution over mapping from features to topics
    diff = (A - Y.dot(V))
    bound -= (F * K) / 2. * LN_OF_2_PI
    bound -= F * lnDetSigT
    bound -= K * P * log(fv)
    bound -= 0.5 * np.sum(1. / lfv * isigT.dot(diff) * diff)
    bound -= 0.5 * K * np.trace(R_A)

    # And its entropy
    detR_A = safeDet(R_A, "R_A")
    bound += 0.5 * LN_OF_2_PI_E + F / 2. * lnDetSigT + K / 2. * log(detR_A)

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * lnDetSigT
    diff = means - X.dot(A.T)
    bound -= 0.5 * np.sum(diff.dot(isigT) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(isigT)[np.newaxis, :])  # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.
    bound -= 0.5 * K * np.trace(XTX.dot(R_A))

    # And its entropy
    bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs))

    # Distribution over word-topic assignments, and their entropy,
    # and distribution over words. This is re-arranged as we need
    # means for some parts, and exp(means) for other parts
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)  # D x V: [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    S = expMeans * (R.dot(vocab.T))                    # D x K

    bound += np.sum(docLens * np.log(np.sum(expMeans, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeans, vocab).data)

    bound += np.sum(means * S)
    bound += np.sum(2 * ssp.diags(docLens, 0) * means.dot(Ab) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:, np.newaxis] * S * (np.diag(Ab))[np.newaxis, :])

    bound -= np.sum(means * S)

    return bound
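# A hedged sketch of the lnDetOfDiagMat helper used above, assuming it simply
# sums the logs of the diagonal - valid here because training diagonalises
# sigT - since det(diag(d)) = prod(d) and hence log det = sum(log d).
def _ln_det_of_diag_mat_sketch(M):
    import numpy as np
    return np.sum(np.log(np.diag(M)))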
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
    data       - the dataset of words, features and links, of which only words
                 and features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. The model state is
    unchanged; the query state is updated in place.
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, \
        modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, \
        modelState.Ab, modelState.dtype

    # TODO Get rid of this via a command-line param
    iterations = max(iterations, 100)

    # Debugging
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # Necessary values
    isigT = la.inv(sigT)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Counts of topic assignments
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # the variance
        varcs[:] = 1. / ((n * (K - 1.) / K)[:, np.newaxis] + isigT.flat[::K + 1])
        debugFn(itr, varcs, "query-varcs", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)

        # Update the Means
        rhs = X.dot(A.T).dot(isigT)
        rhs += S
        rhs += n[:, np.newaxis] * means.dot(Ab)
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)

        # Long version, caching one inverse per unique document length
        inverses = dict()
        for d in range(D):
            if not n[d] in inverses:
                inverses[n[d]] = la.inv(isigT + n[d] * Ab)
            lhs = inverses[n[d]]
            means[d, :] = lhs.dot(rhs[d, :])
        debugFn(itr, means, "query-means", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState  # query vars altered in-place
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data       - the dataset of words, features and links, of which only words
                 and features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are updated
    in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, \
        modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, \
        modelState.Ab, modelState.dtype

    # Book-keeping for logs
    boundIters, boundValues, boundLikes = [], [], []
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG)  # sort needs to be stable in order to be reversible
    W = W[sortIdx, :]  # deep sorted copy
    X = X[sortIdx, :]
    means, varcs = means[sortIdx, :], varcs[sortIdx, :]
    docLens = originalDocLens[sortIdx]

    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])

    # Initialize some working variables
    R = W.copy()

    aI_P = 1. / lfv * ssp.eye(P, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    leastSquares = lambda feats, targets: la.lstsq(feats, targets, lapack_driver="gelsy")[0].T
    if ssp.issparse(R_A):              # dense inverse typically as fast or faster than sparse
        R_A = to_dense_array(R_A)      # inverse, and the result is usually dense in any case
        leastSquares = lambda feats, targets: np.array(
            [ssp.linalg.lsqr(feats, targets[:, k])[0] for k in range(K)])
    R_A.flat[::F + 1] += 1. / fv
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    priorSigt_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigt_diag.fill(0.001)

    # Iterate over parameters
    for itr in range(iterations):
        A = leastSquares(X, means)
        diff_a_yv = (A - Y.dot(V))

        for _ in range(10):  # (50 if itr == 0 else 1):
            # Update the covariance of the prior
            diff_m_xa = (means - X.dot(A.T))
            sigT = 1. / lfv * (Y.dot(Y.T))
            sigT += 1. / fv * diff_a_yv.dot(diff_a_yv.T)
            sigT += diff_m_xa.T.dot(diff_m_xa)
            sigT.flat[::K + 1] += varcs.sum(axis=0)

            # As small numbers lead to unstable inverse estimates, we use the
            # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1, and use these
            # scales whenever we use the inverse of the unscaled covariance
            sigScale = 1. / (P + D + F)
            isigScale = 1. / sigScale

            isigT = la.inv(sigT)
            debugFn(itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

            # Update the vocabulary
            vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
            vocab += vocabPrior
            vocab = normalizerows_ip(vocab)

            # Reset the means to their original form, and log effect of vocab update
            R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
            S = expMeans * R.dot(vocab.T)
            debugFn(itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

            # Update the Variances
            varcs = 1. / ((docLens * (K - 1.) / K)[:, np.newaxis] + isigScale * isigT.flat[::K + 1])
            debugFn(itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

            # Update the Means
            rhs = X.dot(A.T).dot(isigT) * isigScale
            rhs += S
            rhs += docLens[:, np.newaxis] * means.dot(Ab)
            rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)

            # Faster version: one inverse per unique document length
            for lenIdx in range(len(lens)):
                nd = lens[lenIdx]
                start, end = inds[lenIdx], inds[lenIdx + 1]
                lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale

                means[start:end, :] = rhs[start:end, :].dot(lhs)  # huh?! Left and right refer to eqn for a single mean:
                                                                  # once we're talking a DxK matrix it gets swapped

            # print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
            debugFn(itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

            expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)

        # for _ in range(150):
        #     # Finally update the parameter V
        #     V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        #     debugFn(itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)
        #
        #     # Update the distribution on the latent space
        #     R_Y_base = aI_P + 1 / fv * V.dot(V.T)
        #     R_Y = la.inv(R_Y_base)
        #     debugFn(itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype,
        #             means, varcs, Ab, docLens)
        #
        #     Y = 1. / fv * A.dot(V.T).dot(R_Y)
        #     debugFn(itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)
        #
        #     # Update the mapping from the features to topics
        #     A = (1. / fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        #     debugFn(itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues.append(var_bound(DataSet(W, feats=X), modelState, queryState, XTX))
            boundLikes.append(log_likelihood(DataSet(W, feats=X), modelState, queryState))
            boundIters.append(itr)
            perp = perplexity_from_like(boundLikes[-1], docLens.sum())
            print(time.strftime('%X') + " : Iteration %d: Perplexity %4.0f  bound %f" % (itr, perp, boundValues[-1]))
            if len(boundIters) >= 2 and boundValues[-2] > boundValues[-1]:
                printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))
            # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if len(boundIters) > 2 and boundIters[-1] > 20:
                lastPerp = perplexity_from_like(boundLikes[-2], docLens.sum())
                if lastPerp - perp < 1:
                    break

    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means = means[revert_sort, :]
    varcs = varcs[revert_sort, :]
    docLens = docLens[revert_sort]

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
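# A tiny demonstration, independent of the model, of the sort-and-revert
# device used above: the inverse of a permutation sortIdx is argsort(sortIdx),
# and the sort kind must be stable for the round trip to be exact when
# document lengths tie.
def _demo_stable_sort_revert():
    import numpy as np
    docLens = np.array([5, 2, 9, 2])
    sortIdx = np.argsort(docLens, kind='stable')
    revert_sort = np.argsort(sortIdx, kind='stable')
    return np.array_equal(docLens[sortIdx][revert_sort], docLens)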
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words
           and features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are updated
    in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    boundLikes  = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0

    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG)  # sort needs to be stable in order to be reversible
    W = W[sortIdx, :]  # deep sorted copy
    X = X[sortIdx, :]
    means, varcs = means[sortIdx, :], varcs[sortIdx, :]
    docLens = originalDocLens[sortIdx]

    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])

    # Initialize some working variables
    R = W.copy()

    aI_P = 1. / lfv * ssp.eye(P, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    R_A = R_A.todense()           # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F + 1] += 1. / fv  # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    diff_m_xa = (means - X.dot(A.T))
    means_cov_with_x_a = diff_m_xa.T.dot(diff_m_xa)

    expMeans = np.zeros((BatchSize, K), dtype=dtype)
    R = np.zeros((BatchSize, K), dtype=dtype)
    S = np.zeros((BatchSize, K), dtype=dtype)
    vocabScale = np.ones(vocab.shape, dtype=dtype)

    # Iterate over parameters
    batchIter = 0
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A - Y.dot(V))
        sigT  = 1. / lfv * (Y.dot(Y.T))
        sigT += 1. / fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += means_cov_with_x_a
        sigT.flat[::K + 1] += varcs.sum(axis=0)

        # As small numbers lead to unstable inverse estimates, we use the
        # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these
        # scales whenever we use the inverse of the unscaled covariance
        sigScale  = 1. / (P + D + F)
        isigScale = 1. / sigScale

        isigT = la.inv(sigT)
        debugFn(itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the vocabulary
        vocab *= vocabScale
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        debugFn(itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Finally update the parameter V
        V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn(itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        #
        # And now this is the E-Step
        #

        # Update the distribution on the latent space
        R_Y_base = aI_P + 1. / fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn(itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        Y = 1. / fv * A.dot(V.T).dot(R_Y)
        debugFn(itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the mapping from the features to topics
        A = (1. / fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn(itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the Variances
        varcs = 1. / ((docLens * (K - 1.) / K)[:, np.newaxis] + isigScale * isigT.flat[::K + 1])
        debugFn(itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the means, batching over documents grouped by length, so the
        # per-length inverse need only be computed once per group
        vocabScale[:, :] = 0
        means_cov_with_x_a[:, :] = 0
        for lenIdx in range(len(lens)):
            nd = lens[lenIdx]
            start, end = inds[lenIdx], inds[lenIdx + 1]
            lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale

            for d in range(start, end, BatchSize):
                end_d = min(d + BatchSize, end)
                span = end_d - d

                expMeans[:span, :] = np.exp(means[d:end_d, :] - means[d:end_d, :].max(axis=1)[:span, np.newaxis], out=expMeans[:span, :])
                R = sparseScalarQuotientOfDot(W[d:end_d, :], expMeans[:span, :], vocab)
                S[:span, :] = expMeans[:span, :] * R.dot(vocab.T)

                # Convert expMeans to a softmax(means)
                expMeans[:span, :] /= expMeans[:span, :].sum(axis=1)[:span, np.newaxis]

                mu = X[d:end_d, :].dot(A.T)
                rhs  = mu.dot(isigT) * isigScale
                rhs += S[:span, :]
                rhs += docLens[d:end_d, np.newaxis] * means[d:end_d, :].dot(Ab)
                rhs -= docLens[d:end_d, np.newaxis] * expMeans[:span, :]  # here expMeans is actually softmax(means)

                # "lhs" and "rhs" refer to the equation for a single mean: once
                # we work with a DxK matrix of means the order of the product swaps
                means[d:end_d, :] = rhs.dot(lhs)

                expMeans[:span, :] = np.exp(means[d:end_d, :] - means[d:end_d, :].max(axis=1)[:span, np.newaxis], out=expMeans[:span, :])
                R = sparseScalarQuotientOfDot(W[d:end_d, :], expMeans[:span, :], vocab, out=R)

                stepSize = (Tau + batchIter) ** -Kappa
                batchIter += 1

                # Do a gradient update of the vocab
                vocabScale += (R.T.dot(expMeans[:span, :])).T
                # vocabScale *= vocab
                # normalizerows_ip(vocabScale)
                # vocabScale += vocabPrior
                # vocabScale *= stepSize
                # vocab *= (1 - stepSize)
                # vocab += vocabScale

                diff = (means[d:end_d, :] - mu)
                means_cov_with_x_a += diff.T.dot(diff)

        # print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
        debugFn(itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues[bvIdx] = var_bound(DataSet(W, feats=X), modelState, queryState, XTX)
            boundLikes[bvIdx]  = log_likelihood(DataSet(W, feats=X), modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(boundLikes[bvIdx], docLens.sum())
            print(time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
            # print("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 20:
                lastPerp = perplexity_from_like(boundLikes[bvIdx - 1], docLens.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, boundLikes = clamp(boundIters, boundValues, boundLikes, bvIdx)
                    break
            bvIdx += 1

    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means = means[revert_sort, :]
    varcs = varcs[revert_sort, :]
    docLens = docLens[revert_sort]

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
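
# Illustrative sketch, not part of the model: the E-Step above sorts documents
# by length so that the K x K inverse, which depends only on the length nd, is
# computed once per unique length rather than once per document. A toy version
# of that grouping, under the same "docLens already ascending" assumption
# (the function name is hypothetical):
def _demo_shared_inverse_by_length():
    import numpy as np
    import numpy.random as rd
    import scipy.linalg as la

    D, K = 10, 4
    isigT = np.eye(K)                            # stand-ins for the real precision
    Ab = 0.5 * np.eye(K)                         # and update matrices
    docLens = np.sort(rd.randint(1, 4, size=D))  # sorted ascending, as in train()
    rhs = rd.random((D, K))
    means = np.empty((D, K))

    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [D])

    for lenIdx, nd in enumerate(lens):
        start, end = inds[lenIdx], inds[lenIdx + 1]
        lhs = la.inv(isigT + nd * Ab)                     # one inverse per length...
        means[start:end, :] = rhs[start:end, :].dot(lhs)  # ...shared by all docs of that length
    return means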
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bounds. Values are mutated in place, but
    are reset afterwards to their initial values. So it's safe to call in
    a serial manner.
    '''

    # Unpack the structs, for ease of access and efficiency
    W, L, X = data.words, data.links, data.feats
    D, _ = W.shape
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A

    # Calculate some implicit variables
    itopicCov = la.inv(topicCov)

    bound = 0

    expMeansOut = np.exp(means - means.max(axis=1)[:, np.newaxis])
    expMeansIn  = np.exp(means - means.max(axis=0)[np.newaxis, :])
    lse_at_k    = expMeansIn.sum(axis=0)

    if USE_NIW_PRIOR:
        pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
        pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR

        # distribution over topic covariance
        bound -= 0.5 * K * pseudoObsVar * log(NIW_PSI)
        bound -= 0.5 * K * pseudoObsVar * log(2)
        bound -= fns.multigammaln(pseudoObsVar / 2., K)
        bound -= 0.5 * (pseudoObsVar + K - 1) * safe_log_det(topicCov)
        bound += 0.5 * NIW_PSI * np.trace(itopicCov)

        # and its entropy
        # is a constant which we skip

        # distribution over means
        bound -= 0.5 * K * log(1. / pseudoObsMeans) * safe_log_det(topicCov)
        bound -= 0.5 / pseudoObsMeans * (topicMean).T.dot(itopicCov).dot(topicMean)

        # and its entropy
        bound += 0.5 * safe_log_det(topicCov)  # + a constant

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * safe_log_det(topicCov)  # -D/2 ln |Sigma|
    diff = means - topicMean[np.newaxis, :]
    bound -= 0.5 * np.sum(diff.dot(itopicCov) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(itopicCov)[np.newaxis, :])  # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.

    # And its entropy
    # bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs))

    # Distribution over word-topic assignments and words and the former's
    # entropy, and similarly for out-links. This is somewhat jumbled to
    # avoid repeatedly taking the exp and log of the means
    W_weights = sparseScalarQuotientOfDot(W, expMeansOut, vocab)  # D x V   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    w_top_sums = expMeansOut * (W_weights.dot(vocab.T))           # D x K

    L_weights = sparseScalarQuotientOfNormedDot(L, expMeansOut, expMeansIn, lse_at_k)
    l_top_sums = L_weights.dot(expMeansIn) / lse_at_k[np.newaxis, :] * expMeansOut

    bound += np.sum(docLens * np.log(np.sum(expMeansOut, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeansOut, vocab).data)

    # means = np.log(expMeans, out=expMeans)
    # means = safe_log(expMeansOut, out=means)

    bound += np.sum(means * w_top_sums)
    bound += np.sum(2 * ssp.diags(docLens, 0) * means.dot(A) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:, np.newaxis] * w_top_sums * (np.diag(A))[np.newaxis, :])

    bound -= np.sum(means * w_top_sums)

    return bound
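
# Illustrative sketch, not part of the model: every expMeansOut / expMeansRow
# computation above subtracts the row (or column) maximum before exponentiating.
# Softmax is invariant to a constant per-row shift, so this avoids overflow
# without changing the result. The function name is hypothetical.
def _demo_stable_softmax():
    import numpy as np

    means = np.array([[1000.0, 1001.0],
                      [   3.0,    4.0]])

    shifted = np.exp(means - means.max(axis=1)[:, np.newaxis])  # stays finite
    softmax = shifted / shifted.sum(axis=1)[:, np.newaxis]      # rows sum to one

    # The naive version, np.exp(means), overflows to inf in the first row.
    return softmax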
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset, providing the DxT document-term matrix W and the
           DxF document-feature matrix X
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are updated
    in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W, X = data.words, data.feats
    D, T = W.shape
    F = X.shape[1]

    # tmpNumDense = np.array([
    #     4,  8,  2, 0,  0,
    #     0,  6,  0, 17, 0,
    #     12, 13, 1, 7,  8,
    #     0,  5,  0, 0,  0,
    #     0,  6,  0, 0,  44,
    #     0,  7,  2, 0,  0], dtype=np.float64).reshape((6, 5))
    # tmpNum = ssp.csr_matrix(tmpNumDense)
    #
    # tmpDenomleft  = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10
    # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64)
    #
    # tmpResult = tmpNum.copy()
    # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomleft, tmpDenomRight)
    #
    # print(str(tmpNum.todense()))
    # print(str(tmpDenomleft.dot(tmpDenomRight)))
    # print(str(tmpResult.todense()))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \
        modelState.K, modelState.A, modelState.U, modelState.Y, modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype

    tp, fp, ltp, lfp = 1. / tv, 1. / fv, 1. / ltv, 1. / lfv  # turn variances into precisions

    # FIXME Use passed-in hypers
    print("tp = %f tv=%f" % (tp, tv))
    vocabPrior = np.ones(shape=(T,), dtype=modelState.dtype)

    # FIXME undo truncation
    F = 363
    A = A[:F, :]
    X = X[:, :F]
    U = U[:F, :]
    data = DataSet(words=W, feats=X)

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    if covA is None:
        precA = (fp * ssp.eye(F) + X.T.dot(X)).todense()  # As the inverse is almost always dense
        covA = la.inv(precA, overwrite_a=True)            # it's faster to densify in advance
    uniqLens = np.unique(docLens)

    debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)

    expMeans = means.copy()
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy())

    lhs   = H.copy()
    rhs   = expMeans.copy()
    Y_rhs = Y.copy()

    # Iterate over parameters
    for itr in range(iterations):

        # Update U, V given A
        V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y), A.T.dot(U).dot(Y).T).T
        V /= V[0, 0]
        U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T), A.dot(V).dot(Y.T).T).T

        # Update Y given U, V, A
        Y_rhs[:, :] = U.T.dot(A).dot(V)

        Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True)
        Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True)

        s = np.outer(Sv, Su).flatten()
        s += ltv * lfv
        np.reciprocal(s, out=s)

        M = Uu.T.dot(Y_rhs).dot(Uv)
        M *= unvec(s, row_count=M.shape[0])

        Y = Uu.dot(M).dot(Uv.T)
        debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means))
        debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # TODO One big sort by size, plus batch it.

        # Update the Means
        rhs[:, :] = expMeans
        rhs *= R.dot(vocab.T)
        rhs += X.dot(A) * tp
        rhs += docLens[:, np.newaxis] * means.dot(H)
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
        for l in uniqLens:
            inds = np.where(docLens == l)[0]
            lhs[:, :] = l * H
            lhs[np.diag_indices_from(lhs)] += tp
            lhs[:, :] = la.inv(lhs)
            means[inds, :] = rhs[inds, :].dot(lhs)  # left and right got switched going from vectors to matrices :-/
        debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # Estimate the standard deviation from the means (currently disabled)
        # DK = means.shape[0] * means.shape[1]
        # newTp = np.sum(means)
        # newTp = (-newTp * newTp)
        # rhs[:,:] = means
        # rhs *= means
        # newTp = DK * np.sum(rhs) - newTp
        # newTp /= DK * (DK - 1)
        # newTp = min(max(newTp, 1E-36), 1E+36)
        # tp = 1 / newTp
        # if itr % logFrequency == 0:
        #     print("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,)))))

        # Update the vocabulary
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)
        # print("Iter %3d Vocab.min = %f" % (itr, vocab.min()))

        # Update the vocab prior (currently disabled)
        # vocabPrior = estimate_dirichlet_param(vocab, vocabPrior)
        # print("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean()))

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name)
            queryState = QueryState(means, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                        and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name), \
        QueryState(means, expMeans, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
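
# Hedged sketch of what sparseScalarQuotientOfDot is assumed to compute (the
# real helper is a compiled routine defined elsewhere in this codebase): the
# element-wise quotient W_dt / (expMeans.dot(vocab))_dt, evaluated only at the
# non-zero entries of W so the result stays as sparse as W itself. The inner
# function name is hypothetical.
def _demo_sparse_scalar_quotient_of_dot():
    import numpy as np
    import scipy.sparse as ssp

    def quotient(W, expMeans, vocab):
        W = W.tocoo()
        recon = expMeans[W.row, :] * vocab[:, W.col].T  # per-entry dot products, nnz x K
        data = W.data / recon.sum(axis=1)               # W_dt / (expMeans.dot(vocab))_dt
        return ssp.csr_matrix((data, (W.row, W.col)), shape=W.shape)

    W = ssp.csr_matrix(np.array([[2., 0.], [0., 3.]]))
    expMeans = np.array([[1., 1.], [2., 0.5]])
    vocab = np.array([[0.5, 0.5], [0.25, 0.75]])
    return quotient(W, expMeans, vocab).todense()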
def var_bound(data, modelState, queryState):
    '''
    Determines the variational bounds. Values are mutated in place, but
    are reset afterwards to their initial values. So it's safe to call in
    a serial manner.
    '''

    # Unpack the structs, for ease of access and efficiency
    W = data.words
    D, _ = W.shape
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A

    # Calculate some implicit variables
    isigT = la.inv(sigT)

    bound = 0

    if USE_NIW_PRIOR:
        pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
        pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR

        # distribution over topic covariance
        bound -= 0.5 * K * pseudoObsVar * log(NIW_PSI)
        bound -= 0.5 * K * pseudoObsVar * log(2)
        bound -= fns.multigammaln(pseudoObsVar / 2., K)
        bound -= 0.5 * (pseudoObsVar + K - 1) * safe_log_det(sigT)
        bound += 0.5 * NIW_PSI * np.trace(isigT)

        # and its entropy
        # is a constant which we skip

        # distribution over means
        bound -= 0.5 * K * log(1. / pseudoObsMeans) * safe_log_det(sigT)
        bound -= 0.5 / pseudoObsMeans * (topicMean).T.dot(isigT).dot(topicMean)

        # and its entropy
        bound += 0.5 * safe_log_det(sigT)  # + a constant

    # Distribution over document topics
    bound -= (D * K) / 2. * LN_OF_2_PI
    bound -= D / 2. * safe_log_det(sigT)  # -D/2 ln |Sigma|
    diff = means - topicMean[np.newaxis, :]
    bound -= 0.5 * np.sum(diff.dot(isigT) * diff)
    bound -= 0.5 * np.sum(varcs * np.diag(isigT)[np.newaxis, :])  # = -0.5 * sum_d tr(V_d \Sigma^{-1}) when V_d is diagonal only.

    # And its entropy
    # bound += 0.5 * D * K * LN_OF_2_PI_E + 0.5 * np.sum(np.log(varcs))

    # Distribution over word-topic assignments and words and the former's
    # entropy. This is somewhat jumbled to avoid repeatedly taking the
    # exp and log of the means
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)  # D x V   [W / TB] is the quotient of the original over the reconstructed doc-term matrix
    V = expMeans * (R.dot(vocab.T))                    # D x K

    bound += np.sum(docLens * np.log(np.sum(expMeans, axis=1)))
    bound += np.sum(sparseScalarProductOfSafeLnDot(W, expMeans, vocab).data)

    bound += np.sum(means * V)
    bound += np.sum(2 * ssp.diags(docLens, 0) * means.dot(A) * means)
    bound -= 2. * scaledSelfSoftDot(means, docLens)
    bound -= 0.5 * np.sum(docLens[:, np.newaxis] * V * (np.diag(A))[np.newaxis, :])

    bound -= np.sum(means * V)

    return bound
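
# Hedged sketch of the perplexity_from_like helper used throughout the logging
# code, assuming the standard definition: the exponential of the negative
# log-likelihood per word. The function here is a stand-in for illustration.
def _demo_perplexity_from_like():
    import numpy as np

    def perplexity_from_like(log_like, word_count):
        return np.exp(-log_like / word_count)

    # e.g. a log-likelihood of -250,000 nats over 50,000 tokens:
    return perplexity_from_like(-250000.0, 50000)  # exp(5), roughly 148.4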
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset, providing the DxT document-term matrix and the
           document-link matrix; the document features are IGNORED in
           this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are updated
    in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D, _ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound
        initLikely = log_likelihood(data, modelState, queryState)
        initPerp = perplexity_from_like(initLikely, data.word_count)
        print("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight  = W.copy()
    L_weight  = L.copy()
    LT_weight = LT.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill(NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis=0) / (D + pseudoObsMeans) \
            if USE_NIW_PRIOR \
            else means.mean(axis=0)
        debugFn(itr, topicMean, "topicMean", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis, :]
            topicCov = diff.T.dot(diff) \
                + pseudoObsVar * np.outer(topicMean, topicMean)
            topicCov += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            topicCov /= (D + pseudoObsVar - K)
        else:
            topicCov = np.cov(means.T) if topicCov.dtype == np.float64 else np.cov(means.T).astype(dtype)
            topicCov += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(topicCov)
            topicCov = np.diag(diag)
            itopicCov = np.diag(1. / diag)
        else:
            itopicCov = la.inv(topicCov)

        debugFn(itr, topicCov, "topicCov", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)
        # print("    topicCov.det = " + str(la.det(topicCov)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeansCol = np.exp(means - means.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * means \
            - (1. / (2 * D + 2)) * means.sum(axis=0) \
            - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(means - means.max(axis=1)[:, np.newaxis])
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)

        # Update the vocabularies
        vocab *= (W_weight.T.dot(expMeansRow)).T  # Awkward order to maintain sparsity (W_weight is sparse, expMeansRow is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (expMeansCol / lse_at_k[np.newaxis, :]).T  # FIXME dupes the line in the definition of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn(itr, vocab, "vocab", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Now do likewise for the links, do it twice to model in-counts (first)
        # and out-counts (second). The difference is the transpose
        LT_weight = sparseScalarQuotientOfDot(LT, expMeansRow, docVocab, out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts = l_intop_sums.sum(axis=0)

        L_weight = sparseScalarQuotientOfDot(L, expMeansRow, docVocab, out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow

        # Reset the means and use them to calculate the weighted sum of means
        meanSum = means.sum(axis=0) * in_counts

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + itopicCov)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (0.5 - 1. / K) + np.diagonal(topicCov))
        debugFn(itr, varcs, "varcs", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Update the Means
        rhs = w_top_sums.copy()
        rhs += l_intop_sums
        rhs += l_outtop_sums
        rhs += itopicCov.dot(topicMean)
        rhs += emit_counts[:, np.newaxis] * (means.dot(A) - rowwise_softmax(means))
        rhs += in_counts[np.newaxis, :] * F
        if diagonalPriorCov:
            raise ValueError("Not implemented")
        else:
            for d in range(D):
                rhs_ = rhs[d, :] + (1. / (4 * D + 4)) * (meanSum - in_counts * means[d, :])
                means[d, :] = la.inv(itopicCov + emit_counts[d] * A + np.diag(D * in_counts / (2 * D + 2))).dot(rhs_)

                # Debugger breakpoint hooks for numerical issues
                if np.any(np.isnan(means[d, :])) or np.any(np.isinf(means[d, :])):
                    pass
                if np.any(np.isnan(np.exp(means[d, :] - means[d, :].max()))) or np.any(np.isinf(np.exp(means[d, :] - means[d, :].max()))):
                    pass

        debugFn(itr, means, "means", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME)
            queryState = QueryState(means, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below
                # the threshold (disabled via the leading False)
                if False and itr > 100 and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME), \
        QueryState(means, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
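
# Illustrative sketch, not part of the model: the non-NIW branch of the
# covariance update above takes the empirical covariance of the document means
# and adds the average per-document posterior variance to the diagonal, so the
# uncertainty in each mean is not ignored. The function name is hypothetical.
def _demo_topic_cov_update():
    import numpy as np

    D, K = 100, 5
    means = np.random.randn(D, K)            # stand-in document topic means
    varcs = np.abs(np.random.randn(D, K))    # stand-in posterior variances

    topicCov = np.cov(means.T)               # K x K empirical covariance
    topicCov += np.diag(varcs.mean(axis=0))  # add mean posterior variances
    return np.linalg.inv(topicCov)           # the precision used in the E-Step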