def newQueryState(data, modelState):
    '''
    Creates a new CTM query state object. This contains all parameters and
    random variables tied to individual datapoints.

    Params:
    data       - the dataset of words, features and links, of which only words
                 are used in this model
    modelState - the model state object

    Return:
    A CtmQueryState object
    '''
    K, vocab, dtype = modelState.K, modelState.vocab, modelState.dtype

    D, T = data.words.shape
    assert T == vocab.shape[1], \
        "The number of terms in the document-term matrix (" + str(T) + \
        ") differs from that in the model-state's vocabulary parameter " + str(vocab.shape[1])
    docLens = np.squeeze(np.asarray(data.words.sum(axis=1)))

    base     = normalizerows_ip(rd.random((D, K * 2)).astype(dtype))
    means    = base[:, :K]
    expMeans = base[:, K:]
    varcs    = np.ones((D, K), dtype=dtype)

    return QueryState(means, expMeans, varcs, docLens)
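
# The initialisation above relies on normalizerows_ip() to turn a random non-negative
# matrix into a row-stochastic one in place. The helper lives elsewhere in this package;
# the following is only a minimal sketch of the behaviour assumed of it.
import numpy as np

def _rownorm_sketch(mat):
    # Divide every row by its own sum, in place, and return the same array.
    mat /= mat.sum(axis=1)[:, np.newaxis]
    return mat

# e.g. _rownorm_sketch(np.random.random((4, 6))).sum(axis=1) is all ones, up to rounding.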
def newVbModelState(K, Q, F, P, T, featVar=0.01, topicVar=0.01, latFeatVar=1, latTopicVar=1):
    '''
    Creates a new model state object for a topic model based on side-information.
    This state contains all parameters that, once trained, can be kept fixed for
    querying.

    The parameters are
    K           - the number of topics
    Q           - the number of latent topics, Q << K
    F           - the number of features
    P           - the number of latent features in the projected space, P << F
    T           - the number of terms in the vocabulary
    topicVar    - a scalar providing the isotropic covariance of the topic-space
    featVar     - a scalar providing the isotropic covariance of the feature-space
    latFeatVar  - a scalar providing the isotropic covariance of the latent feature-space
    latTopicVar - a scalar providing the isotropic covariance of the latent topic-space

    The returned object will contain K, Q, F, P and T and also

    A     - the mean of the KxF matrix mapping F features to K topics
    varA  - a vector containing the variance over the F features of the distribution over A
    Y     - the latent space which is mixed by U and V into the observed space
    omY   - the row variance of the distribution over Y
    sigY  - the column variance of the distribution over Y
    U     - the KxQ transformation from the K-dimensional observed topic space to the
            Q-dimensional latent topic space
    V     - the FxP transformation from the F-dimensional observed feature space to the
            latent P-dimensional feature space
    vocab - the K x V matrix of vocabulary distributions
    tau   - the row variance of A is tau^2 I_K
    sigma - the variance in the estimation of the topic memberships:
            lambda ~ N(A'x, sigma^2 I)
    '''
    Y    = rd.random((Q, P)).astype(DTYPE)
    omY  = latFeatVar * np.identity(P, DTYPE)
    sigY = latTopicVar * np.identity(Q, DTYPE)

    sigT = ssp.eye(K, dtype=DTYPE)

    U    = rd.random((K, Q)).astype(DTYPE)
    V    = rd.random((F, P)).astype(DTYPE)
    A    = U.dot(Y).dot(V.T)
    varA = featVar * np.identity(F, DTYPE)

    varRatio = (featVar * topicVar) / (latFeatVar * latTopicVar)
    if varRatio > 1:
        raise ValueError("Model will not converge as (featVar * topicVar) / (latFeatVar * latTopicVar) = " \
            + str(varRatio) + " when it needs to be no more than one.")

    # Vocab is K word distributions so normalize
    vocab = normalizerows_ip(rd.random((K, T)).astype(DTYPE)) + sys.float_info.epsilon

    return VbSideTopicModelState(K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, topicVar, featVar, latTopicVar, latFeatVar)
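
# A small shape check for the bilinear factorisation constructed above: the KxF mapping
# A is composed as A = U Y V', with U (KxQ), Y (QxP) and V (FxP), so the rank of A is
# capped by the latent dimensions. A toy-sized sketch, not part of the model code.
import numpy as np
import numpy.random as rd

K, Q, F, P = 10, 3, 50, 4
U = rd.random((K, Q))
Y = rd.random((Q, P))
V = rd.random((F, P))
A = U.dot(Y).dot(V.T)
assert A.shape == (K, F)
assert np.linalg.matrix_rank(A) <= min(Q, P)   # the factorisation caps the rank of A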
def newVbModelState(K, F, T, P):
    '''
    Creates a new model state object for a topic model based on side-information.
    This state contains all parameters that, once trained, can be kept fixed for
    querying.

    The parameters are
    K - the number of topics
    F - the number of features
    P - the number of features in the projected space, P << F
    T - the number of terms in the vocabulary

    The returned object will contain K, F, T and P and also

    A     - the mean of the F x K matrix mapping F features to K topics
    varA  - the column variance of the distribution over A
    tau   - the row variance of A is tau^2 I_K
    V     - the mean of the P x K matrix mapping P projected features to K topics
    varV  - the column variance of the distribution over V (the row variance is
            again tau^2 I_K)
    U     - the F x P projection matrix, such that A = UV
    sigma - the variance in the estimation of the topic memberships:
            lambda ~ N(A'x, sigma^2 I)
    vocab - the K x V matrix of vocabulary distributions
    '''
    V    = rd.random((P, K))
    varV = np.identity(P, np.float64)
    U    = rd.random((F, P))
    A    = U.dot(V)
    varA = np.identity(F, np.float64)
    tau   = 0.1
    sigma = 0.1

    # Vocab is K word distributions so normalize
    vocab = normalizerows_ip(rd.random((K, T)))

    return VbSideTopicModelState(K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab)
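
# Sketch of the generative assumption documented above: given a feature vector x, topic
# memberships are drawn as lambda ~ N(A'x, sigma^2 I). Illustrative only; the variable
# names are local to this sketch.
import numpy as np
import numpy.random as rd

F, K, P = 20, 5, 4
U = rd.random((F, P))
V = rd.random((P, K))
A = U.dot(V)                       # F x K, as constructed in newVbModelState()
sigma = 0.1

x = rd.random((F,))                # one document's side-information vector
lmda = A.T.dot(x) + sigma * rd.standard_normal(K)   # a draw of its topic memberships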
def train (data, modelState, queryState, trainPlan): ''' Infers the topic distributions in general, and specifically for each individual datapoint. Params: W - the DxT document-term matrix X - The DxF document-feature matrix, which is IGNORED in this case modelState - the actual CTM model queryState - the query results - essentially all the "local" variables matched to the given observations trainPlan - how to execute the training process (e.g. iterations, log-interval etc.) Return: A new model object with the updated model (note parameters are updated in place, so make a defensive copy if you want itr) A new query object with the update query parameters ''' W = data.words D,_ = W.shape # Unpack the the structs, for ease of access and efficiency iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype # Book-keeping for logs boundIters, boundValues, likelyValues = [], [], [] debugFn = _debug_with_bound if debug else _debug_with_nothing # Initialize some working variables isigT = la.inv(sigT) R = W.copy() pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN pseudoObsVar = K + NIW_PSEUDO_OBS_VAR priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype) priorSigT_diag.fill (NIW_PSI) # Iterate over parameters for itr in range(iterations): # We start with the M-Step, so the parameters are consistent with our # initialisation of the RVs when we do the E-Step # Update the mean and covariance of the prior topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \ if USE_NIW_PRIOR \ else means.mean(axis=0) debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) if USE_NIW_PRIOR: diff = means - topicMean[np.newaxis,:] sigT = diff.T.dot(diff) \ + pseudoObsVar * np.outer(topicMean, topicMean) sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag) sigT /= (D + pseudoObsVar - K) else: sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype) sigT += np.diag(varcs.mean(axis=0)) if diagonalPriorCov: diag = np.diag(sigT) sigT = np.diag(diag) isigT = np.diag(1./ diag) else: isigT = la.inv(sigT) # FIXME Undo debug sigT = np.eye(K) isigT = la.inv(sigT) debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) # print(" sigT.det = " + str(la.det(sigT))) # Building Blocks - temporarily replaces means with exp(means) expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans) R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) # Update the vocabulary vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense) vocab += vocabPrior vocab = normalizerows_ip(vocab) # Reset the means to their original form, and log effect of vocab update R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) V = expMeans * R.dot(vocab.T) debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) # And now this is the E-Step, though itr's followed by updates for the # parameters also that handle the log-sum-exp approximation. 
# Update the Variances: var_d = (2 N_d * A + isigT)^{-1} varcs = np.reciprocal(docLens[:,np.newaxis] * (K-1.)/K + np.diagonal(sigT)) debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) # Update the Means rhs = V.copy() rhs += docLens[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean) rhs -= docLens[:,np.newaxis] * rowwise_softmax(means, out=means) if diagonalPriorCov: means = varcs * rhs else: for d in range(D): means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :]) # means -= (means[:,0])[:,np.newaxis] debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens) if logFrequency > 0 and itr % logFrequency == 0: modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME) queryState = QueryState(means, expMeans, varcs, docLens) boundValues.append(var_bound(data, modelState, queryState)) likelyValues.append(log_likelihood(data, modelState, queryState)) boundIters.append(itr) print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum()))) if len(boundValues) > 1: if boundValues[-2] > boundValues[-1]: if debug: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1])) # Check to see if the improvement in the bound has fallen below the threshold if itr > 100 and len(likelyValues) > 3 \ and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0: break return \ ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \ QueryState(means, expMeans, varcs, docLens), \ (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
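
# train() above repeatedly forms R = sparseScalarQuotientOfDot(W, expMeans, vocab),
# i.e. the element-wise quotient W_dt / (expMeans . vocab)_dt evaluated only at the
# nonzeros of W. The real helper is compiled elsewhere in this package; the following
# is a rough pure-scipy sketch of the semantics we assume of it, looping row by row so
# the dense D x T product is never fully materialised.
import numpy as np
import scipy.sparse as ssp

def _sparse_quotient_of_dot_sketch(W, A, B):
    # Return a CSR matrix whose nonzeros are W_ij / (A @ B)_ij; zeros of W stay zero.
    W   = W.tocsr()
    out = W.astype(np.float64).copy()
    for d in range(W.shape[0]):
        lo, hi = W.indptr[d], W.indptr[d + 1]
        cols   = W.indices[lo:hi]
        out.data[lo:hi] = W.data[lo:hi] / A[d, :].dot(B[:, cols])
    return out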
def train(modelState, X, W, plan): ''' Creates a new query state object for a topic model based on side-information. This contains all those estimated parameters that are specific to the actual date being queried - this must be used in conjunction with a model state. The parameters are modelState - the model state with all the model parameters X - the D x F matrix of side information vectors W - the D x V matrix of word **count** vectors. This returns a tuple of new model-state and query-state. The latter object will contain X and W and also s - A D-dimensional vector describing the offset in our bound on the true value of ln sum_k e^theta_dk lxi - A DxK matrix used in the above bound, containing the negative Jakkola function applied to the quadratic term xi lambda - the topics we've inferred for the current batch of documents nu - the variance of topics we've inferred (independent) ''' # Unpack the model state tuple for ease of use and maybe speed improvements K, Q, F, P, T, A, _, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T, modelState.A, modelState.varA, modelState.Y, modelState.omY, modelState.sigY, modelState.sigT, modelState.U, modelState.V, modelState.vocab, modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar iterations, epsilon, logCount, plot, plotFile, plotIncremental, fastButInaccurate = plan.iterations, plan.epsilon, plan.logFrequency, plan.plot, plan.plotFile, plan.plotIncremental, plan.fastButInaccurate queryPlan = newInferencePlan(1, epsilon, logFrequency = 0, plot=False) if W.dtype.kind == 'i': # for the sparseScalorQuotientOfDot() method to work W = W.astype(DTYPE) # Get ready to plot the evolution of the likelihood, with multiplicative updates (e.g. 1, 2, 4, 8, 16, 32, ...) if logCount > 0: multiStepSize = np.power (iterations, 1. / logCount) logIter = 1 elbos = [] likes = [] iters = [] else: logIter = iterations + 1 lastVarBoundValue = -sys.float_info.max # We'll need the total word count per doc, and total count of docs docLen = np.squeeze(np.asarray (W.sum(axis=1))) # Force to a one-dimensional array for np.newaxis trick to work D = len(docLen) print ("Training %d topic model with %d x %d word-matrix W, %d x %d feature matrix X, and latent feature and topics spaces of size %d and %d respectively" % (K, D, T, D, F, P, Q)) # No need to recompute this every time XTX = X.T.dot(X) # Identity matrices that occur I_P = ssp.eye(P, dtype=DTYPE) I_Q = ssp.eye(Q, dtype=DTYPE) I_QP = ssp.eye(Q*P,Q*P, dtype=DTYPE) # Assign initial values to the query parameters expLmda = np.exp(rd.random((D, K)).astype(DTYPE)) nu = np.ones((D, K), DTYPE) s = np.zeros((D,), DTYPE) lxi = negJakkola (np.ones((D,K), DTYPE)) # If we don't bother optimising either tau or sigma we can just do all this here once only overSsq = 1. / sigmaSq overAsq = 1. / alphaSq overKsq = 1. / kappaSq overTsq = 1. / tauSq varRatio = (alphaSq * sigmaSq) / (tauSq * kappaSq) # TODO the inverse being almost always dense means that it might # be faster to convert to dense and use the normal solver, despite # the size constraints. 
# varA = 1./K * sla.inv (overTsq * I_F + overSsq * XTX) print ("Inverting gram matrix") aI_XTX = (overAsq * ssp.eye(F, dtype=DTYPE) + XTX).todense() omA = la.inv (aI_XTX) scaledWordCounts = W.copy() # Set up a method to check at every update if we're going in the right # direction verify_and_log = _quickPrintElbo if DEBUG else _doNothing print ("Launching inference") for iteration in range(iterations): # ============================================================= # E-Step # Model dists are q(Theta|A;Lambda;nu) q(A|Y) q(Y) and q(Z).... # Where lambda is the posterior mean of theta. # ============================================================= # Y, sigY, omY # # If U'U is invertible, use inverse to convert Y to a Sylvester eqn # which has a much, much faster solver. Recall update for Y is of the form # Y + AYB = C where A = U'U, B = V'V and C=U'AV # VTV = V.T.dot(V) UTU = U.T.dot(U) sigY = la.inv(overTsq * overKsq * I_Q + overAsq * overSsq * UTU) verify_and_log ("E-Step: q(Y) [sigY]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen) omY = la.inv(overTsq * overKsq * I_P + overAsq * overSsq * VTV) verify_and_log ("E-Step: q(Y) [omY]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen) try: invUTU = la.inv(UTU) Y = la.solve_sylvester (varRatio * invUTU, VTV, invUTU.dot(U.T).dot(A).dot(V)) except np.linalg.linalg.LinAlgError as e: # U'U seems to rapidly become singular (before 5 iters) if fastButInaccurate: invUTU = la.pinvh(UTU) # Obviously unstable, inference stalls much earlier than the correct form Y = la.solve_sylvester (varRatio * invUTU, VTV, invUTU.dot(U.T).dot(A).dot(V)) else: Y = np.reshape (la.solve(varRatio * I_QP + np.kron(VTV, UTU), vec(U.T.dot(A).dot(V))), (Q,P), 'F') verify_and_log ("E-Step: q(Y) [Mean]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen) # A # # So it's normally A = (UYV' + L'X) omA with omA = inv(t*I_F + s*XTX) # so A inv(omA) = UYV' + L'X # so inv(omA)' A' = VY'U' + X'L # at which point we can use a built-in solve # lmda = np.log(expLmda, out=expLmda) A = omA.dot(X.T.dot(lmda) + overAsq * V.dot(Y.T).dot(U.T)).T # A = la.solve(aI_XTX, X.T.dot(lmda) + overAsq * V.dot(Y.T).dot(U.T)).T np.exp(expLmda, out=expLmda) verify_and_log ("E-Step: q(A)", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen) # lmda_dk, nu_dk, s_d, and xi_dk # XAT = X.dot(A.T) # query (VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \ # X, W, \ # queryPlan, \ # VbSideTopicQueryState(expLmda, nu, lxi, s, docLen), \ # scaledWordCounts=scaledWordCounts, \ # XAT = XAT) # ============================================================= # M-Step # Parameters for the softmax bound: lxi and s # The projection used for A: U and V # The vocabulary : vocab # The variances: tau, sigma # ============================================================= # vocab # sparseScalarQuotientOfDot(W, expLmda, vocab, out=scaledWordCounts) factor = (scaledWordCounts.T.dot(expLmda)).T # Gets materialized as a dense matrix... 
vocab *= factor normalizerows_ip(vocab) # A hack to work around the fact that we've got no prior, and thus no # pseudo counts, so some values will collapse to zero # vocab[vocab < sys.float_info.min] = sys.float_info.min verify_and_log ("M-Step: vocab", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen) # U # U = A.dot(V).dot(Y.T).dot (la.inv(Y.dot(V.T).dot(V).dot(Y.T) + np.trace(omY.dot(V.T).dot(V)) * sigY)) verify_and_log ("M-Step: U", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen) # V # V = A.T.dot(U).dot(Y).dot (la.inv(Y.T.dot(U.T).dot(U).dot(Y) + np.trace(sigY.dot(U.T).dot(U)) * omY)) verify_and_log ("M-Step: V", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq, None, expLmda, nu, lxi, s, docLen) # ============================================================= # Handle logging of variational bound, likelihood, etc. # ============================================================= if iteration == logIter: modelState = VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq) queryState = VbSideTopicQueryState(expLmda, nu, lxi, s, docLen) elbo = varBound (modelState, queryState, X, W, None, XAT, XTX) likely = log_likelihood(modelState, X, W, queryState) #recons_error(modelState, X, W, queryState) elbos.append (elbo) iters.append (iteration) likes.append (likely) print ("\nIteration %5d ELBO %15f Log-Likelihood %15f" % (iteration, elbo, likely)) logIter = min (np.ceil(logIter * multiStepSize), iterations - 1) if elbo < lastVarBoundValue: sys.stderr.write('ELBO going in the wrong direction\n') elif abs(elbo - lastVarBoundValue) < epsilon: break lastVarBoundValue = elbo if plot and plotIncremental: plot_bound(plotFile + "-iter-" + str(iteration), np.array(iters), np.array(elbos), np.array(likes)) else: print('.', end='') sys.stdout.flush() # Right before we end, plot the evoluation of the bound and likelihood # if we've been asked to do so. if plot: plot_bound(plotFile, iters, elbos, likes) return VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \ VbSideTopicQueryState (expLmda, nu, lxi, s, docLen)
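
# The q(Y) update above has the form varRatio*Y + (U'U) Y (V'V) = U'AV. When U'U is
# invertible it is recast as a Sylvester equation; the general fallback vectorises it
# with the identity vec(A Y B) = (B' kron A) vec(Y), which for the symmetric Gram
# matrices matches np.kron(VTV, UTU) in the code. A toy check of that fallback solve,
# using column-major ('F') vectorisation as above:
import numpy as np
import numpy.random as rd
import scipy.linalg as la

Q, P, c = 3, 4, 0.5
A = rd.random((Q, Q)); A = A.dot(A.T)          # symmetric, plays the role of U'U
B = rd.random((P, P)); B = B.dot(B.T)          # symmetric, plays the role of V'V
C = rd.random((Q, P))

y = la.solve(c * np.eye(Q * P) + np.kron(B, A), C.flatten(order='F'))
Y = np.reshape(y, (Q, P), order='F')

assert np.allclose(c * Y + A.dot(Y).dot(B), C)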
def train(data, modelState, queryState, trainPlan): ''' Infers the topic distributions in general, and specifically for each individual datapoint. Params: W - the DxT document-term matrix X - The DxF document-feature matrix, which is IGNORED in this case modelState - the actual CTM model queryState - the query results - essentially all the "local" variables matched to the given observations trainPlan - how to execute the training process (e.g. iterations, log-interval etc.) Return: A new model object with the updated model (note parameters are updated in place, so make a defensive copy if you want itr) A new query object with the update query parameters ''' W, X = data.words, data.feats D, T = W.shape F = X.shape[1] # tmpNumDense = np.array([ # 4 , 8 , 2 , 0 , 0, # 0 , 6 , 0 , 17, 0, # 12 , 13 , 1 , 7 , 8, # 0 , 5 , 0 , 0 , 0, # 0 , 6 , 0 , 0 , 44, # 0 , 7 , 2 , 0 , 0], dtype=np.float64).reshape((6,5)) # tmpNum = ssp.csr_matrix(tmpNumDense) # # tmpDenomleft = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10 # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64) # # tmpResult = tmpNum.copy() # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomleft, tmpDenomRight) # # print (str(tmpNum.todense())) # print (str(tmpDenomleft.dot(tmpDenomRight))) # print (str(tmpResult.todense())) # Unpack the the structs, for ease of access and efficiency iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug means, docLens = queryState.means, queryState.docLens K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \ modelState.K, modelState.A, modelState.U, modelState.Y, modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype tp, fp, ltp, lfp = 1. / tv, 1. / fv, 1. / ltv, 1. 
/ lfv # turn variances into precisions # FIXME Use passed in hypers print("tp = %f tv=%f" % (tp, tv)) vocabPrior = np.ones(shape=(T, ), dtype=modelState.dtype) # FIXME undo truncation F = 363 A = A[:F, :] X = X[:, :F] U = U[:F, :] data = DataSet(words=W, feats=X) # Book-keeping for logs boundIters, boundValues, likelyValues = [], [], [] debugFn = _debug_with_bound if debug else _debug_with_nothing # Initialize some working variables if covA is None: precA = (fp * ssp.eye(F) + X.T.dot(X)).todense() # As the inverse is almost always dense covA = la.inv(precA, overwrite_a=True) # it's faster to densify in advance uniqLens = np.unique(docLens) debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K) expMeans = means.copy() expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans) R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy()) lhs = H.copy() rhs = expMeans.copy() Y_rhs = Y.copy() # Iterate over parameters for itr in range(iterations): # Update U, V given A V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y), A.T.dot(U).dot(Y).T).T V /= V[0, 0] U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T), A.dot(V).dot(Y.T).T).T # Update Y given U, V, A Y_rhs[:, :] = U.T.dot(A).dot(V) Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True) Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True) s = np.outer(Sv, Su).flatten() s += ltv * lfv np.reciprocal(s, out=s) M = Uu.T.dot(Y_rhs).dot(Uv) M *= unvec(s, row_count=M.shape[0]) Y = Uu.dot(M).dot(Uv.T) debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means)) debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) # And now this is the E-Step, though itr's followed by updates for the # parameters also that handle the log-sum-exp approximation. # TODO One big sort by size, plus batch it. 
# Update the Means rhs[:, :] = expMeans rhs *= R.dot(vocab.T) rhs += X.dot(A) * tp rhs += docLens[:, np.newaxis] * means.dot(H) rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means) for l in uniqLens: inds = np.where(docLens == l)[0] lhs[:, :] = l * H lhs[np.diag_indices_from(lhs)] += tp lhs[:, :] = la.inv(lhs) means[inds, :] = rhs[inds, :].dot( lhs ) # left and right got switched going from vectors to matrices :-/ debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) # Standard deviation # DK = means.shape[0] * means.shape[1] # newTp = np.sum(means) # newTp = (-newTp * newTp) # rhs[:,:] = means # rhs *= means # newTp = DK * np.sum(rhs) - newTp # newTp /= DK * (DK - 1) # newTp = min(max(newTp, 1E-36), 1E+36) # tp = 1 / newTp # if itr % logFrequency == 0: # print ("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,))))) # Update the vocabulary expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans) R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) vocab *= ( R.T.dot(expMeans) ).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense) vocab += vocabPrior vocab = normalizerows_ip(vocab) debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior) # print ("Iter %3d Vocab.min = %f" % (itr, vocab.min())) # Update the vocab prior # vocabPrior = estimate_dirichlet_param (vocab, vocabPrior) # print ("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean())) if logFrequency > 0 and itr % logFrequency == 0: modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name) queryState = QueryState(means, docLens) boundValues.append(var_bound(data, modelState, queryState)) likelyValues.append(log_likelihood(data, modelState, queryState)) boundIters.append(itr) print( time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum()))) if len(boundValues) > 1: if boundValues[-2] > boundValues[-1]: if debug: printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1])) # Check to see if the improvement in the bound has fallen below the threshold if itr > 100 and len(likelyValues) > 3 \ and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0: break return \ ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name), \ QueryState(means, expMeans, docLens), \ (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
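
# The Y update in the loop above solves an equation of the form (U'U) Y (V'V) + c*Y = RHS
# without ever building a Kronecker system: eigendecompose both Gram matrices, rotate the
# right-hand side into the joint eigenbasis, rescale element-wise by the eigenvalue
# products plus the constant, and rotate back. A compact self-contained check of that
# trick (the exact unvec/ordering conventions of the code may differ):
import numpy as np
import numpy.random as rd
import scipy.linalg as la

m, n, c = 4, 3, 2.0
A = rd.random((m, m)); A = A.dot(A.T)              # symmetric, plays the role of U'U
B = rd.random((n, n)); B = B.dot(B.T)              # symmetric, plays the role of V'V
C = rd.random((m, n))

a, Ua = la.eigh(A)
b, Ub = la.eigh(B)
M = Ua.T.dot(C).dot(Ub) / (np.outer(a, b) + c)     # element-wise scale in the eigenbasis
X = Ua.dot(M).dot(Ub.T)

assert np.allclose(A.dot(X).dot(B) + c * X, C)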
def train (data, modelState, queryState, trainPlan): ''' Infers the topic distributions in general, and specifically for each individual datapoint. Params: data - the dataset of words, features and links of which only words and features are used in this model modelState - the actual CTM model queryState - the query results - essentially all the "local" variables matched to the given observations trainPlan - how to execute the training process (e.g. iterations, log-interval etc.) Return: A new model object with the updated model (note parameters are updated in place, so make a defensive copy if you want it) A new query object with the update query parameters ''' W, X = data.words, data.feats assert W.dtype == modelState.dtype assert X.dtype == modelState.dtype D,_ = W.shape # Unpack the the structs, for ease of access and efficiency iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug means, expMeans, varcs, lxi, s, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype # Book-keeping for logs boundIters = np.zeros(shape=(iterations // logFrequency,)) boundValues = np.zeros(shape=(iterations // logFrequency,)) likeValues = np.zeros(shape=(iterations // logFrequency,)) bvIdx = 0 _debug_with_bound.old_bound = 0 debugFn = _debug_with_bound if debug else _debug_with_nothing # Initialize some working variables isigT = la.inv(sigT) R = W.copy() sigT_regularizer = 0.001 aI_P = 1./lfv * ssp.eye(P, dtype=dtype) tI_F = 1./fv * ssp.eye(F, dtype=dtype) print("Creating posterior covariance of A, this will take some time...") XTX = X.T.dot(X) R_A = XTX if ssp.issparse(R_A): R_A = R_A.todense() # dense inverse typically as fast or faster than sparse inverse R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case R_A = la.inv(R_A) print("Covariance matrix calculated, launching inference") s.fill(0) # Iterate over parameters for itr in range(iterations): # We start with the M-Step, so the parameters are consistent with our # initialisation of the RVs when we do the E-Step # Update the covariance of the prior diff_a_yv = (A-Y.dot(V)) diff_m_xa = (means-X.dot(A.T)) sigT = 1./lfv * (Y.dot(Y.T)) sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T) sigT += diff_m_xa.T.dot(diff_m_xa) sigT.flat[::K+1] += varcs.sum(axis=0) sigT /= (P+F+D) sigT.flat[::K+1] += sigT_regularizer # Diagonalize it sigT = np.diag(sigT.flat[::K+1]) # and invert it. 
isigT = np.diag(np.reciprocal(sigT.flat[::K+1])) debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) # Building Blocks - temporarily replaces means with exp(means) expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans) R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) S = expMeans * R.dot(vocab.T) # Update the vocabulary vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense) vocab += vocabPrior vocab = normalizerows_ip(vocab) # Reset the means to their original form, and log effect of vocab update debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) # Finally update the parameter V V = la.inv(R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A)) debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) # And now this is the E-Step, though it's followed by updates for the # parameters also that handle the log-sum-exp approximation. # Update the distribution on the latent space R_Y_base = aI_P + 1/fv * V.dot(V.T) R_Y = la.inv(R_Y_base) debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) Y = 1./fv * A.dot(V.T).dot(R_Y) debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) # Update the mapping from the features to topics A = (1./fv * (Y).dot(V) + (X.T.dot(means)).T).dot(R_A) debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) # Update the Means vMat = (s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S rhsMat = vMat + X.dot(A.T).dot(isigT) # TODO Verify this lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis,:] + n[:,np.newaxis] * lxi) # inverse of D diagonal matrices... 
means = lhsMat * rhsMat # as LHS is a diagonal matrix for all d, it's equivalent # do doing a hadamard product for all d debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) # Update the Variances varcs = 1./(n[:,np.newaxis] * lxi + isigT.flat[::K+1]) debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) # Update the approximation parameters lxi = 2 * ctm.negJakkolaOfDerivedXi(means, varcs, s) debugFn (itr, lxi, "lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) # s can sometimes grow unboundedly # Follow Bouchard's suggested approach of fixing it at zero # # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1) # debugFn (itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n) if logFrequency > 0 and itr % logFrequency == 0: modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME) queryState = QueryState(means, expMeans, varcs, lxi, s, n) boundValues[bvIdx] = var_bound(data, modelState, queryState, XTX) likeValues[bvIdx] = log_likelihood(data, modelState, queryState) boundIters[bvIdx] = itr perp = perplexity_from_like(likeValues[bvIdx], n.sum()) print (time.strftime('%X') + " : Iteration %d: Perplexity %4.2f bound %f" % (itr, perp, boundValues[bvIdx])) if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx])) # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max())) # Check to see if the improvment in the likelihood has fallen below the threshold if bvIdx > 1 and boundIters[bvIdx] > 50: lastPerp = perplexity_from_like(likeValues[bvIdx - 1], n.sum()) if lastPerp - perp < 1: boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likeValues, bvIdx) return modelState, queryState, (boundIters, boundValues, likeValues) bvIdx += 1 return \ ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \ QueryState(means, expMeans, varcs, lxi, s, n), \ (boundIters, boundValues, likeValues)
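
# The convergence test above compares successive corpus perplexities. A sketch of the
# definition we assume perplexity_from_like() uses: the exponential of the negative
# log-likelihood per token (this mirrors, but is not taken from, the real helper).
import numpy as np

def _perplexity_from_like_sketch(log_likelihood, token_count):
    return np.exp(-log_likelihood / token_count)

# Training stops once the drop in perplexity between successive log-points falls below 1.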
def train(modelState, X, W, iterations=10000, epsilon=0.001, logInterval = 0): ''' Creates a new query state object for a topic model based on side-information. This contains all those estimated parameters that are specific to the actual date being queried - this must be used in conjunction with a model state. The parameters are modelState - the model state with all the model parameters X - the D x F matrix of side information vectors W - the D x V matrix of word **count** vectors. iterations - how long to iterate for epsilon - currently ignored, in future, allows us to stop early. This returns a tuple of new model-state and query-state. The latter object will contain X and W and also s - A D-dimensional vector describing the offset in our bound on the true value of ln sum_k e^theta_dk lxi - A DxK matrix used in the above bound, containing the negative Jakkola function applied to the quadratic term xi lambda - the topics we've inferred for the current batch of documents nu - the variance of topics we've inferred (independent) ''' # Unpack the model state tuple for ease of use and maybe speed improvements (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab) = (modelState.K, modelState.F, modelState.T, modelState.P, modelState.A, modelState.varA, modelState.V, modelState.varV, modelState.U, modelState.sigma, modelState.tau, modelState.vocab) # Get ready to plot the evolution of the likelihood if logInterval > 0: elbos = np.zeros((iterations / logInterval,)) iters = np.zeros((iterations / logInterval,)) # We'll need the total word count per doc, and total count of docs docLen = W.sum(axis=1) D = len(docLen) # No need to recompute this every time XTX = X.T.dot(X) # Assign initial values to the query parameters lmda = rd.random((D, K)) nu = np.ones((D,K), np.float64) s = np.zeros((D,)) lxi = negJakkola (np.ones((D, K), np.float64)) XA = X.dot(A) for iteration in range(iterations): # Save repeated computation tsq = tau * tau; tsqIP = tsq * np.eye(P) trTsqIK = K * tsq # trace of the matrix tau * tau * np.eye(K) halfSig2 = 1./(sigma*sigma) tau2sig2 = (tau * tau) / (sigma * sigma) # ============================================================= # E-Step # Model dists are q(Theta|A;Lambda;nu) q(A|V) q(V) # Where lambda is the posterior mean of theta. # ============================================================= # # V, varV varV = la.inv (tsqIP + U.T.dot(U)) V = varV.dot(U.T).dot(A) _quickPrintElbo ("E-Step: q(V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen) # # A, varA # TODO, since only tau2sig2 changes at each step, would it be possible just to # amend the old inverse? # TODO Use sparse inverse varA = la.inv (tau2sig2 * XTX + np.eye(F)) A = varA.dot (U.dot(V) + X.T.dot(lmda)) XA = X.dot(A) _quickPrintElbo ("E-Step: q(A|V)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen) # # lmda_dk lnVocab = safe_log (vocab) Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxT rho = 2 * s[:,np.newaxis] * lxi - 0.5 \ + np.einsum('dt,dkt->dk', W, Z) / docLen[:,np.newaxis] rhs = docLen[:,np.newaxis] * rho + halfSig2 * X.dot(A) lmda = rhs / (docLen[:,np.newaxis] * 2 * lxi + halfSig2) _quickPrintElbo ("E-Step: q(Theta|A;lamda)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen) # # nu_dk # TODO Double check this again... nu = 1./ np.sqrt(2. 
* docLen[:, np.newaxis] * lxi + halfSig2) _quickPrintElbo ("E-Step: q(Theta|A;nu)", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen) # ============================================================= # M-Step # Parameters for the softmax bound: lxi and s # The projection used for A: U # The vocabulary : vocab # The variances: tau, sigma # ============================================================= # # s_d # s = (K/4. + (lxi * lmda).sum(axis = 1)) / lxi.sum(axis=1) # _quickPrintElbo ("M-Step: max s", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen) # # xi_dk lxi = negJakkolaOfDerivedXi(lmda, nu, s) _quickPrintElbo ("M-Step: max xi", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen) # # vocab # # TODO, since vocab is in the RHS, is there any way to optimize this? Z = rowwise_softmax (lmda[:,:,np.newaxis] + lnVocab[np.newaxis,:,:]) # Z is DxKxV vocab = normalizerows_ip (np.einsum('dt,dkt->kt', W, Z)) _quickPrintElbo ("M-Step: max vocab", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen) # # U U = A.dot(V.T).dot (la.inv(trTsqIK * varV + V.dot(V.T))) _quickPrintElbo ("M-Step: max U", iteration, X, W, K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab, lmda, nu, lxi, s, docLen) # # sigma # Equivalent to \frac{1}{DK} \left( \sum_d (\sum_k nu_{dk}) + tr(\Omega_A) x_d^{T} \Sigma_A x_d + (\lambda - A^{T} x_d)^{T}(\lambda - A^{T} x_d) \right) # # sigma = 1./(D*K) * (np.sum(nu) + D*K * tsq * np.sum(XTX * varA) + np.sum((lmda - XA)**2)) # # tau # Equivalent to \frac{1}{KF} \left( tr(\Sigma_A)tr(\Omega_A) + tr(\Sigma_V U U^{T})tr(\Omega_V) + tr ((M_A - U M_V)^{T} (M_A - U M_V)) \right) # varA_U = varA.dot(U) # tau_term1 = np.trace(varA)*K*tsq # tau_term2 = sum(varA_U[p,:].dot(U[p,:]) for p in xrange(P)) * K * tsq # tau_term3 = np.sum((A - U.dot(V)) ** 2) # # tau = 1./(K*F) * (tau_term1 + tau_term2 + tau_term3) if (logInterval > 0) and (iteration % logInterval == 0): elbo = varBound ( \ VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \ VbSideTopicQueryState(lmda, nu, lxi, s, docLen), X, W, Z, lnVocab, varA_U, XA, XTX) elbos[iteration / logInterval] = elbo iters[iteration / logInterval] = iteration print ("Iteration %5d ELBO %f" % (iteration, elbo)) if logInterval > 0: plot_bound(iters, elbos) return (VbSideTopicModelState (K, F, T, P, A, varA, V, varV, U, sigma, tau, vocab), \ VbSideTopicQueryState (lmda, nu, lxi, s, docLen))
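
# The lxi, s and nu updates above come from Bouchard's quadratic upper bound on
# ln sum_k exp(theta_k), built from the Jaakkola bound on ln(1 + e^t) with coefficient
# lambda(xi) = tanh(xi/2) / (4*xi). This is our reading of what negJakkola() and
# negJakkolaOfDerivedXi() relate to; the package's exact sign convention may differ.
# A short numerical check that the bound really is an upper bound:
import numpy as np
import numpy.random as rd

def _lambda_xi(xi):
    # The Jaakkola coefficient lambda(xi) = tanh(xi/2) / (4*xi).
    return np.tanh(xi / 2.0) / (4.0 * xi)

theta = rd.standard_normal(5)
s     = 0.0                                  # Bouchard suggests fixing the offset s at zero
xi    = np.abs(rd.standard_normal(5)) + 0.1  # any positive variational parameters will do

bound = s + np.sum(_lambda_xi(xi) * ((theta - s) ** 2 - xi ** 2)
                   + (theta - s - xi) / 2.0
                   + np.log1p(np.exp(xi)))
assert bound >= np.log(np.sum(np.exp(theta)))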
def _testOnModelHandcraftedData(self): # # Create the vocab # T = 3 * 3 K = 5 # Horizontal bars vocab1 = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3, 3)).todense() #vocab2 = ssp.coo_matrix(([1, 1, 1], ([1, 1, 1], [0, 1, 2])), shape=(3,3)).todense() vocab3 = ssp.coo_matrix(([1, 1, 1], ([2, 2, 2], [0, 1, 2])), shape=(3, 3)).todense() # Vertical bars vocab4 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 0, 0])), shape=(3, 3)).todense() #vocab5 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 1, 1])), shape=(3,3)).todense() vocab6 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [2, 2, 2])), shape=(3, 3)).todense() # Diagonals vocab7 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 1, 2])), shape=(3, 3)).todense() #vocab8 = ssp.coo_matrix(([1, 1, 1], ([2, 1, 0], [0, 1, 2])), shape=(3,3)).todense() # Put together T = vocab1.shape[0] * vocab1.shape[1] vocabs = [vocab1, vocab3, vocab4, vocab6, vocab7] # Create a single matrix with the flattened vocabularies vocabVectors = [] for vocab in vocabs: vocabVectors.append(np.squeeze(np.asarray(vocab.reshape((1, T))))) vocab = normalizerows_ip(np.array(vocabVectors, dtype=DTYPE)) # Plot the vocab ones = np.ones(vocabs[0].shape) for k in range(K): plt.subplot(2, 3, k) plt.imshow(ones - vocabs[k], interpolation="none", cmap=cm.Greys_r) plt.show() # # Create the corpus # rd.seed(0xC0FFEE) D = 1000 # Make sense (of a sort) of this by assuming that these correspond to # Kittens Omelettes Puppies Oranges Tomatoes Dutch People Basketball Football #topicMean = np.array([10, 25, 5, 15, 5, 5, 10, 25]) # topicCovar = np.array(\ # [[ 100, 5, 55, 20, 5, 15, 4, 0], \ # [ 5, 100, 5, 10, 70, 5, 0, 0], \ # [ 55, 5, 100, 5, 5, 10, 0, 5], \ # [ 20, 10, 5, 100, 30, 30, 20, 10], \ # [ 5, 70, 5, 30, 100, 0, 0, 0], \ # [ 15, 5, 10, 30, 0, 100, 10, 40], \ # [ 4, 0, 0, 20, 0, 10, 100, 20], \ # [ 0, 0, 5, 10, 0, 40, 20, 100]], dtype=DTYPE) / 100.0 topicMean = np.array([25, 15, 40, 5, 15]) self.assertEqual(100, topicMean.sum()) topicCovar = np.array(\ [[ 100, 5, 55, 20, 5 ], \ [ 5, 100, 5, 10, 70 ], \ [ 55, 5, 100, 5, 5 ], \ [ 20, 10, 5, 100, 30 ], \ [ 5, 70, 5, 30, 100 ], \ ], dtype=DTYPE) / 100.0 meanWordCount = 80 wordCounts = rd.poisson(meanWordCount, size=D) topicDists = rd.multivariate_normal(topicMean, topicCovar, size=D) W = topicDists.dot(vocab) * wordCounts[:, np.newaxis] W = ssp.csr_matrix(W.astype(DTYPE)) # # Train the model # model = ctm.newModelAtRandom(W, K, dtype=DTYPE) queryState = ctm.newQueryState(W, model) trainPlan = ctm.newTrainPlan(iterations=65, logFrequency=1) self.assertTrue(0.99 < np.sum(model.topicMean) < 1.01) return self._doTest(W, model, queryState, trainPlan)
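
# A minimal sketch of how a single "bar" topic in the test above is built and flattened:
# a 3x3 image with one row lit becomes, after normalisation, a distribution over the
# T = 9 terms. Illustrative only.
import numpy as np
import scipy.sparse as ssp

bar = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3, 3)).todense()
row = np.squeeze(np.asarray(bar.reshape((1, 9))))
row = row / row.sum()
assert row.shape == (9,) and abs(row.sum() - 1.0) < 1e-9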
def train(data, modelState, queryState, trainPlan): ''' Infers the topic distributions in general, and specifically for each individual datapoint. Params: data - the dataset of words, features and links of which only words and features are used in this model modelState - the actual CTM model queryState - the query results - essentially all the "local" variables matched to the given observations trainPlan - how to execute the training process (e.g. iterations, log-interval etc.) Return: A new model object with the updated model (note parameters are updated in place, so make a defensive copy if you want itr) A new query object with the update query parameters ''' W, X = data.words, data.feats D, _ = W.shape # Unpack the the structs, for ease of access and efficiency iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype # Book-keeping for logs boundIters, boundValues, boundLikes = [], [], [] debugFn = _debug_with_bound if debug else _debug_with_nothing _debug_with_bound.old_bound = 0 # For efficient inference, we need a separate covariance for every unique # document length. For products to execute quickly, the doc-term matrix # therefore needs to be ordered in ascending terms of document length originalDocLens = docLens sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG ) # sort needs to be stable in order to be reversible W = W[sortIdx, :] # deep sorted copy X = X[sortIdx, :] means, varcs = means[sortIdx, :], varcs[sortIdx, :] docLens = originalDocLens[sortIdx] lens, inds = np.unique(docLens, return_index=True) inds = np.append(inds, [W.shape[0]]) # Initialize some working variables R = W.copy() aI_P = 1. / lfv * ssp.eye(P, dtype=dtype) print("Creating posterior covariance of A, this will take some time...") XTX = X.T.dot(X) R_A = XTX leastSquares = lambda feats, targets: la.lstsq( feats, targets, lapack_driver="gelsy")[0].T if ssp.issparse( R_A): # dense inverse typically as fast or faster than sparse R_A = to_dense_array( R_A) # inverse and the result is usually dense in any case leastSquares = lambda feats, targets: np.array( [ssp.linalg.lsqr(feats, targets[:, k])[0] for k in range(K)]) R_A.flat[::F + 1] += 1. / fv R_A = la.inv(R_A) print("Covariance matrix calculated, launching inference") priorSigt_diag = np.ndarray(shape=(K, ), dtype=dtype) priorSigt_diag.fill(0.001) # Iterate over parameters for itr in range(iterations): A = leastSquares(X, means) diff_a_yv = (A - Y.dot(V)) for _ in range(10): #(50 if itr == 0 else 1): # Update the covariance of the prior diff_m_xa = (means - X.dot(A.T)) sigT = 1. / lfv * (Y.dot(Y.T)) sigT += 1. / fv * diff_a_yv.dot(diff_a_yv.T) sigT += diff_m_xa.T.dot(diff_m_xa) sigT.flat[::K + 1] += varcs.sum(axis=0) # As small numbers lead to instable inverse estimates, we use the # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these # scales whenever we use the inverse of the unscaled covariance sigScale = 1. / (P + D + F) isigScale = 1. 
/ sigScale isigT = la.inv(sigT) debugFn(itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Update the vocabulary vocab *= ( R.T.dot(expMeans) ).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense) vocab += vocabPrior vocab = normalizerows_ip(vocab) # Reset the means to their original form, and log effect of vocab update R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R) S = expMeans * R.dot(vocab.T) debugFn(itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Update the Variances varcs = 1. / ((docLens * (K - 1.) / K)[:, np.newaxis] + isigScale * isigT.flat[::K + 1]) debugFn(itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Update the Means rhs = X.dot(A.T).dot(isigT) * isigScale rhs += S rhs += docLens[:, np.newaxis] * means.dot(Ab) rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means) # Faster version? for lenIdx in range(len(lens)): nd = lens[lenIdx] start, end = inds[lenIdx], inds[lenIdx + 1] lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale means[start:end, :] = rhs[start:end, :].dot( lhs ) # huh?! Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped # print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max())) debugFn(itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans) # for _ in range(150): # # Finally update the parameter V # V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A)) # debugFn(itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, # varcs, Ab, docLens) # # # Update the distribution on the latent space # R_Y_base = aI_P + 1 / fv * V.dot(V.T) # R_Y = la.inv(R_Y_base) # debugFn(itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, # means, varcs, Ab, docLens) # # Y = 1. / fv * A.dot(V.T).dot(R_Y) # debugFn(itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, # varcs, Ab, docLens) # # # Update the mapping from the features to topics # A = (1. 
/ fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A) # debugFn(itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, # varcs, Ab, docLens) if logFrequency > 0 and itr % logFrequency == 0: modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME) queryState = QueryState(means, expMeans, varcs, docLens) boundValues.append( var_bound(DataSet(W, feats=X), modelState, queryState, XTX)) boundLikes.append( log_likelihood(DataSet(W, feats=X), modelState, queryState)) boundIters.append(itr) perp = perplexity_from_like(boundLikes[-1], docLens.sum()) print( time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[-1])) if len(boundIters) >= 2 and boundValues[-2] > boundValues[-1]: printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1])) # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max())) # Check to see if the improvement in the likelihood has fallen below the threshold if len(boundIters) > 2 and boundIters[-1] > 20: lastPerp = perplexity_from_like(boundLikes[-2], docLens.sum()) if lastPerp - perp < 1: break revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG) means = means[revert_sort, :] varcs = varcs[revert_sort, :] docLens = docLens[revert_sort] return \ ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \ QueryState(means, expMeans, varcs, docLens), \ (boundIters, boundValues, boundLikes)
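
# The mean update above computes one K x K inverse per *unique* document length. The
# documents are first sorted by length (with a stable sort, so the order can be undone)
# so that np.unique(..., return_index=True) yields contiguous [start, end) blocks that
# share the same inverse. A tiny illustration of that bookkeeping; 'mergesort' stands in
# for STABLE_SORT_ALG here.
import numpy as np

docLens  = np.array([7, 3, 7, 5, 3, 3])
sortIdx  = np.argsort(docLens, kind='mergesort')     # stable, so the sort is reversible
sortedLs = docLens[sortIdx]                          # [3, 3, 3, 5, 7, 7]

lens, inds = np.unique(sortedLs, return_index=True)
inds = np.append(inds, [len(sortedLs)])
blocks = [(lens[i], inds[i], inds[i + 1]) for i in range(len(lens))]
# blocks == [(3, 0, 3), (5, 3, 4), (7, 4, 6)]: one shared inverse per block

revert = np.argsort(sortIdx, kind='mergesort')       # undoes the sort at the end
assert (sortedLs[revert] == docLens).all()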
def train (data, modelState, queryState, trainPlan): ''' Infers the topic distributions in general, and specifically for each individual datapoint. Params: data - the dataset of words, features and links of which only words and features are used in this model modelState - the actual CTM model queryState - the query results - essentially all the "local" variables matched to the given observations trainPlan - how to execute the training process (e.g. iterations, log-interval etc.) Return: A new model object with the updated model (note parameters are updated in place, so make a defensive copy if you want itr) A new query object with the update query parameters ''' W, X = data.words, data.feats D, _ = W.shape # Unpack the the structs, for ease of access and efficiency iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype # Book-keeping for logs boundIters = np.zeros(shape=(iterations // logFrequency,)) boundValues = np.zeros(shape=(iterations // logFrequency,)) boundLikes = np.zeros(shape=(iterations // logFrequency,)) bvIdx = 0 debugFn = _debug_with_bound if debug else _debug_with_nothing _debug_with_bound.old_bound = 0 # For efficient inference, we need a separate covariance for every unique # document length. For products to execute quickly, the doc-term matrix # therefore needs to be ordered in ascending terms of document length originalDocLens = docLens sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG) # sort needs to be stable in order to be reversible W = W[sortIdx,:] # deep sorted copy X = X[sortIdx,:] means, varcs = means[sortIdx,:], varcs[sortIdx,:] docLens = originalDocLens[sortIdx] lens, inds = np.unique(docLens, return_index=True) inds = np.append(inds, [W.shape[0]]) # Initialize some working variables R = W.copy() aI_P = 1./lfv * ssp.eye(P, dtype=dtype) print("Creating posterior covariance of A, this will take some time...") XTX = X.T.dot(X) R_A = XTX R_A = R_A.todense() # dense inverse typically as fast or faster than sparse inverse R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case R_A = la.inv(R_A) print("Covariance matrix calculated, launching inference") diff_m_xa = (means-X.dot(A.T)) means_cov_with_x_a = diff_m_xa.T.dot(diff_m_xa) expMeans = np.zeros((BatchSize, K), dtype=dtype) R = np.zeros((BatchSize, K), dtype=dtype) S = np.zeros((BatchSize, K), dtype=dtype) vocabScale = np.ones(vocab.shape, dtype=dtype) # Iterate over parameters batchIter = 0 for itr in range(iterations): # We start with the M-Step, so the parameters are consistent with our # initialisation of the RVs when we do the E-Step # Update the covariance of the prior diff_a_yv = (A-Y.dot(V)) sigT = 1./lfv * (Y.dot(Y.T)) sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T) sigT += means_cov_with_x_a sigT.flat[::K+1] += varcs.sum(axis=0) # As small numbers lead to instable inverse estimates, we use the # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these # scales whenever we use the inverse of the unscaled covariance sigScale = 1. / (P+D+F) isigScale = 1. 
/ sigScale isigT = la.inv(sigT) debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Update the vocabulary vocab *= vocabScale vocab += vocabPrior vocab = normalizerows_ip(vocab) debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Finally update the parameter V V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A)) debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # # And now this is the E-Step # # Update the distribution on the latent space R_Y_base = aI_P + 1/fv * V.dot(V.T) R_Y = la.inv(R_Y_base) debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) Y = 1./fv * A.dot(V.T).dot(R_Y) debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Update the mapping from the features to topics A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A) debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Update the Variances varcs = 1./((docLens * (K-1.)/K)[:,np.newaxis] + isigScale * isigT.flat[::K+1]) debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) # Faster version? vocabScale[:,:] = 0 means_cov_with_x_a[:,:] = 0 for lenIdx in range(len(lens)): nd = lens[lenIdx] start, end = inds[lenIdx], inds[lenIdx + 1] lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale for d in range(start, end, BatchSize): end_d = min(d + BatchSize, end) span = end_d - d expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:]) R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[d:end_d,:], vocab) S[:span,:] = expMeans[:span, :] * R.dot(vocab.T) # Convert expMeans to a softmax(means) expMeans[:span,:] /= expMeans[:span,:].sum(axis=1)[:span,np.newaxis] mu = X[d:end_d,:].dot(A.T) rhs = mu.dot(isigT) * isigScale rhs += S[:span,:] rhs += docLens[d:end_d,np.newaxis] * means[d:end_d,:].dot(Ab) rhs -= docLens[d:end_d,np.newaxis] * expMeans[:span,:] # here expMeans is actually softmax(means) means[d:end_d,:] = rhs.dot(lhs) # huh?! 
Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:]) R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab, out=R) stepSize = (Tau + batchIter) ** -Kappa batchIter += 1 # Do a gradient update of the vocab vocabScale += (R.T.dot(expMeans[:span,:])).T # vocabScale *= vocab # normalizerows_ip(vocabScale) # # vocabScale += vocabPrior # vocabScale *= stepSize # vocab *= (1 - stepSize) # vocab += vocabScale diff = (means[d:end_d,:] - mu) means_cov_with_x_a += diff.T.dot(diff) # print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max())) debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens) if logFrequency > 0 and itr % logFrequency == 0: modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME) queryState = QueryState(means, expMeans, varcs, docLens) boundValues[bvIdx] = var_bound(DataSet(W, feats=X), modelState, queryState, XTX) boundLikes[bvIdx] = log_likelihood(DataSet(W, feats=X), modelState, queryState) boundIters[bvIdx] = itr perp = perplexity_from_like(boundLikes[bvIdx], docLens.sum()) print (time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[bvIdx])) if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx])) # print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max())) # Check to see if the improvement in the likelihood has fallen below the threshold if bvIdx > 1 and boundIters[bvIdx] > 20: lastPerp = perplexity_from_like(boundLikes[bvIdx - 1], docLens.sum()) if lastPerp - perp < 1: boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, boundLikes, bvIdx) break bvIdx += 1 revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG) means = means[revert_sort,:] varcs = varcs[revert_sort,:] docLens = docLens[revert_sort] return \ ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \ QueryState(means, expMeans, varcs, docLens), \ (boundIters, boundValues, boundLikes)
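
# The mini-batch vocabulary update above uses a decaying step size of the form
# rho_t = (tau0 + t) ** (-kappa), the usual stochastic variational-inference schedule.
# Tau and Kappa are module-level constants not shown in this listing; the values below
# are purely illustrative.
import numpy as np

tau0, kappa = 1.0, 0.7           # kappa in (0.5, 1] keeps the Robbins-Monro conditions
t = np.arange(5)
rho = (tau0 + t) ** -kappa       # roughly [1.00, 0.62, 0.46, 0.38, 0.32]
assert (np.diff(rho) < 0).all()  # the steps shrink monotonically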
def train(modelState, X, W, plan):
    '''
    Trains the model and creates a new query state object for a topic model based
    on side-information. The query state contains all those estimated parameters
    that are specific to the actual data being queried - it must be used in
    conjunction with a model state.

    The parameters are

    modelState - the model state with all the model parameters
    X          - the D x F matrix of side information vectors
    W          - the D x V matrix of word **count** vectors.
    iterations - how long to iterate for
    epsilon    - currently ignored; in future, allows us to stop early.
    logInterval  - the interval between iterations where we calculate and display
                   the log-likelihood bound
    plotInterval - the interval between iterations where we display the
                   log-likelihood bound values calculated at each log-interval
    fastButInaccurate - if true, we may use a pseudo-inverse instead of an inverse
                        when solving for Y when the true inverse is unavailable.

    This returns a tuple of new model-state and query-state. The latter object will
    contain X and W and also

    s      - a D-dimensional vector describing the offset in our bound on the true
             value of ln sum_k e^theta_dk
    lxi    - a DxK matrix used in the above bound, containing the negative Jakkola
             function applied to the quadratic term xi
    lambda - the topics we've inferred for the current batch of documents
    nu     - the variance of topics we've inferred (independent)
    '''
    # Unpack the model state tuple for ease of use and maybe speed improvements
    K, Q, F, P, T, A, varA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq = \
        modelState.K, modelState.Q, modelState.F, modelState.P, modelState.T, \
        modelState.A, modelState.varA, modelState.Y, modelState.omY, modelState.sigY, \
        modelState.sigT, modelState.U, modelState.V, modelState.vocab, \
        modelState.topicVar, modelState.featVar, modelState.lowTopicVar, modelState.lowFeatVar
    iterations, epsilon, logCount, plot, plotFile, plotIncremental, fastButInaccurate = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.plot, plan.plotFile, \
        plan.plotIncremental, plan.fastButInaccurate

    mu0 = 0.0001

    if W.dtype.kind == 'i':      # for the sparseScalarQuotientOfDot() method to work
        W = W.astype(DTYPE)

    # Get ready to plot the evolution of the likelihood, with multiplicative updates (e.g. 1, 2, 4, 8, 16, 32, ...)
    if logCount > 0:
        multiStepSize = np.power(iterations, 1. / logCount)
        logIter = 1
        elbos = []
        likes = []
        iters = []
    else:
        logIter = iterations + 1
    lastVarBoundValue = -sys.float_info.max

    # We'll need the total word count per doc, and total count of docs
    docLen = np.squeeze( np.asarray(W.sum(axis=1)) )   # Force to a one-dimensional array for the np.newaxis trick to work
    D      = len(docLen)

    # No need to recompute this every time
    if X.dtype != DTYPE:
        X = X.astype(DTYPE)
    XTX = X.T.dot(X)

    # Identity matrices that occur
    I_P  = ssp.eye(P, P,         0, DTYPE)
    I_Q  = ssp.eye(Q, Q,         0, DTYPE)
    I_QP = ssp.eye(Q * P, Q * P, 0, DTYPE)
    I_F  = ssp.eye(F, F,         0, DTYPE, "csc")   # X is CSR, XTX is consequently CSC, sparse inverse requires CSC

    T_QP = sp_vec_trans_matrix(Y.shape)

    # Assign initial values to the query parameters
    expLmda = np.exp(rd.random((D, K)).astype(DTYPE))
    nu  = np.ones((D, K), DTYPE)
    s   = np.zeros((D,), DTYPE)
    lxi = negJakkola(np.ones((D, K), DTYPE))

    # If we don't bother optimising either tau or sigma we can just do all this here once only
    tsq        = tauSq
    ssq        = sigmaSq
    overTsq    = 1. / tsq
    overSsq    = 1. / ssq
    overTsqSsq = 1. / (tsq * ssq)

    # TODO the inverse being almost always dense means that it might
    # be faster to convert to dense and use the normal solver, despite
    # the size constraints.
    # varA = 1./K * sla.inv (overTsq * I_F + overSsq * XTX)
    tI_sXTX = (overTsq * I_F + overSsq * XTX).todense()
    omA = la.inv(tI_sXTX)

    scaledWordCounts = W.copy()

    for iteration in range(iterations):

        # =============================================================
        # E-Step
        #   Model dists are q(Theta|A;Lambda;nu) q(A|Y) q(Y) and q(Z)....
        #   Where lambda is the posterior mean of theta.
        # =============================================================

        # Y, sigY, omY
        #
        # If U'U is invertible, use the inverse to convert Y to a Sylvester eqn,
        # which has a much, much faster solver. Recall the update for Y is of the
        # form Y + AYB = C where A = U'U, B = V'V and C = U'AV
        #
        VTV = V.T.dot(V)
        UTU = U.T.dot(U)

        sigy = la.inv(I_QP + overTsqSsq * np.kron(VTV, UTU))
        _quickPrintElbo("E-Step: q(Y) [sigY]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma, expLmda, nu, lxi, s, docLen)

        Y = mu0 + np.reshape(overTsqSsq * sigy.dot(vec(U.T.dot(A).dot(V))), (Q, P), order='F')
        _quickPrintElbo("E-Step: q(Y) [Mean]", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma, expLmda, nu, lxi, s, docLen)

        # A
        #
        # So it's normally A = (UYV' + L'X) omA with omA = inv(t*I_F + s*XTX)
        #   so A inv(omA)   = UYV' + L'X
        #   so inv(omA)' A' = VY'U' + X'L
        # at which point we can use a built-in solve
        #
        # A = (overTsq * U.dot(Y).dot(V.T) + X.T.dot(expLmda).T).dot(omA)
        lmda = np.log(expLmda, out=expLmda)
        A = la.solve(tI_sXTX, X.T.dot(lmda) + V.dot(Y.T).dot(U.T)).T
        np.exp(expLmda, out=expLmda)
        _quickPrintElbo("E-Step: q(A)", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma, expLmda, nu, lxi, s, docLen)

        # lmda_dk, nu_dk, s_d, and xi_dk
        #
        XAT = X.dot(A.T)
        query (VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \
               X, W, \
               VbSideTopicQueryState(expLmda, nu, lxi, s, docLen), \
               scaledWordCounts=scaledWordCounts, \
               XAT=XAT, \
               iterations=10, \
               logInterval=0, plotInterval=0)

        # =============================================================
        # M-Step
        #    Parameters for the softmax bound: lxi and s
        #    The projection used for A: U and V
        #    The vocabulary : vocab
        #    The variances: tau, sigma
        # =============================================================

        # U
        #
        try:
            # order of the final term reversed to handle a numpy bug preventing a dot product from dense to sparse
            U = A.dot(V).dot(Y.T).dot (la.inv( \
                    Y.dot(V.T).dot(V).dot(Y.T) \
                    + (vec_transpose_csr(T_QP, P).T.dot(np.kron(I_QP, VTV)).dot(vec_transpose(T_QP.dot(sigy), P))).T \
                ))
        except np.linalg.linalg.LinAlgError as e:
            print(str(e))
            print("Skipping the update of U for this iteration")
        _quickPrintElbo("M-Step: U", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma, expLmda, nu, lxi, s, docLen)

        # V
        #
        # Temporarily this requires that we re-order sigY until I've implemented a
        # Fortran-order vec transpose in Cython
        sigY = sigY.T.copy()
        V = A.T.dot(U).dot(Y).dot (la.inv ( \
                Y.T.dot(U.T).dot(U).dot(Y) \
                + vec_transpose (sigY, Q).T.dot(np.kron(I_QP, UTU).dot(vec_transpose(I_QP, Q))) \
            ))
        _quickPrintElbo("M-Step: V", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma, expLmda, nu, lxi, s, docLen)

        # vocab
        #
        factor = (scaledWordCounts.T.dot(expLmda)).T   # Gets materialized as a dense matrix...
        vocab *= factor
        normalizerows_ip(vocab)
        _quickPrintElbo("M-Step: \u03A6", iteration, X, W, K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, tau, sigma, expLmda, nu, lxi, s, docLen)

        # =============================================================
        # Handle logging of variational bound, likelihood, etc.
        # =============================================================
        if iteration == logIter:
            modelState = VbSideTopicModelState(K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq)
            queryState = VbSideTopicQueryState(expLmda, nu, lxi, s, docLen)

            elbo   = varBound(modelState, queryState, X, W, None, XAT, XTX)
            likely = log_likelihood(modelState, X, W, queryState)   # recons_error(modelState, X, W, queryState)

            elbos.append(elbo)
            iters.append(iteration)
            likes.append(likely)
            print("Iteration %5d ELBO %15f Log-Likelihood %15f" % (iteration, elbo, likely))

            logIter = min(np.ceil(logIter * multiStepSize), iterations - 1)

            if elbo - lastVarBoundValue < epsilon:
                break
            else:
                lastVarBoundValue = elbo

            if plot and plotIncremental:
                plot_bound(plotFile + "-iter-" + str(iteration), np.array(iters), np.array(elbos), np.array(likes))

    # Right before we end, plot the evolution of the bound and likelihood
    # if we've been asked to do so.
    if plot:
        plot_bound(plotFile, iters, elbos, likes)

    return VbSideTopicModelState (K, Q, F, P, T, A, omA, Y, omY, sigY, sigT, U, V, vocab, sigmaSq, alphaSq, kappaSq, tauSq), \
           VbSideTopicQueryState (expLmda, nu, lxi, s, docLen)
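
# --- Illustrative sketch (not part of the original model code) ---------------
# The q(Y) update above relies on the identity vec(U Y V') = kron(V, U) vec(Y)
# (with column-major vec, hence the order='F' reshape), from which it follows that
# kron(V, U)' kron(V, U) = kron(V'V, U'U) -- the term appearing in
# sigy = inv(I_QP + overTsqSsq * kron(VTV, UTU)). The check below uses the module's
# existing numpy import (np) and arbitrary shapes purely for illustration.
def _sketch_check_kron_vec_identity(seed=0):
    rng = np.random.RandomState(seed)
    U_, V_, Y_ = rng.random((4, 2)), rng.random((5, 3)), rng.random((2, 3))  # K x Q, F x P, Q x P

    # vec(U Y V') == kron(V, U) vec(Y), using Fortran-order (column-major) vec
    lhs = U_.dot(Y_).dot(V_.T).flatten(order='F')
    rhs = np.kron(V_, U_).dot(Y_.flatten(order='F'))
    assert np.allclose(lhs, rhs)

    # Hence kron(V, U)' kron(V, U) == kron(V'V, U'U)
    assert np.allclose(np.kron(V_, U_).T.dot(np.kron(V_, U_)),
                       np.kron(V_.T.dot(V_), U_.T.dot(U_)))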
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data       - the dataset, providing the DxT document-term matrix and the DxD
                 link matrix (the DxF feature matrix is IGNORED in this model)
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the updated query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D, _ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = \
        modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound
        initLikely = log_likelihood(data, modelState, queryState)
        initPerp = perplexity_from_like(initLikely, data.word_count)
        print("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight = W.copy()
    L_weight = L.copy()
    LT_weight = LT.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill(NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis=0) / (D + pseudoObsMeans) \
            if USE_NIW_PRIOR \
            else means.mean(axis=0)
        debugFn(itr, topicMean, "topicMean", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis, :]
            topicCov = diff.T.dot(diff) \
                + pseudoObsVar * np.outer(topicMean, topicMean)
            topicCov += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            topicCov /= (D + pseudoObsVar - K)
        else:
            topicCov = np.cov(means.T) if topicCov.dtype == np.float64 else np.cov(means.T).astype(dtype)
            topicCov += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(topicCov)
            topicCov = np.diag(diag)
            itopicCov = np.diag(1. / diag)
        else:
            itopicCov = la.inv(topicCov)

        debugFn(itr, topicCov, "topicCov", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)
        # print("  topicCov.det = " + str(la.det(topicCov)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeansCol = np.exp(means - means.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * means \
            - (1. / (2 * D + 2)) * means.sum(axis=0) \
            - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(means - means.max(axis=1)[:, np.newaxis])
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)

        # Update the vocabularies
        vocab *= (W_weight.T.dot(expMeansRow)).T   # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (expMeansCol / lse_at_k[np.newaxis, :]).T   # FIXME Dupes line in definition of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn(itr, vocab, "vocab", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Now do likewise for the links, doing it twice to model in-counts (first)
        # and out-counts (second). The difference is the transpose
        LT_weight = sparseScalarQuotientOfDot(LT, expMeansRow, docVocab, out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts = l_intop_sums.sum(axis=0)

        L_weight = sparseScalarQuotientOfDot(L, expMeansRow, docVocab, out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow

        # Reset the means and use them to calculate the weighted sum of means
        meanSum = means.sum(axis=0) * in_counts

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + itopicCov)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (0.5 - 1. / K) + np.diagonal(topicCov))
        debugFn(itr, varcs, "varcs", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        # Update the Means
        rhs = w_top_sums.copy()
        rhs += l_intop_sums
        rhs += l_outtop_sums
        rhs += itopicCov.dot(topicMean)
        rhs += emit_counts[:, np.newaxis] * (means.dot(A) - rowwise_softmax(means))
        rhs += in_counts[np.newaxis, :] * F
        if diagonalPriorCov:
            raise ValueError("Not implemented")
        else:
            for d in range(D):
                rhs_ = rhs[d, :] + (1. / (4 * D + 4)) * (meanSum - in_counts * means[d, :])
                means[d, :] = la.inv(itopicCov + emit_counts[d] * A + np.diag(D * in_counts / (2 * D + 2))).dot(rhs_)
                if np.any(np.isnan(means[d, :])) or np.any(np.isinf(means[d, :])):
                    pass
                if np.any(np.isnan(np.exp(means[d, :] - means[d, :].max()))) or np.any(np.isinf(np.exp(means[d, :] - means[d, :].max()))):
                    pass

        debugFn(itr, means, "means", data, K, topicMean, topicCov, vocab, dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME)
            queryState = QueryState(means, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if False and itr > 100 and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME), \
        QueryState(means, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
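
# --- Illustrative sketch (not part of the original model code) ---------------
# Both training loops above exponentiate means only after subtracting the row (or
# column) maximum and then normalise, i.e. a numerically stable softmax. The helper
# below is a minimal stand-in showing that pattern; it is not the project's
# rowwise_softmax and relies on the module's existing numpy import (np).
def _sketch_rowwise_softmax(m):
    shifted = m - m.max(axis=1)[:, np.newaxis]   # subtract row max so exp() cannot overflow
    e = np.exp(shifted)
    return e / e.sum(axis=1)[:, np.newaxis]      # each row now sums to one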