def testCrossValPerplexityOnRealDataWithLdaGibbsInc(self):
    '''
    Cross-validated perplexity sweep for the Gibbs-sampled LDA model.

    For every topic count K a fresh model is trained on the training part
    of each evaluated fold. Perplexity is then measured on that training
    data and on the evaluation part of a doc_completion_split() of the
    held-out data. For each K two lines are printed (Train and Query), each
    holding the per-fold perplexities followed by their mean.
    '''
    active_folds = 3  # only this many of the NumFolds folds are evaluated
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    # Gibbs expects integers as input, regardless of model dtype
    data.convert_to_dtype(np.int32)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # Sampler configuration: the sample budget scales with the topic count
    train_samples_per_topic, query_samples_per_topic = 10, 2
    thin, debug = 2, False

    for K in (5, 10, 15, 20, 25, 30, 35, 40, 45, 50):
        train_plan = lda_gibbs.newTrainPlan(K * train_samples_per_topic, thin=thin, debug=debug)
        query_plan = lda_gibbs.newTrainPlan(K * query_samples_per_topic, thin=thin, debug=debug)

        train_perps, query_perps = [], []
        for fold in range(active_folds):
            train_data, held_out = data.cross_valid_split(fold, NumFolds)
            est_data, eval_data = held_out.doc_completion_split()

            # Fit on the training part of the fold and score it
            model = lda_gibbs.newModelAtRandom(train_data, K, dtype=dtype)
            train_state = lda_gibbs.newQueryState(train_data, model)
            model, train_result, (_, _, _) = lda_gibbs.train(train_data, model, train_state, train_plan)

            train_like = lda_gibbs.log_likelihood(train_data, model, train_result)
            train_perps.append(perplexity_from_like(train_like, train_data.word_count))

            # Infer topics on the estimation part, score the evaluation part
            query_state = lda_gibbs.newQueryState(est_data, model)
            _, query_result = lda_gibbs.query(est_data, model, query_state, query_plan)

            query_like = lda_gibbs.log_likelihood(eval_data, model, query_result)
            query_perps.append(perplexity_from_like(query_like, eval_data.word_count))

        # The last column of each printed row is the mean over the folds
        train_perps.append(sum(train_perps) / active_folds)
        query_perps.append(sum(query_perps) / active_folds)

        print("K=%d,Segment=Train,%s" % (K, ",".join(map(str, train_perps))))
        print("K=%d,Segment=Query,%s" % (K, ",".join(map(str, query_perps))))
def _debug_with_bound (itr, var_value, var_name, data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, n):
    '''
    Debug hook called after a variable has been updated during inference.

    Warns (on stderr) if the updated value contains NaNs or infinities, or
    has a dtype other than the expected one, then recomputes the variational
    bound and the perplexity from the supplied parameters and reports them
    together with the change in the bound since the previous call.

    The previous bound is kept in the function attribute
    _debug_with_bound.old_bound, which the caller has to initialise before
    the first call. A value of 0 is treated as "no previous bound".

    Params:
    itr       - the current iteration, used only in the report
    var_value - the value that has just been updated
    var_name  - the name of the updated variable, used in the report
    data      - the dataset passed on to var_bound and log_likelihood; its
                word_count is used to compute the perplexity
    dtype     - the dtype var_value is expected to have
    n         - passed as the last argument of QueryState
    The remaining parameters are the model and query parameters from which
    the ModelState and QueryState used for the bound are assembled.
    '''
    if np.isnan(var_value).any():
        printStderr ("WARNING: " + var_name + " contains NaNs")
    if np.isinf(var_value).any():
        printStderr ("WARNING: " + var_name + " contains INFs")
    if "dtype" in dir(var_value) and var_value.dtype != dtype:
        printStderr ("WARNING: dtype(" + var_name + ") = " + str(var_value.dtype))

    model = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, False, dtype, MODEL_NAME)
    query = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, n)

    old_bound = _debug_with_bound.old_bound
    bound = var_bound(data, model, query)
    diff = "" if old_bound == 0 else "%15.4f" % (bound - old_bound)
    _debug_with_bound.old_bound = bound

    addendum = ""
    if var_name == "topicCov":
        try:
            addendum = "log det(topicCov) = %g" % (np.log(la.det(topicCov)))
        except Exception:
            # Narrower than a bare except, so that KeyboardInterrupt and
            # SystemExit still propagate out of a debugging helper.
            addendum = "log det(topicCov) = <undefined>"

    if isnan(bound):
        printStderr ("Bound is NaN")
        return

    perp = perplexity_from_like(log_likelihood(data, model, query), data.word_count)
    message = "Iter %3d Update %-15s Bound %22f (%15s) (%5.0f) %s" % (itr, var_name, bound, diff, perp, addendum)

    # A drop of one or more counts as a degradation. This is the same
    # tolerance as the previous truncating int() comparison, but it does not
    # raise when the difference is NaN or infinite, and the very first call
    # (old_bound == 0) is no longer reported as a degradation.
    degraded = old_bound != 0 and (bound - old_bound) <= -1
    if degraded:
        printStderr (message)
    else:
        print (message)
def _debug_with_bound (itr, var_value, var_name, W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n):
    '''
    Debug hook called after a variable has been updated during inference.

    Warns (on stderr) if the updated value contains NaNs or infinities, then
    recomputes the perplexity and the variational bound from the supplied
    parameters and reports them. The report goes to stderr when the bound is
    lower than on the previous call, and to stdout otherwise.

    The previous bound is kept in the module-level variable "last", where a
    value of 0 is treated as "no previous bound".

    Params:
    itr       - the current iteration, used only in the report
    var_value - the value that has just been updated
    var_name  - the name of the updated variable, used in the report
    W         - the document-term matrix, wrapped in a DataSet for the bound
                and likelihood; W.sum() is used as the word count
    The remaining parameters are the model and query parameters from which
    the ModelState and QueryState used for the bound are assembled.
    '''
    global last

    if np.isnan(var_value).any():
        printStderr ("WARNING: " + var_name + " contains NaNs")
    if np.isinf(var_value).any():
        printStderr ("WARNING: " + var_name + " contains INFs")

    addendum = ""
    if var_name == "sigT":
        try:
            addendum = "det(sigT) = %g" % (la.det(sigT))
        except Exception:
            # Narrower than a bare except, so that KeyboardInterrupt and
            # SystemExit still propagate out of a debugging helper.
            addendum = "det(sigT) = <undefined>"

    # A copy of the means stands in for expMeans in the query state
    model, query = ModelState(K, topicMean, sigT, vocab, vocabPrior, dtype, MODEL_NAME), QueryState(means, means.copy(), varcs, lxi, s, n)
    perp = perplexity_from_like(log_likelihood(DataSet(W), model, query), W.sum())
    bound = var_bound(DataSet(W), model, query)

    # dif > 0 means the bound has dropped since the previous call
    dif = 0 if last == 0 else last - bound
    if dif > 0:
        # Flush both streams first so the error line is not interleaved
        # with pending stdout output
        sys.stdout.flush()
        sys.stderr.flush()
        sys.stderr.write("Iter %3d Update %10s Perp %4.2f Bound %.3f (%+.3f) %s\n" % (itr, var_name, perp, bound, dif, addendum))
        sys.stderr.flush()
    else:
        print ("Iter %3d Update %10s Perp %4.2f Bound %.3f (%+.3f) %s" % (itr, var_name, perp, bound, dif, addendum))

    last = bound
def testPerplexityOnRealData(self):
    '''
    Trains a single 50-topic mtm model on the ACL data, plots how the bound
    and the likelihood evolved during training, shows the learnt topic
    covariance as an image, and finally prints the topic prior, the
    training perplexity and the ten top words of every topic.
    '''
    dtype = np.float64  # DTYPE
    K = 50
    top_word_count = 10

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    with open(AclDictPath, "rb") as dict_file:
        d = pkl.load(dict_file)  # indexed by word id when printing the top words

    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # IDF frequency for when we print out the vocab later
    freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    # Set up and train the model
    model = mtm.newModelAtRandom(data, K, K - 1, dtype=dtype)
    initial_state = mtm.newQueryState(data, model)
    plan = mtm.newTrainPlan(iterations=200, logFrequency=10, fastButInaccurate=False, debug=True)
    model, query, (bndItrs, bndVals, bndLikes) = mtm.train(data, model, initial_state, plan)

    # Bound (left axis, blue) and likelihood (right axis, red) per logged iteration
    fig, bound_axis = plt.subplots()
    bound_axis.plot(bndItrs, bndVals, "b-")
    bound_axis.set_xlabel("Iterations")
    bound_axis.set_ylabel("Bound", color="b")

    like_axis = bound_axis.twinx()
    like_axis.plot(bndItrs, bndLikes, "r-")
    like_axis.set_ylabel("Likelihood", color="r")

    fig.show()
    plt.show()

    # The topic covariance as a grey-scale image
    fig, cov_axis = plt.subplots()
    cov_axis.imshow(model.topicCov, interpolation="nearest", cmap=cm.Greys_r)
    fig.show()
    plt.show()

    # Work out the most likely words of every topic
    vocab = mtm.wordDists(model)
    top_inds_by_topic = [self.topWordInds(vocab[k,:], top_word_count) for k in range(K)]

    like = mtm.log_likelihood(data, model, query)
    perp = perplexity_from_like(like, data.word_count)

    print("Prior %s" % (str(model.topicPrior)))
    print("Perplexity: %f\n\n" % perp)

    for k in range(model.K):
        top_inds = top_inds_by_topic[k]
        print("\nTopic %d\n=============================" % k)
        print("\n".join(
            "%-20s\t%0.4f" % (d[top_inds[c]], vocab[k][top_inds[c]])
            for c in range(top_word_count)))
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of the
    inputs.

    Params:
    data       - the dataset of words, features and links of which only
                 words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx;
                 iterations, fastButInaccurate and debug are read from it

    Returns:
    The model state and query state, in that order. The model state is
    returned unchanged. The query state is a new QueryState holding the
    updated means, expMeans and variances together with the document
    lengths of the input query state.
    '''
    iterations, diagonalPriorCov, debug = queryPlan.iterations, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype

    debugFn = _debug_with_bound if debug else _debug_with_nothing
    W = data.words
    D = W.shape[0]

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)

    # Update the Variances. They depend only on the document lengths and the
    # diagonal of the precision, so this is done once, outside the loop.
    varcs = 1./((n * (K-1.)/K)[:,np.newaxis] + isigT.flat[::K+1])
    debugFn (0, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)

    # A starting perplexity large enough that the first comparison never
    # signals convergence. == rather than "is" so that an equivalent
    # np.dtype instance is recognised as well.
    lastPerp = 1E+300 if dtype == np.float64 else 1E+30
    R = W.copy()
    for itr in range(iterations):
        # Subtracting the row-wise maximum keeps the exponential from overflowing
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        # Update the Means
        rhs = V.copy()
        rhs += n[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        # NOTE: means is handed over as the output buffer here, and is
        # re-assigned in full by either branch below
        rhs -= n[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d,:] = la.inv(isigT + n[d] * A).dot(rhs[d,:])

        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)

        # Stop once the perplexity improves by less than one between
        # successive iterations (after an initial 20 iterations)
        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    # varcs (always) and means (with a diagonal prior covariance) have been
    # rebound to new arrays above, so the input queryState does not hold the
    # result: a fresh query state has to be returned.
    return modelState, QueryState(means, expMeans, varcs, n)
def train (data, model, query, plan):
    '''
    Trains the model by sampling, re-estimating the weights after every
    batch of plan.weightUpdateInterval sampling iterations.

    Params:
    data  - the dataset; data.feats is used to derive the per-document
            alphas (as data.feats times the transposed weights), and
            data.word_count is used for the debug perplexity
    model - the model to train; only implemented for 64-bit floats
    query - the query state holding w_list, z_list and docLens
    plan  - the training plan: iterations, burnIn, thin,
            weightUpdateInterval and debug are used

    Returns:
    A new model state, a new query state and, in place of the bound
    statistics other train functions return, a tuple of three dummy
    single-element zero arrays.

    Raises:
    AssertionError if the model dtype is not np.float64, or if any document
    has 65,536 words or more (per-document topic counts are kept as uint16).
    '''
    iterations, burnIn, thin, weightUpdateInterval, _, debug = \
        plan.iterations, plan.burnIn, plan.thin, plan.weightUpdateInterval, plan.logFrequency, plan.debug
    w_list, z_list, docLens = \
        query.w_list, query.z_list, query.docLens
    K, T, weights, topicPrior, vocabPrior, _, _, _, dtype, name = \
        model.K, model.T, model.weights, model.topicPrior, model.vocabPrior, model.topicSum, model.vocabSum, model.numSamples, model.dtype, model.name

    assert model.dtype == np.float64, "This is only implemented for 64-bit floats"
    D = docLens.shape[0]
    X = data.feats

    assert docLens.max() < 65536, "This only works for documents with fewer than 65,536 words"

    # Count buffers handed to the compiled sampler
    ndk = np.zeros((D,K), dtype=np.uint16)
    nkv = np.zeros((K,T), dtype=np.int32)
    nk = np.zeros((K,), dtype=np.int32)

    num_samples = (iterations - burnIn) // thin
    n_dk_samples = np.zeros((D,K,num_samples), dtype=np.uint16)
    topicSum = np.zeros((D,K), dtype=dtype)
    vocabSum = np.zeros((K,T), dtype=dtype)

    compiled.initGlobalRng(0xC0FFEE)
    compiled.sumSuffStats(w_list, z_list, docLens, ndk, nkv, nk)

    # Burn in
    # NOTE(review): thin is passed as burnIn + 1, presumably so that no
    # sample is retained during burn-in - confirm against compiled.sample
    alphas = X.dot(weights.T)
    if debug: print ("Burning")
    compiled.sample (burnIn, burnIn + 1, w_list, z_list, docLens, \
            alphas, ndk, nkv, nk, n_dk_samples, topicSum, vocabSum, \
            vocabPrior, False, debug)

    # True samples
    if debug: print ("Training")
    sample_count = 0
    for _ in range(0, iterations - burnIn, weightUpdateInterval):
        # The alphas are refreshed in place from the current weights
        alphas[:,:] = X.dot(weights.T)
        sample_count += compiled.sample (weightUpdateInterval, thin, w_list, z_list, docLens, \
                alphas, ndk, nkv, nk, n_dk_samples, topicSum, vocabSum, \
                vocabPrior, False, debug)

        if debug: # Print out the perplexity so far
            likely = log_likelihood(data, \
                ModelState (K, T, weights, topicPrior, vocabPrior, n_dk_samples, topicSum, vocabSum, sample_count, dtype, name), \
                QueryState (w_list, z_list, docLens, topicSum, sample_count))
            # The perplexity is normalised by the word count, not by the dataset
            perp = perplexity_from_like(likely, data.word_count)
            print ("Sample-Count = %3d Perplexity = %7.2f" % (sample_count, perp))

        updateWeights(n_dk_samples, sample_count, X, weights, debug)

    # compiled.freeGlobalRng()
    return \
        ModelState (K, T, weights, topicPrior, vocabPrior, n_dk_samples, topicSum, vocabSum, sample_count, dtype, name), \
        QueryState (w_list, z_list, docLens, topicSum, sample_count), \
        (np.zeros(1), np.zeros(1), np.zeros(1))
def testCrossValPerplexityOnRealDataWithLdaInc(self):
    '''
    Cross-validated perplexity sweep for the lda model.

    For every topic count K a fresh model is trained on the training part
    of each evaluated fold. Perplexity is then measured on that training
    data and on the evaluation part of a doc_completion_split() of the
    held-out data. For each K two lines are printed (Train and Query), each
    holding the per-fold perplexities followed by their mean.
    '''
    active_folds = 3  # only this many of the NumFolds folds are evaluated
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # The same two plans are shared by every topic count and fold
    train_plan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
    query_plan = lda.newTrainPlan(iterations=50, logFrequency=5, fastButInaccurate=False, debug=False)

    for K in (30, 35, 40, 45, 50):
        train_perps, query_perps = [], []
        for fold in range(active_folds):
            train_data, held_out = data.cross_valid_split(fold, NumFolds)

            # Fit on the training part of the fold and score it
            model = lda.newModelAtRandom(train_data, K, dtype=dtype)
            train_state = lda.newQueryState(train_data, model)
            model, train_result, (_, _, _) = lda.train(train_data, model, train_state, train_plan)

            train_like = lda.log_likelihood(train_data, model, train_result)
            train_perps.append(perplexity_from_like(train_like, train_data.word_count))

            # Infer topics on the estimation part, score the evaluation part
            est_data, eval_data = held_out.doc_completion_split()
            query_state = lda.newQueryState(est_data, model)
            model, query_result = lda.query(est_data, model, query_state, query_plan)

            query_like = lda.log_likelihood(eval_data, model, query_result)
            query_perps.append(perplexity_from_like(query_like, eval_data.word_count))

        # The last column of each printed row is the mean over the folds
        train_perps.append(sum(train_perps) / active_folds)
        query_perps.append(sum(query_perps) / active_folds)

        print("K=%d,Segment=Train,%s" % (K, ",".join(map(str, train_perps))))
        print("K=%d,Segment=Query,%s" % (K, ",".join(map(str, query_perps))))
def testPerplexityOnRealDataWithLdaInc(self):
    '''
    Trains one lda model per topic count on the whole ACL dataset, prints
    the training perplexity reached for each topic count, and finally plots
    perplexity against topic count.
    '''
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    with open(AclDictPath, "rb") as dict_file:
        d = pkl.load(dict_file)

    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # IDF frequency for when we print out the vocab later
    freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    topic_counts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    perps = []
    for K in topic_counts:
        # A fresh model, query state and plan for every topic count
        model = lda.newModelAtRandom(data, K, dtype=dtype)
        initial_state = lda.newQueryState(data, model)
        plan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)

        model, query, (bndItrs, bndVals, bndLikes) = lda.train(data, model, initial_state, plan)

        perp = perplexity_from_like(lda.log_likelihood(data, model, query), data.word_count)
        perps.append(perp)

        print("K = %2d : Perplexity = %f\n\n" % (K, perp))

    # Plot the perplexity reached for every topic count
    fig, axis = plt.subplots()
    axis.plot(topic_counts, perps, "b-")
    axis.set_xlabel("Topic Count")
    axis.set_ylabel("Perplexity", color="b")

    fig.show()
    plt.show()
def link_split_map (data, mdl, sample_model, train_plan, folds, model_dir = None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using mean average-precision.

    Cross validation may be used, but note we're always evaluating on
    training data.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start
    of each cross-validation run
    :param train_plan: the training plan (number of iterations etc.)
    :param folds: the number of folds to cross validate; must be greater
    than one
    :param model_dir: if not none, and folds > 1, the models are stored in
    this directory.
    :return: the list of model files stored
    '''
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"

    def prepareForTraining(split_data):
        # Undirected link predictors are trained on an undirected, binary
        # copy of the split; the split itself is left untouched.
        if not mdl.is_undirected_link_predictor():
            return split_data

        result = split_data.copy()
        result.convert_to_undirected_graph()
        result.convert_to_binary_link_matrix()
        return result

    for fold in range(folds):
        model = mdl.newModelFromExisting(sample_model)
        train_data, query_data = data.link_prediction_split(symmetric=False)
        # make symmetric, if necessary, after split, so we
        # can compare symmetric with non-symmetric models
        train_data = prepareForTraining(train_data)

        train_tops = mdl.newQueryState(train_data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)

        # Named so as not to shadow the builtin map()
        mean_avg_prec = mean_average_prec (query_data.links, predicted_link_probs)
        print ("Fold %2d: Mean-Average-Precision %6.3f" % (fold, mean_avg_prec))

        # The training topics are passed twice: there is no separate query state
        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    return model_files
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data       - the dataset; data.words (the document-term matrix) is the
                 only part read directly here, the dataset as a whole is
                 passed on to var_bound and log_likelihood
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    A tuple of three arrays: the iterations at which values were logged,
    the value of var_bound at each, and the value of log_likelihood at each
    '''
    W = data.words
    D,_ = W.shape

    # Unpack the the structs, for ease of access and efficiency
    # NOTE: epsilon is unpacked but not used anywhere in this function
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill (NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
            if USE_NIW_PRIOR \
            else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis,:]
            sigT = diff.T.dot(diff) \
                + pseudoObsVar * np.outer(topicMean, topicMean)
            sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            sigT /= (D + pseudoObsVar - K)
        else:
            sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype)
            sigT += np.diag(varcs.mean(axis=0))

        # With a diagonal covariance the inverse is just the reciprocal of
        # the diagonal
        if diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1./ diag)
        else:
            isigT = la.inv(sigT)

        # FIXME Undo debug
        # NOTE(review): these two lines replace the covariance estimated
        # just above with the identity on every iteration. The FIXME suggests
        # this is left-over debugging code - confirm before relying on the
        # sigT returned by this function.
        sigT = np.eye(K)
        isigT = la.inv(sigT)

        debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        # print(" sigT.det = " + str(la.det(sigT)))

        # Building Blocks - temporarily replaces means with exp(means)
        # (the row-wise maximum is subtracted before exponentiating)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        # Reset the means to their original form, and log effect of vocab update
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + isigT)^{-1}
        # NOTE(review): the formula above uses isigT, but the code adds the
        # diagonal of sigT. The two coincide while sigT is forced to the
        # identity above - confirm which one is intended.
        varcs = np.reciprocal(docLens[:,np.newaxis] * (K-1.)/K + np.diagonal(sigT))
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        # Update the Means
        rhs = V.copy()
        rhs += docLens[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        # means is handed over as the output buffer here; it is re-assigned
        # in full by either branch below
        rhs -= docLens[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            # One KxK inverse per document
            for d in range(D):
                means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :])

        # means -= (means[:,0])[:,np.newaxis]

        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                # (what is compared is the perplexity at the last two logged
                # iterations, which have to differ by less than 1.0)
                if itr > 100 and len(likelyValues) > 3 \
                    and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
def train (dataset, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    dataset    - the dataset of words, features and links of which only
                 words are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    A tuple of three arrays: the iterations at which values were logged,
    the value of var_bound at each, and the value of log_likelihood at each
    '''
    W = dataset.words
    D,_ = W.shape

    # Unpack the the structs, for ease of access and efficiency
    iterations, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Book-keeping for logs. Values are logged at itr = 0, logFrequency,
    # 2 * logFrequency, ... which is ceil(iterations / logFrequency) times.
    # Rounding down instead leaves the arrays one slot short whenever
    # iterations is not a multiple of logFrequency. A non-positive
    # logFrequency disables logging, so no slots are needed at all.
    logCount = (iterations + logFrequency - 1) // logFrequency if logFrequency > 0 else 0
    boundIters = np.zeros(shape=(logCount,))
    boundValues = np.zeros(shape=(logCount,))
    likelyValues = np.zeros(shape=(logCount,))
    bvIdx = 0

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()

    s.fill(0)
    priorSigt_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigt_diag.fill (0.1)

    kappa = K + 2

    expMeans = means.copy()

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis=0) / (D + kappa) \
            if USE_NIW_PRIOR \
            else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        sigT, _ = oas(means, assume_centered=False)
        if dtype is not np.float64:
            sigT = sigT.astype(dtype)
        sigT += np.diag(varcs.mean(axis=0))

        if USE_NIW_PRIOR:
            sigT.flat[::K+1] += priorSigt_diag
            sigT += (kappa * D)/(kappa + D) * np.outer(topicMean, topicMean)

        # Building blocks...
        # 1/4 Create the precision matrix from the covariance
        # NOTE: the "True or" forces the diagonal path regardless of the
        # plan, so the full inverse below is currently never taken
        if True or diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1. / diag)
        else:
            isigT = la.inv(sigT)

        debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # 2/4 temporarily replace means with exp(means)
        # (the row-wise maximum is subtracted before exponentiating)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        # 3/4 Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)

        # 4/4 Log effect of vocab update
        debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances
        varcs = np.reciprocal(n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the Means
        vMat = (s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + isigT.dot(topicMean)
        means = varcs * rhsMat
        # Every row is shifted so that its first component is zero
        means -= (means[:,0])[:,np.newaxis]
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # Update the approximation parameters
        lxi = 2 * negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "lxi", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        # s can sometimes grow unboundedly
        # If so Bouchard's suggested approach of fixing it at zero
        # (s was zeroed before the loop and is not updated here)
        debugFn (itr, s, "s", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)

            boundValues[bvIdx] = var_bound(dataset, modelState, queryState)
            likelyValues[bvIdx] = log_likelihood(dataset, modelState, queryState)
            boundIters[bvIdx] = itr
            perp = perplexity_from_like(likelyValues[bvIdx], n.sum())

            print (time.strftime('%X') + " : Iteration %5d: Perplexity %4.2f Bound %10.2f " % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] >= 30:
                lastPerp = perplexity_from_like(likelyValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    # Converged: cut the logs down to the entries actually written
                    boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likelyValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likelyValues)
            bvIdx += 1

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likelyValues)
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Documents are processed in ascending order of document length, in
    batches of at most BatchSize documents that share the same length. The
    means and variances are put back into the original document order
    before being returned.

    Params:
    data       - the dataset of words, features and links of which only
                 words and features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    A tuple of three arrays: the iterations at which values were logged,
    the value of var_bound at each, and the value of log_likelihood at each
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the the structs, for ease of access and efficiency
    iterations, logFrequency, debug = trainPlan.iterations, trainPlan.logFrequency, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # Book-keeping for logs. Values are logged at itr = 0, logFrequency,
    # 2 * logFrequency, ... which is ceil(iterations / logFrequency) times.
    # Rounding down instead leaves the arrays one slot short whenever
    # iterations is not a multiple of logFrequency. A non-positive
    # logFrequency disables logging, so no slots are needed at all.
    logCount = (iterations + logFrequency - 1) // logFrequency if logFrequency > 0 else 0
    boundIters = np.zeros(shape=(logCount,))
    boundValues = np.zeros(shape=(logCount,))
    boundLikes = np.zeros(shape=(logCount,))
    bvIdx = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG) # sort needs to be stable in order to be reversible
    W = W[sortIdx,:] # deep sorted copy
    X = X[sortIdx,:]
    means, varcs = means[sortIdx,:], varcs[sortIdx,:]

    docLens = originalDocLens[sortIdx]

    # lens holds the distinct document lengths; documents of length lens[i]
    # occupy rows inds[i] to inds[i + 1] of the sorted matrices
    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])

    # Initialize some working variables
    aI_P = 1./lfv * ssp.eye(P, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    R_A = R_A.todense() # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    diff_m_xa = (means-X.dot(A.T))
    means_cov_with_x_a = diff_m_xa.T.dot(diff_m_xa)

    # Per-batch scratch buffers: row i of each corresponds to document d + i
    # of the batch starting at document d
    expMeans = np.zeros((BatchSize, K), dtype=dtype)
    R = np.zeros((BatchSize, K), dtype=dtype)
    S = np.zeros((BatchSize, K), dtype=dtype)
    vocabScale = np.ones(vocab.shape, dtype=dtype)

    # Iterate over parameters
    batchIter = 0
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        sigT = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += means_cov_with_x_a
        sigT.flat[::K+1] += varcs.sum(axis=0)

        # As small numbers lead to instable inverse estimates, we use the
        # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these
        # scales whenever we use the inverse of the unscaled covariance
        sigScale = 1. / (P+D+F)
        isigScale = 1. / sigScale

        isigT = la.inv(sigT)
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Finally update the parameter V
        V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        #
        # And now this is the E-Step
        #

        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the mapping from the features to topics
        A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the Variances
        varcs = 1./((docLens * (K-1.)/K)[:,np.newaxis] + isigScale * isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Update the means and the vocabulary, one batch of equal-length
        # documents at a time
        vocabScale[:,:] = 0
        means_cov_with_x_a[:,:] = 0
        for lenIdx in range(len(lens)):
            nd = lens[lenIdx]
            start, end = inds[lenIdx], inds[lenIdx + 1]
            # A single inverse serves every document of this length
            lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale

            for d in range(start, end, BatchSize):
                end_d = min(d + BatchSize, end)
                span = end_d - d

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                # expMeans is a (BatchSize, K) scratch buffer, so it has to be
                # addressed by position within the batch (:span), not by the
                # global document offsets (d:end_d)
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab)
                S[:span,:] = expMeans[:span, :] * R.dot(vocab.T)

                # Convert expMeans to a softmax(means)
                expMeans[:span,:] /= expMeans[:span,:].sum(axis=1)[:span,np.newaxis]

                mu = X[d:end_d,:].dot(A.T)
                rhs = mu.dot(isigT) * isigScale
                rhs += S[:span,:]
                rhs += docLens[d:end_d,np.newaxis] * means[d:end_d,:].dot(Ab)
                rhs -= docLens[d:end_d,np.newaxis] * expMeans[:span,:] # here expMeans is actually softmax(means)

                means[d:end_d,:] = rhs.dot(lhs) # huh?! Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab, out=R)

                # The step size shrinks with every batch processed
                stepSize = (Tau + batchIter) ** -Kappa
                batchIter += 1

                # Do a gradient update of the vocab
                vocabScale = (R.T.dot(expMeans[:span,:])).T
                vocabScale *= vocab
                normalizerows_ip(vocabScale)
                vocabScale *= stepSize
                vocab *= (1 - stepSize)
                vocab += vocabScale

                diff = (means[d:end_d,:] - mu)
                means_cov_with_x_a += diff.T.dot(diff)

        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues[bvIdx] = var_bound(DataSet(W, feats=X), modelState, queryState, XTX)
            boundLikes[bvIdx] = log_likelihood(DataSet(W, feats=X), modelState, queryState)
            boundIters[bvIdx] = itr
            perp = perplexity_from_like(boundLikes[bvIdx], docLens.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 20:
                lastPerp = perplexity_from_like(boundLikes[bvIdx - 1], docLens.sum())
                if lastPerp - perp < 1:
                    # Converged: cut all three logs down to the entries
                    # actually written. The result is bound to boundLikes,
                    # the name that is returned below.
                    boundIters, boundValues, boundLikes = clamp (boundIters, boundValues, boundLikes, bvIdx)
                    break
            bvIdx += 1

    # Undo the sort by document length
    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means = means[revert_sort,:]
    varcs = varcs[revert_sort,:]
    docLens = docLens[revert_sort]

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.

    Params:
    data       - the dataset of words, features and links of which only words
                 and features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset. Its means,
                 expMeans and varcs arrays are overwritten in place.
    queryPlan  - used in this case as we need to tighten up the approx; only
                 its iterations and debug fields are read here

    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, debug = queryPlan.iterations, queryPlan.debug
    means, expMeans, varcs, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, \
        modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, \
        modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # Debugging
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # Necessary values
    isigT = la.inv(sigT)

    # The matrix inverted for document d depends on d only through n[d], and
    # neither isigT nor Ab is modified below, so one cache keyed by document
    # length serves every document in every iteration.
    inverses = dict()

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Counts of topic assignments
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # the variance
        varcs[:] = 1./((n * (K-1.)/K)[:, np.newaxis] + isigT.flat[::K+1])
        debugFn (itr, varcs, "query-varcs", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)

        # Update the Means
        rhs = X.dot(A.T).dot(isigT)
        rhs += S
        rhs += n[:, np.newaxis] * means.dot(Ab)
        # means is handed over as the output buffer here (presumably it ends up
        # holding its own softmax - confirm against rowwise_softmax); every row
        # of means is overwritten by the per-document solve just below.
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)

        # Long version
        for d in range(D):
            nd = n[d]
            if nd not in inverses:
                inverses[nd] = la.inv(isigT + nd * Ab)
            means[d, :] = inverses[nd].dot(rhs[d, :])
        debugFn (itr, means, "query-means", W, X, None, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, n)

        # Stop once an iteration improves perplexity by less than one point
        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState  # query vars altered in-place
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data       - the dataset of words, features and links of which only words
                 and features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    A tuple (boundIters, boundValues, likeValues) of the iterations at which
    measurements were logged, and the bound and log-likelihood at each.
    '''
    W, X = data.words, data.feats

    assert W.dtype == modelState.dtype
    assert X.dtype == modelState.dtype

    D, _ = W.shape

    # Unpack the structs, for ease of access and efficiency
    iterations, logFrequency, debug = trainPlan.iterations, trainPlan.logFrequency, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = \
        queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, \
        modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, \
        modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Book-keeping for logs. Measurements are taken at itr = 0, logFrequency,
    # 2 * logFrequency, ... so ceil(iterations / logFrequency) slots are needed;
    # a logFrequency of zero disables logging altogether.
    logPoints = -(-iterations // logFrequency) if logFrequency > 0 else 0
    boundIters = np.zeros(shape=(logPoints,))
    boundValues = np.zeros(shape=(logPoints,))
    likeValues = np.zeros(shape=(logPoints,))
    bvIdx = 0

    _debug_with_bound.old_bound = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing

    def report(var_value, var_name):
        # The enclosing variables are looked up when this is called, so the
        # debug hook always sees the most recent value of every parameter.
        debugFn (itr, var_value, var_name, W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

    # Initialize some working variables
    R = W.copy()
    sigT_regularizer = 0.001

    aI_P = 1./lfv * ssp.eye(P, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    # XTX is handed to var_bound() later on, so the diagonal is added to a
    # separate dense matrix rather than to XTX itself.
    if ssp.issparse(XTX):
        R_A = XTX.todense()  # dense inverse typically as fast or faster than sparse inverse
    else:
        R_A = XTX.copy()
    R_A.flat[::F+1] += 1./fv  # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    s.fill(0)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        diff_m_xa = (means-X.dot(A.T))

        sigT = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += diff_m_xa.T.dot(diff_m_xa)
        sigT.flat[::K+1] += varcs.sum(axis=0)
        sigT /= (P+F+D)
        sigT.flat[::K+1] += sigT_regularizer

        # Diagonalize it
        sigT = np.diag(sigT.flat[::K+1])
        # and invert it.
        isigT = np.diag(np.reciprocal(sigT.flat[::K+1]))
        report(sigT, "sigT")

        # Building Blocks
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)

        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        # Log effect of vocab update
        report(vocab, "vocab")

        # Finally update the parameter V
        V = la.inv(R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        report(V, "V")

        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        report(R_Y, "R_Y")

        Y = 1./fv * A.dot(V.T).dot(R_Y)
        report(Y, "Y")

        # Update the mapping from the features to topics
        A = (1./fv * (Y).dot(V) + (X.T.dot(means)).T).dot(R_A)
        report(A, "A")

        # Update the Means
        vMat = (s[:, np.newaxis] * lxi - 0.5) * n[:, np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT)  # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis, :] + n[:, np.newaxis] * lxi)  # inverse of D diagonal matrices...
        means = lhsMat * rhsMat  # as LHS is a diagonal matrix for all d, it's equivalent
                                 # to doing a hadamard product for all d
        report(means, "means")

        # Update the Variances
        varcs = 1./(n[:, np.newaxis] * lxi + isigT.flat[::K+1])
        report(varcs, "varcs")

        # Update the approximation parameters
        lxi = 2 * ctm.negJakkolaOfDerivedXi(means, varcs, s)
        report(lxi, "lxi")

        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
        # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        # report(s, "s")

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)

            boundValues[bvIdx] = var_bound(data, modelState, queryState, XTX)
            likeValues[bvIdx] = log_likelihood(data, modelState, queryState)
            boundIters[bvIdx] = itr

            perp = perplexity_from_like(likeValues[bvIdx], n.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.2f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 50:
                lastPerp = perplexity_from_like(likeValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    # All three logs are clamped together so that the returned
                    # arrays stay aligned with one another.
                    boundIters, boundValues, likeValues = clamp (boundIters, boundValues, likeValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likeValues)
            bvIdx += 1

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likeValues)
def query(data, modelState, queryState, queryPlan):
    '''
    Infer the per-document topic variables of a _trained_ model for a new
    set of inputs, leaving the model parameters as they are.

    Params:
    data       - the dataset of words, features and links; only the words and
                 features are read by this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - controls the number of iterations and debugging; needed as
                 the approximation has to be tightened up iteratively

    Returns:
    The model state and a query state, in that order. The model state is the
    object passed in; the query state is newly constructed from the updated
    variables.
    '''
    plan = queryPlan
    iterations, epsilon, logFrequency = plan.iterations, plan.epsilon, plan.logFrequency
    fastButInaccurate, debug = plan.fastButInaccurate, plan.debug

    means, expMeans, varcs = queryState.means, queryState.expMeans, queryState.varcs
    lxi, s, n = queryState.lxi, queryState.s, queryState.docLens

    F, P, K, A, R_A, fv, Y = \
        modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y
    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = \
        modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, \
        modelState.vocabPrior, modelState.dtype

    # Inverse of the topic covariance, needed by every update below
    isigT = la.inv(sigT)
    W, X = data.words, data.feats

    # The debug hook wants the inner product of the feature matrix, which is
    # only worth computing when debugging is actually switched on
    if not debug:
        XTX = None
        debugFn = _debug_with_nothing
    else:
        XTX = X.T.dot(X)
        debugFn = _debug_with_bound
        _debug_with_bound.old_bound=0

    def report(value, label):
        # Reads the enclosing variables at call time, i.e. their latest values
        debugFn (itr, value, label, W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

    prevPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Estimate Z_dvk (expMeans is refreshed in its own buffer)
        rowMax = means.max(axis=1)[:,np.newaxis]
        expMeans = np.exp(means - rowMax, out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # Means: each document's system is diagonal, so the D inverses reduce
        # to an element-wise reciprocal followed by a Hadamard product
        linearTerm = (2 * s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = linearTerm + X.dot(A.T).dot(isigT) # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis,:] + n[:,np.newaxis] * 2 * lxi)
        means = lhsMat * rhsMat
        report(means, "query-means")

        # Variances
        varcs = 1./(2 * n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        report(varcs, "query-varcs")

        # Approximation parameters
        lxi = ctm.negJakkolaOfDerivedXi(means, varcs, s)
        report(lxi, "query-lxi")

        # s is deliberately left at its current value: it can grow without
        # bound, so Bouchard's suggestion of keeping it fixed is followed.
        #
        # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        # report(s, "s")

        # Give up once perplexity improves by less than one point per pass
        logLike = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, lxi, s, n))
        perp = perplexity_from_like(logLike, data.word_count)
        if itr > 20 and prevPerp - perp < 1:
            break
        prevPerp = perp

    updatedQuery = QueryState (means, expMeans, varcs, lxi, s, n)
    return modelState, updatedQuery
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint, by alternately sampling topic memberships and
    word distributions.

    Params:
    data  - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. Its word distributions are handed
            to the sampling helpers, which may update them in place.
    query - the query results - essentially all the "local" variables
            matched to the given observations. Its topic distributions are
            likewise handed to the sampling helpers.
    plan  - how to execute the training process (e.g. iterations,
            burn-in, log-interval etc.)
    updateVocab - accepted for interface compatibility; not read here

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    A tuple (iterations, bounds, likelihoods) of arrays, one entry per logged
    iteration

    Raises:
    ValueError if any document has no words
    '''
    iterations, logFrequency, debug, burnIn = \
        plan.iterations, plan.logFrequency, plan.debug, plan.burnIn
    docLens, topicDists = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    W = data.words
    D, T = W.shape

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    iters, bnds, likes = [], [], []
    sampleCount = 0
    wordDistSamples = np.zeros((K, T), dtype=np.float64)
    topicDistSamples = np.zeros((D, K), dtype=np.float64)

    for itr in range(iterations + burnIn):
        topicDists = sample_memberships(W, topicPrior, wordDists, topicDists)
        wordDists = sample_dirichlet(W, vocabPrior, topicDists, wordDists)

        # NOTE(review): these running sums are accumulated but never read; the
        # states returned below hold the final sample only. Presumably an
        # average over samples was intended - confirm before relying on it.
        if is_sampling_iteration(itr, plan):
            wordDistSamples += wordDists
            topicDistSamples += topicDists
            sampleCount += 1

        # A logFrequency of zero means "no periodic logging", so it must not
        # reach the modulo. Debug mode logs every iteration regardless.
        if debug or (logFrequency > 0 and itr % logFrequency == 0):
            m = ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, model.name)
            q = QueryState(query.docLens, topicDists, True)

            iters.append(itr)
            bnds.append(var_bound(data, m, q))
            likes.append(log_likelihood(data, m, q))

            perp = perplexity_from_like(likes[-1], W.sum())
            print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" % (itr, perp, bnds[-1]))

            # Early stopping is currently disabled:
            # if len(iters) > 2 and iters[-1] > 50:
            #     lastPerp = perplexity_from_like(likes[-2], W.sum())
            #     if lastPerp - perp < 1:
            #         break

    return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, model.name), \
           QueryState(query.docLens, topicDists, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
def _old_train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data  - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. Its word distributions are
            MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations
    plan  - how to execute the training process (e.g. iterations,
            log-interval, batch size etc.)
    updateVocab - if False only the per-document topics are re-estimated; the
            vocabulary, logging, early stopping and hyperparameter updates
            are all skipped

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    A tuple (iterations, bounds, likelihoods) of arrays

    Raises:
    ValueError if any document has no words, or if the plan names an unknown
    rate algorithm
    '''
    iterations, logFrequency, debug, batchSize = \
        plan.iterations, plan.logFrequency, plan.debug, plan.batchSize
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")
    assert model.dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior)

    W = data.words
    D, T = W.shape

    iters, bnds, likes = [], [], []

    # A few parameters for handling adaptive step-sizes in SGD
    grad = 0
    grad_rate = 1
    log_likely = 0  # complete dataset likelihood for gradient adjustments
    stepSize = np.array([1.] * K, dtype=model.dtype)
    # g and gg are handed to _step_sizes() for every rate algorithm, but only
    # the variance-based branch below ever assigns g, and nothing assigns gg,
    # so both need a starting value.
    g = 0
    gg = 0

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e for the memberships z which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part.
    diWordDistSums = np.empty((K,), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)
    wordUpdates = wordDists.copy() if batchSize > 0 else None
    batchProcessCount = 0

    # Amend the name if batchSize > 0, implying we're using SGD
    modelName = "lda/svbp/%s" % _sgd_desc(plan) \
        if batchSize > 0 else model.name

    print (modelName)

    for itr in range(iterations):
        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists, out=diWordDists)

        if updateVocab:
            # Perform inference, updating the vocab
            if batchSize == 0:
                wordDists[:, :] = vocabPrior
            else:
                wordUpdates[:, :] = 0

            for d in range(D):
                batchProcessCount += 1
                # if debug and d % 100 == 0: printAndFlushNoNewLine(".")
                wordIdx, z = _update_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums)
                wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

                if plan.rate_algor == RateAlgorAmaria:
                    log_likely += 0
                elif plan.rate_algor == RateAlgorVariance:
                    g = wordDists.mean(axis=0) + vocabPrior
                    grad *= (1 - grad_rate)
                    grad += grad_rate * wordDists
                    grad += grad_rate * vocabPrior
                    gg += 0
                elif plan.rate_algor != RateAlgorTimeKappa:
                    raise ValueError("Unknown rate algorithm " + str(plan.rate_algor))

                if batchSize > 0 and batchProcessCount == batchSize:
                    batch_index = (itr * D + d) / batchSize  # TODO Will not be right if batchSize is not a multiple of D
                    stepSize = _step_sizes(stepSize, batch_index, g, gg, log_likely, plan)
                    wordDists *= (1 - stepSize)
                    wordDists += stepSize * vocabPrior

                    stepSize *= float(D) / batchSize
                    wordUpdates *= stepSize
                    wordDists += wordUpdates

                    # The digamma caches must follow the new word distributions
                    diWordDistSums[:] = wordDists.sum(axis=1)
                    fns.digamma(diWordDistSums, out=diWordDistSums)
                    fns.digamma(wordDists, out=diWordDists)

                    wordUpdates[:, :] = 0
                    batchProcessCount = 0
                    log_likely = 0

                    if debug:
                        bnds.append(_var_bound_internal(data, model, query))
                        likes.append(_log_likelihood_internal(data, model, query))

                        perp = perplexity_from_like(likes[-1], W.sum())
                        print("Iteration %d, after %d docs: Train Perp = %4.0f  Bound = %.3f" % (itr, batchSize, perp, bnds[-1]))
                        sys.stdout.flush()

            # Log bound and then determine if we can stop early. A logFrequency
            # of zero disables periodic logging and must not reach the modulo.
            if debug or (logFrequency > 0 and itr % logFrequency == 0):
                iters.append(itr)
                bnds.append(_var_bound_internal(data, model, query))
                likes.append(_log_likelihood_internal(data, model, query))

                perp = perplexity_from_like(likes[-1], W.sum())
                print ("Iteration %d : Train Perp = %4.0f  Bound = %.3f" % (itr, perp, bnds[-1]))

                if len(iters) > 2 and (iters[-1] > 20 or (iters[-1] > 2 and batchSize > 0)):
                    lastPerp = perplexity_from_like(likes[-2], W.sum())
                    if lastPerp - perp < 1:
                        print ("Converged, existing early")
                        break

            # Update hyperparameters (do this after bound, to make sure bound
            # calculation is internally consistent)
            if HyperUpdateEnabled and itr > 0 and itr % HyperParamUpdateInterval == 0:
                if debug:
                    print("Topic Prior was " + str(topicPrior))
                _updateTopicHyperParamsFromMeans(model, query)
                if debug:
                    print("Topic Prior is now " + str(topicPrior))
        else:
            # Vocabulary is frozen: only refresh the per-document topics
            for d in range(D):
                _ = _update_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums)

    topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName), \
           QueryState(docLens, topicMeans, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs. The assumption is that there are no out-links associated with
    the documents, and that no documents in the training set link to any of
    these documents in the query set.

    The word and link vocabularies are kept fixed. Due to the assumption of no
    in-links, we don't learn the prior in-document covariance, nor the
    posterior distribution over in-links. Also, none of the model parameters
    is modified; only the out-means and out-variances of the query are.

    Params:
    data       - the dataset of words, features and links; the words are
                 used, together with the per-document out-link counts
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset. Its
                 outMeans array is updated in place.
    queryPlan  - used in this case as we need to tighten up the approx

    Returns:
    The model state and query state, in that order. Both are freshly
    constructed objects; the model parameters they hold are unchanged.
    '''
    W = data.words
    D, _ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    # Unpack the structs, for ease of access and efficiency
    iterations, logFrequency = queryPlan.iterations, queryPlan.logFrequency
    outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens = \
        queryState.outMeans, queryState.outVarcs, queryState.inMeans, \
        queryState.inVarcs, queryState.inDocCov, queryState.docLens
    K, topicMean, topicCov, outDocCov, vocab, A, dtype = \
        modelState.K, modelState.topicMean, modelState.topicCov, modelState.outDocCov, \
        modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Initialize some working variables
    W_weight = W.copy()

    outDocPre = 1./outDocCov
    inDocPre = np.reciprocal(inDocCov)
    itopicCov = la.inv(topicCov)

    # The matrix inverted for document d depends on d only through
    # emit_counts[d], and none of its other ingredients changes below, so
    # each distinct inverse is computed once and reused.
    outCovs = dict()

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        expMeansRow = np.exp(outMeans - outMeans.max(axis=1)[:, np.newaxis])
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        # Update the posterior variances
        outVarcs = np.reciprocal(emit_counts[:, np.newaxis] * (K-1)/(2*K) + (outDocPre + inDocPre[:, np.newaxis]) * np.diagonal(itopicCov)[np.newaxis, :])

        # Update the out-means and in-means
        out_rhs = w_top_sums.copy()  # No link outputs to model.
        out_rhs += itopicCov.dot(topicMean) / outDocCov
        out_rhs += emit_counts[:, np.newaxis] * (outMeans.dot(A) - rowwise_softmax(outMeans))

        for d in range(D):
            count = emit_counts[d]
            if count not in outCovs:
                outCovs[count] = la.inv(outDocPre * itopicCov + count * A)
            outMeans[d, :] = outCovs[count].dot(out_rhs[d, :])

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME)
            queryState = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)

            boundValues.append(0)  # the bound itself is not evaluated while querying
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            perp = perplexity_from_like(likelyValues[-1], docLens.sum())
            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perp))

            # Check to see if the improvement in the perplexity has fallen below the threshold
            if len(boundValues) > 1 and itr > MinItersBeforeEarlyStop:
                prevPerp = perplexity_from_like(likelyValues[-2], docLens.sum())
                if abs(perp - prevPerp) < 1.0:
                    break

    return \
        ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \
        QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)
def outsample_lro_style_prec_rec (data, mdl, sample_model, train_plan, feature_mask, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Take a feature list. Train on all documents where none of those features
    are set. Remove the first element from the feature list, query all documents
    with that feature set, and then evaluate link prediction. Repeat until
    feature-list is empty.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. function
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.). It is also
            used as the plan for the query step.
    :param feature_mask: the list of features used to separate training from query.
            This is a list of tuples, the left side is the feature label, the right
            side is the feature's index as understood by data.split_on_feature().
            The list is not modified.
    :param model_dir: if not none, the models are stored in this directory.
    :param ldaModel: for those models that utilise an LDA component, a pre-trained
            LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []

    # Iterate over a private copy so the caller's list is left intact
    folds = list(feature_mask)
    if not folds:
        # Nothing to train or evaluate, and the averages below would divide by zero
        print ("No features supplied, nothing to evaluate")
        return model_files

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    for fold, (feat_label, feat_id) in enumerate(folds):
        # Prepare the training and query data: the split uses this feature and
        # every feature still to come, those before it having been dropped
        feature_mask_indices = [i for _, i in folds[fold:]]
        train_data, query_data, train_indices = data.split_on_feature(feature_mask_indices)
        print ("\n\nFeature: %s\n" % (feat_label,) + ("-" * 80))

        train_data = prepareForTraining(train_data)  # make symmetric, if necessary, after split, so we
                                                     # can compare symmetric with non-symmetric models

        # Train the model
        if model_uses_lda(sample_model):
            ldaModelSubset, ldaTopicsSubset = subsetLda(ldaModel, ldaTopics, train_indices)
            model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModelSubset)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopicsSubset)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        query_tops = mdl.newQueryState(query_data, model)
        _, query_tops = mdl.query(query_data, model, query_tops, train_plan)

        min_link_probs = mdl.min_link_probs(model, train_tops, query_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, query_tops, min_link_probs)
        expected_links = query_data.links

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print ("  Mean-Precisions for feature %s (#%d)" % (feat_label, feat_id), end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall", recs, doc_counts, ms)

        # Both calls start from the same running doc-counts; only the second
        # one's updated counts are kept.
        combi_precs, _ = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs, combi_dcounts = combine_map(combi_recs, combi_dcounts, recs, doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        mean_avg_prec = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % mean_avg_prec)
        map_sum += mean_avg_prec * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        # NOTE(review): train_tops is passed twice; presumably the second one
        # was meant to be query_tops - confirm against save_if_necessary().
        model_files = save_if_necessary(model_files, model_dir, model, data, feat_id, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall", combi_recs, combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Training is run in segments of trainPlan.logFrequency iterations. After
    each segment the variational bound and the log-likelihood are recorded
    and convergence is tested; a final batch then runs the remaining
    iterations.

    Params:
    data       - the dataset of words, features and links of which only words
                 are used in this model
    modelState - the actual LDA model. In a training run this will be mutated
                 in place, and then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated
                 in-place and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    A tuple of three arrays: the iterations at which measurements were taken,
    the bound values and the log-likelihood values.

    Raises:
    ValueError if any document has no words.
    '''
    iterations, epsilon, logFrequency = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W   = data.words
    D,T = W.shape

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")

    # Book-keeping for logs. There is always at least one log point (the one
    # written after the final batch): without the clamp, iterations < logFrequency
    # gave zero-length arrays (IndexError below) and an inflated remainder.
    logPoints    = 1 if logFrequency == 0 else max(1, iterations // logFrequency)
    boundIters   = np.zeros(shape=(logPoints,))
    boundValues  = np.zeros(shape=(logPoints,))
    likelyValues = np.zeros(shape=(logPoints,))
    bvIdx = 0

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e for the memberships z which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part.
    z_dnk = np.empty((docLens.max(), K), dtype=dtype, order='F')

    # Select the training iterations function appropriate for the dtype
    do_iterations = compiled.iterate_f32 \
        if dtype == np.float32 \
        else compiled.iterate_f64
    # do_iterations = iterate # pure Python

    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters  = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    totalItrs = 0
    for segment in range(logPoints - 1):
        start = int(time.time())  # whole seconds
        totalItrs += do_iterations (segIters, D, K, T,
                                    W_list, docLens,
                                    topicPrior, vocabPrior,
                                    z_dnk, topicDists, wordDists)
        duration = int(time.time()) - start

        boundIters[bvIdx]   = segment * segIters
        boundValues[bvIdx]  = var_bound(data, modelState, queryState)
        likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)
        perp = perplexity_from_like(likelyValues[bvIdx], W.sum())
        bvIdx += 1

        if converged (boundIters, boundValues, bvIdx, epsilon, minIters=20):
            # Drop the unused tail of the log arrays before returning early
            boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likelyValues, bvIdx)
            return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
                QueryState(W_list, docLens, topicDists), \
                (boundIters, boundValues, likelyValues)

        print ("Segment %d/%d Total Iterations %d Duration %d Perplexity %4.0f Bound %10.2f Likelihood %10.2f" % (segment, logPoints, totalItrs, duration, perp, boundValues[bvIdx - 1], likelyValues[bvIdx - 1]))

    # Final batch of iterations.
    do_iterations (remainder, D, K, T,
                   W_list, docLens,
                   topicPrior, vocabPrior,
                   z_dnk, topicDists, wordDists)

    boundIters[bvIdx]   = iterations - 1
    boundValues[bvIdx]  = var_bound(data, modelState, queryState)
    likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)

    return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
        QueryState(W_list, docLens, topicDists), \
        (boundIters, boundValues, likelyValues)
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Each iteration performs an E-step (per-document topic responsibilities
    via a log-sum-exp normalisation) followed by an M-step (re-estimating the
    per-topic word distributions and the corpus-level topic distribution).

    Params:
    data  - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
            (the first E-step takes logs of model.wordDists and
            model.corpusTopicDist in place)
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)
    updateVocab - currently unused by this implementation

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    A tuple of three arrays: logged iterations, bound values and
    log-likelihood values.

    Raises:
    ValueError if any document has no words.
    '''
    iterations, logFrequency, debug = \
        plan.iterations, plan.logFrequency, plan.debug
    docLens = query.docLens
    K, topicPrior, vocabPrior, wordDists, corpusTopicDist, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.corpusTopicDist, model.dtype

    W = data.words

    iters, bnds, likes = [], [], []

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    # Start from the caller's topic assignments so that a plan with zero
    # iterations still returns a well-defined query state.
    topicDists = query.topicDists

    for itr in range(iterations):
        # E-Step
        safe_log(wordDists, out=wordDists)
        safe_log(corpusTopicDist, out=corpusTopicDist)

        topicDists = W.dot(wordDists.T) + corpusTopicDist[np.newaxis, :]
        #topicDists -= topicDists.max(axis=1)[:, np.newaxis] # TODO Ensure this is okay
        norms = fns.logsumexp(topicDists, axis=1)
        topicDists -= norms[:, np.newaxis]
        np.exp(topicDists, out=topicDists)

        # M-Step
        wordDists = (W.T.dot(topicDists)).T
        wordDists += vocabPrior
        wordDists /= wordDists.sum(axis=1)[:, np.newaxis]

        corpusTopicDist = topicDists.sum(axis=0)
        corpusTopicDist[:] += topicPrior
        corpusTopicDist /= corpusTopicDist.sum()

        # logFrequency == 0 means "no periodic logging"; guard it so the modulo
        # cannot divide by zero (matches the other trainers in this project).
        if debug or (logFrequency > 0 and itr % logFrequency == 0):
            m = ModelState(K, topicPrior, vocabPrior, wordDists, corpusTopicDist, True, dtype, model.name)
            q = QueryState(query.docLens, topicDists, True)

            iters.append(itr)
            bnds.append(var_bound(data, m, q))
            likes.append(log_likelihood(data, m, q))

            perp = perplexity_from_like(likes[-1], W.sum())
            print("Iteration %d : Train Perp = %4.0f Bound = %.3f" % (itr, perp, bnds[-1]))

            # Early stop once perplexity improves by less than one unit
            # between consecutive log points (only after iteration 50).
            if len(iters) > 2 and iters[-1] > 50:
                lastPerp = perplexity_from_like(likes[-2], W.sum())
                if lastPerp - perp < 1:
                    break

    return ModelState(K, topicPrior, vocabPrior, wordDists, corpusTopicDist, True, dtype, model.name), \
        QueryState(query.docLens, topicDists, True), \
        (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
def testMapOnRealData(self):
    '''
    Trains the RTM model on the ACL data for several pseudo-negative counts,
    plots the bound and likelihood, and prints the training perplexity and
    the mean average precision of link prediction on the held-out links.
    '''
    dtype = np.float64 # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    with open(AclDictPath, "rb") as f:
        dic = pkl.load(f)  # only needed by the commented-out topic print below

    data.convert_to_dtype(dtype)
    data.convert_to_undirected_graph()
    data.convert_to_binary_link_matrix()
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    trainData, testData = data.doc_completion_split()

    for pseudoNegCount in (5, 10, 25, 50, 100):
        rd.seed(0xC0FFEE)

        # Initialise the model
        K = TopicCount
        model = rtm.newModelAtRandom(trainData, K, dtype=dtype, pseudoNegCount=data.doc_count * pseudoNegCount)
        queryState = rtm.newQueryState(trainData, model)
        trainPlan = rtm.newTrainPlan(iterations=50, logFrequency=LogFreq, fastButInaccurate=False, debug=True)

        # Train the model, and the immediately save the result to a file for subsequent inspection
        model, topics, (bndItrs, bndVals, bndLikes) = rtm.train(trainData, model, queryState, trainPlan)
        # with open(newModelFileFromModel(model), "wb") as f:
        #     pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')

        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        # Print out the most likely topic words
        # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
        vocab = rtm.wordDists(model)
        topWordCount = 10
        kTopWordInds = [self.topWordInds(vocab[k, :], topWordCount) for k in range(K)]

        like = rtm.log_likelihood(trainData, model, topics)
        perp = perplexity_from_like(like, trainData.word_count)

        # print ("Prior %s" % (str(model.topicPrior)))
        print ("Pseudo Neg-Count: %d " % pseudoNegCount)
        print ("\tTrain Perplexity: %f\n\n" % perp)

        # for k in range(model.K):
        #     print ("\nTopic %d\n=============================" % k)
        #     print ("\n".join("%-20s\t%0.4f" % (dic[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))

        min_probs = rtm.min_link_probs(model, topics, testData.links)
        link_probs = rtm.link_probs(model, topics, min_probs)

        # Let a failure here fail the test. The previous bare `except:` hid the
        # error and then printed either the builtin `map` (a TypeError) or a
        # stale value left over from the previous pseudoNegCount.
        mean_avg_prec = mean_average_prec(testData.links, link_probs)
        print("\tThe Mean-Average-Precision is %.3f" % mean_avg_prec)
def testPerplexityOnRealDataWithMtm2(self):
    '''
    Fits the mtm2 model to the ACL corpus, plots the training curves and the
    topic covariance, prints the perplexity, the top words of every topic and
    the most representative document per topic, then pickles the result.
    '''
    dtype = np.float64 # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    with open(AclDictPath, "rb") as dict_file:
        d = pkl.load(dict_file)

    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # Inverse-frequency scaling, kept around for vocabulary printing
    freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    # Set up the model, its query state and the training plan
    K = 30 # TopicCount
    model = mtm2.newModelAtRandom(data, K, dtype=dtype)
    queryState = mtm2.newQueryState(data, model)
    trainPlan = mtm2.newTrainPlan(iterations=200, logFrequency=10, fastButInaccurate=False, debug=False)

    # Fit the model
    trained = mtm2.train(data, model, queryState, trainPlan)
    model, query, (bndItrs, bndVals, bndLikes) = trained

    # Training curves: bound on the left axis, likelihood on the right
    fig, bound_axis = plt.subplots()
    bound_axis.plot(bndItrs, bndVals, 'b-')
    bound_axis.set_xlabel('Iterations')
    bound_axis.set_ylabel('Bound', color='b')

    like_axis = bound_axis.twinx()
    like_axis.plot(bndItrs, bndLikes, 'r-')
    like_axis.set_ylabel('Likelihood', color='r')

    fig.show()
    plt.show()

    # Heat-map of the learnt topic covariance
    fig, cov_axis = plt.subplots()
    cov_axis.imshow(model.topicCov, interpolation="nearest", cmap=cm.Greys_r)
    fig.show()
    plt.show()

    # Top words per topic
    vocab = mtm2.wordDists(model)
    topWordCount = 10
    kTopWordInds = []
    for k in range(K):
        kTopWordInds.append(self.topWordInds(vocab[k,:], topWordCount))

    like = mtm2.log_likelihood(data, model, query)
    perp = perplexity_from_like(like, data.word_count)

    print("Perplexity: %f\n\n" % perp)

    for k in range(model.K):
        print("\nTopic %d\n=============================" % k)
        rows = []
        for c in range(topWordCount):
            word_id = kTopWordInds[k][c]
            rows.append("%-20s\t%0.4f" % (d[word_id], vocab[k][word_id]))
        print("\n".join(rows))

    print ("Most likely documents for each topic")
    print ("====================================")
    with open ("/Users/bryanfeeney/iCloud/Datasets/ACL/ACL.100/doc_ids.pkl", 'rb') as ids_file:
        fileIds = pkl.load (ids_file)
    docs_dict = [fileIds[fi] for fi in data.order]

    for k in range(model.K):
        best_doc = np.argmax(query.means[:, k])
        print("K=%2d Document ID = %s (found at %d)" % (k, docs_dict[best_doc], best_doc))

    print ("Done")

    out_path = "/Users/bryanfeeney/Desktop/mtm2-" + str(K) + ".pkl"
    with open (out_path, "wb") as out_file:
        pkl.dump((model, query), out_file)
def link_split_prec_rec (data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Train on all the words and half the links. Predict the remaining links.
    Evaluate using precision at m using as values of m 50, 100, 250, and 500,
    and additionally recall at m

    Cross validation may be used, but note we're always evaluating on training
    data.

    For every fold this also reports the mean reciprocal rank and the mean
    average precision; document-weighted averages of both are printed with
    the final results.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param folds:  the number of folds to cross validation
    :param target_folds: the number of folds to complete before finishing. Set to folds
            by default
    :param model_dir: if not none, and folds > 1, the models are stored in this directory.
    :param ldaModel: for those models that utilise an LDA component, a pre-trained
            LDA model can be supplied. When given, it is unpacked as a 7-tuple
            whose fifth and sixth entries replace ldaModel and ldaTopics.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"

    def prepareForTraining(data):
        # Undirected predictors train on a symmetrised, binarised copy;
        # everything else trains on the data as given.
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    if ldaModel is not None:
        (_, _, _, _, ldaModel, ldaTopics, _) = ldaModel

    if target_folds is None:
        target_folds = folds

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    for fold in range(target_folds):
        model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel) \
            if sample_model.name == LRO_MODEL_NAME \
            else mdl.newModelFromExisting(sample_model)

        train_data, query_data = data.link_prediction_split(symmetric=False)
        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models
        train_tops = mdl.newQueryState(train_data, model)
        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, min_link_probs)
        expected_links       = query_data.links

        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print ("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall", recs, doc_counts, ms)

        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Named to avoid shadowing the builtin `map`
        mean_avg_prec = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % mean_avg_prec)
        map_sum       += mean_avg_prec * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        combi_precs, _ = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs, combi_dcounts = combine_map(combi_recs, combi_dcounts, recs, doc_counts)

        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall", combi_recs, combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    # The MAP totals were accumulated above but never reported before.
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Every iteration first updates the prior parameters (topicMean, topicCov,
    inDocCov), then the vocabulary, then the posterior variances and finally
    the per-document out-means and in-means.

    Params:
    data       - the dataset; its words and links matrices are used
    modelState - the model to train. Its vocab array is updated in place.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. outMeans and inMeans are
                 updated in place.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    A tuple of three arrays: logged iterations, bound values and
    log-likelihood values.

    If inverting a per-document precision matrix raises LinAlgError, training
    aborts and the state reached so far is returned.
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(data.links.T), data.feats
    D,_ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens = queryState.outMeans, queryState.outVarcs, queryState.inMeans, queryState.inVarcs, queryState.inDocCov, queryState.docLens
    K, topicMean, topicCov, outDocCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.outDocCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound

        initLikely = log_likelihood(data, modelState, queryState)
        initPerp = perplexity_from_like(initLikely, data.word_count)
        print ("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight = W.copy()
    L_weight = L.copy()
    LT_weight = LT.copy()

    # Note this discards the inDocCov carried in queryState
    inDocCov, inDocPre = np.ones((D,)), np.ones((D,))

    # Interestingly, outDocCov trades off good perplexity fits
    # with good ranking fits. > 10 gives better perplexity and
    # worse ranking. At 10 both are good. Below 10 both get
    # worse. Below 0.5, convergence stalls after the first iter.
    outDocCov, outDocPre = 10, 1./10

    # Iterate over parameters
    for itr in range(iterations):
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior over out-topics
        topicMean = outMeans.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        outDiff = outMeans - topicMean[np.newaxis, :]
        inDiff = inMeans - outMeans

        # NOTE(review): the extent of this inner loop was reconstructed from
        # its comment (all three covariance updates) - confirm.
        for _ in range(5): # It typically takes three iterations for the three dependant covariances -
                           # outDocCov, inDocCov and topicCov - to become consistent w.r.t each other
            topicCov = (outDocPre * outDiff).T.dot(outDiff)
            topicCov += (inDocPre[:,np.newaxis] * inDiff).T.dot(inDiff)
            topicCov += np.diag(outVarcs.sum(axis=0))
            topicCov += np.diag(inVarcs.sum(axis=0))
            topicCov += IWISH_S_SCALE * np.eye(K)
            topicCov /= (2 * D + IWISH_DENOM)
            itopicCov = la.inv(topicCov)

            # Pass topicCov itself as the inspected value: previously topicMean
            # was passed here, so topicCov was never checked for NaN/Inf/dtype.
            debugFn (itr, topicCov, "topicCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

            diffSig = inDiff.dot(itopicCov)
            diffSig *= inDiff
            inDocCov = diffSig.sum(axis=1)
            inDocCov += (outVarcs * np.diagonal(itopicCov)[np.newaxis, :]).sum(axis=1)
            inDocCov += (inVarcs * np.diagonal(itopicCov)[np.newaxis, :]).sum(axis=1)
            inDocCov += IGAMMA_B
            inDocCov /= (IGAMMA_A - 1 + K)
            inDocPre = np.reciprocal(inDocCov)

            debugFn (itr, inDocCov, "inDocCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

            # outDocCov is held fixed (see above). The disabled update, kept for
            # reference, needs diffSig recomputed from outDiff first:
            # diffSig = outDiff.dot(itopicCov)
            # diffSig *= outDiff
            # outDocCov = (IGAMMA_B + diffSig.sum() + (np.diagonal(itopicCov) * outVarcs).sum()) / (IGAMMA_A - 1 + (D * K))
            # outDocPre = 1./outDocCov

            debugFn (itr, outDocCov, "outDocCov", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        # Apply the exp function to get the (unnormalised) softmaxes in both directions.
        expMeansCol = np.exp(inMeans - inMeans.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * inMeans \
            - (0.5/ D) * inMeans.sum(axis=0) \
            - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(outMeans - outMeans.max(axis=1)[:, np.newaxis])
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)

        # Update the vocabularies
        vocab *= (W_weight.T.dot(expMeansRow)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (expMeansCol / lse_at_k[np.newaxis, :]).T.copy() # FIXME Dupes line in definition of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W, expMeansRow, vocab, out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn (itr, vocab, "vocab", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        # Now do likewise for the links, do it twice to model in-counts (first) and
        # out-counts (Second). The difference is the transpose
        LT_weight = sparseScalarQuotientOfDot(LT, expMeansRow, docVocab, out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts = l_intop_sums.sum(axis=0)

        L_weight = sparseScalarQuotientOfDot(L, expMeansRow, docVocab, out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow

        # Update the posterior variances
        outVarcs = np.reciprocal(emit_counts[:, np.newaxis] * (K-1)/(2*K) + (outDocPre + inDocPre[:,np.newaxis]) * np.diagonal(itopicCov)[np.newaxis,:])
        debugFn (itr, outVarcs, "outVarcs", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        inVarcs = np.reciprocal(in_counts[np.newaxis,:] * (D-1)/(2*D) + inDocPre[:,np.newaxis] * np.diagonal(itopicCov)[np.newaxis,:])
        debugFn (itr, inVarcs, "inVarcs", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        # Update the out-means and in-means
        out_rhs = w_top_sums.copy()
        out_rhs += l_outtop_sums
        out_rhs += itopicCov.dot(topicMean) / outDocCov
        out_rhs += inMeans.dot(itopicCov) / inDocCov[:,np.newaxis]
        out_rhs += emit_counts[:, np.newaxis] * (outMeans.dot(A) - rowwise_softmax(outMeans))

        scaled_n_in = ((D-1.)/(2*D)) * ssp.diags(in_counts, 0)
        in_rhs = (inDocPre[:, np.newaxis] * outMeans).dot(itopicCov)
        in_rhs += ((-inMeans.sum(axis=0) * in_counts) / (4*D))[np.newaxis,:]
        in_rhs += l_intop_sums
        in_rhs += in_counts[np.newaxis, :] * F
        for d in range(D):
            # Add document d's contribution using its old in-mean, solve, then
            # subtract the contribution computed from the freshly updated in-mean.
            in_rhs[d, :] += in_counts * inMeans[d, :] / (4*D)
            inMeans[d, :] = la.inv(inDocPre[d] * itopicCov + scaled_n_in).dot(in_rhs[d, :])
            in_rhs[d,:] -= in_counts * inMeans[d, :] / (4*D)

            try:
                outCov = la.inv((outDocPre + inDocPre[d]) * itopicCov + emit_counts[d] * A)
                outMeans[d, :] = outCov.dot(out_rhs[d,:])
            except la.LinAlgError as err:
                print ("ABORTING: " + str(err))
                return \
                    ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \
                    QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens), \
                    (np.array(boundIters), np.array(boundValues), np.array(likelyValues))

        debugFn (itr, outMeans, "inMeans/outMeans", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)
        # debugFn (itr, inMeans, "inMeans", data, K, topicMean, topicCov, outDocCov, inDocCov, vocab, dtype, outMeans, outVarcs, inMeans, inVarcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME)
            queryState = QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > MinItersBeforeEarlyStop and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

        # if True or debug or itr % logFrequency == 0:
        #     print(" Sigma %6.1f \t %9.3g, %9.3g, %9.3g" % (np.log(la.det(topicCov)), topicCov.min(), topicCov.mean(), topicCov.max()), end=" |")
        #     print(" rho %6.1f \t %9.3g, %9.3g, %9.3g" % (sum(log(inDocCov[d]) for d in range(D)), inDocCov.min(), inDocCov.mean(), inDocCov.max()), end=" |")
        #     print(" alpha %6.1f \t %9.3g" % (np.log(la.det(np.eye(K,) * outDocCov)), outDocCov), end=" |")
        #     print(" inMeans %9.3g, %9.3g, %9.3g" % (inMeans.min(), inMeans.mean(), inMeans.max()), end=" |")
        #     print(" outMeans %9.3g, %9.3g, %9.3g" % (outMeans.min(), outMeans.mean(), outMeans.max()), end=" |")
        #     print(" inVarcs %6.1f \t %9.3g, %9.3g, %9.3g" % (sum(safe_log_det(np.diag(inVarcs[d])) for d in range(D)) / D, inVarcs.min(), inVarcs.mean(), inVarcs.max()), end=" |")
        #     print(" outVarcs %6.1f \t %9.3g, %9.3g, %9.3g" % (sum(safe_log_det(np.diag(outVarcs[d])) for d in range(D)) / D, outVarcs.min(), outVarcs.mean(), outVarcs.max()))

    return \
        ModelState(K, topicMean, topicCov, outDocCov, vocab, A, True, dtype, MODEL_NAME), \
        QueryState(outMeans, outVarcs, inMeans, inVarcs, inDocCov, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
def train(data, modelState, queryState, trainPlan):
    """
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Each iteration updates U and V given A, then Y, then A, then the
    per-document means, and finally the vocabulary.

    Params:
    data       - the dataset; data.words is the DxT document-term matrix and
                 data.feats the DxF document-feature matrix
    modelState - the model to train
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    A tuple of three arrays: logged iterations, bound values and
    log-likelihood values.
    """
    W, X = data.words, data.feats
    D, T = W.shape
    F = X.shape[1]

    # tmpNumDense = np.array([
    #     4 , 8 , 2 , 0 , 0,
    #     0 , 6 , 0 , 17, 0,
    #     12 , 13 , 1 , 7 , 8,
    #     0 , 5 , 0 , 0 , 0,
    #     0 , 6 , 0 , 0 , 44,
    #     0 , 7 , 2 , 0 , 0], dtype=np.float64).reshape((6,5))
    # tmpNum = ssp.csr_matrix(tmpNumDense)
    #
    # tmpDenomleft = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10
    # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64)
    #
    # tmpResult = tmpNum.copy()
    # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomleft, tmpDenomRight)
    #
    # print (str(tmpNum.todense()))
    # print (str(tmpDenomleft.dot(tmpDenomRight)))
    # print (str(tmpResult.todense()))

    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = (
        trainPlan.iterations,
        trainPlan.epsilon,
        trainPlan.logFrequency,
        trainPlan.fastButInaccurate,
        trainPlan.debug,
    )
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = (
        modelState.K,
        modelState.A,
        modelState.U,
        modelState.Y,
        modelState.V,
        modelState.covA,
        modelState.tv,
        modelState.ltv,
        modelState.fv,
        modelState.lfv,
        modelState.vocab,
        modelState.vocabPrior,
        modelState.dtype,
    )

    tp, fp, ltp, lfp = 1.0 / tv, 1.0 / fv, 1.0 / ltv, 1.0 / lfv  # turn variances into precisions

    # FIXME Use passed in hypers
    # (the vocabulary prior read from modelState is replaced by all-ones here)
    print("tp = %f tv=%f" % (tp, tv))
    vocabPrior = np.ones(shape=(T,), dtype=modelState.dtype)

    # FIXME undo truncation
    # Only the first 363 features are used; A, X and U are cut down to match
    # and `data` is rebuilt around the truncated feature matrix.
    F = 363
    A = A[:F, :]
    X = X[:, :F]
    U = U[:F, :]
    data = DataSet(words=W, feats=X)

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    if covA is None:
        precA = (fp * ssp.eye(F) + X.T.dot(X)).todense()  # As the inverse is almost always dense
        covA = la.inv(precA, overwrite_a=True)  # it's faster to densify in advance
    uniqLens = np.unique(docLens)

    debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)

    expMeans = means.copy()
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy())

    # Scratch buffers reused on every iteration
    lhs = H.copy()
    rhs = expMeans.copy()
    Y_rhs = Y.copy()

    # Iterate over parameters
    for itr in range(iterations):
        # Update U, V given A
        V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y), A.T.dot(U).dot(Y).T).T
        V /= V[0, 0]
        U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T), A.dot(V).dot(Y.T).T).T

        # Update Y given U, V, A
        Y_rhs[:, :] = U.T.dot(A).dot(V)

        Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True)
        Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True)

        s = np.outer(Sv, Su).flatten()
        s += ltv * lfv
        np.reciprocal(s, out=s)

        M = Uu.T.dot(Y_rhs).dot(Uv)
        M *= unvec(s, row_count=M.shape[0])
        Y = Uu.dot(M).dot(Uv.T)
        debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means))
        debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # And now this is the E-Step, though itr's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # TODO One big sort by size, plus batch it.

        # Update the Means
        rhs[:, :] = expMeans
        rhs *= R.dot(vocab.T)
        rhs += X.dot(A) * tp
        rhs += docLens[:, np.newaxis] * means.dot(H)
        # rowwise_softmax is asked to write into `means` itself; every row of
        # `means` is then reassigned in the loop below.
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
        for l in uniqLens:
            # One matrix inverse per distinct document length, shared by all
            # documents of that length
            inds = np.where(docLens == l)[0]
            lhs[:, :] = l * H
            lhs[np.diag_indices_from(lhs)] += tp
            lhs[:, :] = la.inv(lhs)
            means[inds, :] = rhs[inds, :].dot(lhs)  # left and right got switched going from vectors to matrices :-/

        debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)

        # Standard deviation
        # DK = means.shape[0] * means.shape[1]
        # newTp = np.sum(means)
        # newTp = (-newTp * newTp)
        # rhs[:,:] = means
        # rhs *= means
        # newTp = DK * np.sum(rhs) - newTp
        # newTp /= DK * (DK - 1)
        # newTp = min(max(newTp, 1E-36), 1E+36)
        # tp = 1 / newTp
        # if itr % logFrequency == 0:
        #     print ("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,)))))

        # Update the vocabulary
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        vocab *= (R.T.dot(expMeans)).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior)
        # print ("Iter %3d Vocab.min = %f" % (itr, vocab.min()))

        # Update the vocab prior
        # vocabPrior = estimate_dirichlet_param (vocab, vocabPrior)
        # print ("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean()))

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name)
            # NOTE(review): QueryState is built with two arguments here but with
            # three (means, expMeans, docLens) in the final return - confirm
            # which matches the QueryState definition.
            queryState = QueryState(means, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(
                time.strftime("%X")
                + " : Iteration %d: bound %f \t Perplexity: %.2f"
                % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum()))
            )
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                # (measured as a change in perplexity of less than 1 between log points)
                if (
                    itr > 100
                    and len(likelyValues) > 3
                    and abs(
                        perplexity_from_like(likelyValues[-1], docLens.sum())
                        - perplexity_from_like(likelyValues[-2], docLens.sum())
                    )
                    < 1.0
                ):
                    break

    return (
        ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name),
        QueryState(means, expMeans, docLens),
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues)),
    )
def query(dataset, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, re-estimates the per-document parameters held in
    the query state for the documents of the given dataset. The parameters of
    the model are only read.

    Params:
    dataset    - the dataset to query. Only dataset.words and
                 dataset.word_count are read here.
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset. Its means
                 and expMeans arrays are written to in place.
    queryPlan  - supplies the maximum number of iterations and the debug flag

    Returns:
    The model state and query state, in that order. The model state is the
    very object that was passed in; the query state is a newly constructed
    QueryState wrapping the updated parameters.
    '''
    W = dataset.words
    D = W.shape[0]

    # epsilon, logFrequency and fastButInaccurate are read from the plan but
    # take no part in the updates below
    iterations = queryPlan.iterations
    epsilon = queryPlan.epsilon
    logFrequency = queryPlan.logFrequency
    fastButInaccurate = queryPlan.fastButInaccurate
    debug = queryPlan.debug

    means = queryState.means
    expMeans = queryState.expMeans
    varcs = queryState.varcs
    lxi = queryState.lxi
    s = queryState.s
    n = queryState.docLens

    K = modelState.K
    topicMean = modelState.topicMean
    sigT = modelState.sigT
    vocab = modelState.vocab
    vocabPrior = modelState.vocabPrior
    dtype = modelState.dtype

    # Temporaries computed once, before iterating. The row-wise max is
    # subtracted before exponentiating and the result lands in the caller's
    # expMeans buffer.
    isigT = la.inv(sigT)
    rowMax = means.max(axis=1)[:, np.newaxis]
    expMeans = np.exp(means - rowMax, out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab)
    S = expMeans * R.dot(vocab.T)

    # Choose the hook that is invoked after every parameter update
    debugFn = _debug_with_bound if debug else _debug_with_nothing

    def trace(value, label):
        # Reads the enclosing variables at call time, so it always reports
        # the most recent varcs and lxi
        debugFn(itr, value, label, W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

    # Huge starting value so the first perplexity comparison can never pass
    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Update the Means, one document (row) at a time
        docLenCol = n[:, np.newaxis]
        vMat = (s[:, np.newaxis] * lxi - 0.5) * docLenCol + S
        rhsMat = vMat + isigT.dot(topicMean)
        for d in range(D):
            try:
                lhs = isigT + ssp.diags(n[d] * lxi[d, :], 0)
                means[d, :] = la.inv(lhs).dot(rhsMat[d, :])
            except ValueError as e:
                # Best effort: report the problem and keep the old row
                print(str(e))
                print("Ah")
        trace(means, "means")

        # Update the Variances (uses only the diagonal of isigT)
        varcs = 1. / (docLenCol * lxi + isigT.flat[::K + 1])
        trace(varcs, "varcs")

        # Update the approximation parameters
        lxi = 2 * negJakkolaOfDerivedXi(means, varcs, s)
        trace(lxi, "lxi")

        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        # s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
        trace(s, "s")

        # Stop once past iteration index 20 if perplexity fell by less than 1
        snapshot = QueryState(means, expMeans, varcs, lxi, s, n)
        like = log_likelihood(dataset, modelState, snapshot)
        perp = perplexity_from_like(like, dataset.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, QueryState(means, expMeans, varcs, lxi, s, n)
def insample_lro_style_prec_rec(data, mdl, sample_model, train_plan, folds, target_folds=None, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    In-sample link-prediction evaluation. For every fold the data is split by
    data.folded_link_prediction_split(MinLinkCountEval, fold, folds) into
    training data, query data and a subset of documents; a fresh copy of the
    sample model is trained on the training data, and the links of the
    documents in that subset are then predicted and compared with the links
    in the query data.

    Every fold is scored with precision@m, recall@m, mean reciprocal-rank and
    mean average-precision, all of which are printed. After the last fold the
    combined precision and recall tables are printed, along with the
    reciprocal-rank and average-precision averaged over folds, each fold
    weighted by its number of evaluated documents.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start
    of each fold
    :param train_plan: the training plan (number of iterations etc.)
    :param folds: the number of folds; must be greater than one
    :param target_folds: the number of folds to complete before finishing.
    Set to folds by default
    :param model_dir: forwarded to save_if_necessary together with each
    trained model
    :param ldaModel: for those models that utilise an LDA component, a
    pre-trained LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by
    the ldaModel
    :return: the value accumulated across the save_if_necessary calls
    (presumably the list of model files stored)
    '''
    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []
    assert folds > 1, "Need at least two folds for this to make any sense whatsoever"

    def prepareForTraining(dataset):
        # Directed predictors train on the split exactly as it is
        if not mdl.is_undirected_link_predictor():
            return dataset
        # Undirected predictors get a symmetrised, binarised copy
        symmetric = dataset.copy()
        symmetric.convert_to_undirected_graph()
        symmetric.convert_to_binary_link_matrix()
        return symmetric

    if target_folds is None:
        target_folds = folds

    combi_precs = None
    combi_recs = None
    combi_dcounts = None
    mrr_sum = 0
    mrr_doc_count = 0
    map_sum = 0
    map_doc_count = 0
    fold_count = 0

    for fold in range(folds):
        # Prepare the training and query data. Symmetry, where needed, is
        # imposed after the split so that symmetric and non-symmetric models
        # are evaluated on the same split.
        train_data, query_data, docSubset = data.folded_link_prediction_split(MinLinkCountEval, fold, folds)
        train_data = prepareForTraining(train_data)

        print("\n\nFold %d\n" % fold + ("-" * 80))

        # Train the model
        if model_uses_lda(sample_model):
            model = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModel)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopics)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, train_log = mdl.train(train_data, model, train_tops, train_plan)
        train_itrs, train_vbs, train_likes = train_log

        train_like = mdl.log_likelihood(train_data, model, train_tops)
        print("Training perplexity is %.2f " % perplexity_from_like(train_like, train_data.word_count))

        # Infer the expected link probabilities
        min_link_probs = mdl.min_link_probs(model, train_tops, train_tops, query_data.links, docSubset)
        predicted_link_probs = mdl.link_probs(model, train_tops, train_tops, min_link_probs, docSubset)
        expected_links = query_data.links[docSubset, :]
        evaluated_docs = expected_links.shape[0]

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at(
            expected_links,
            predicted_link_probs,
            at=ms,
            groups=[(0, 3), (3, 5), (5, 10), (10, 1000)],
        )
        print("Fold %2d: Mean-Precisions at \n" % fold, end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall", recs, doc_counts, ms)

        # Both merges start from the same running counts; only the second
        # one's updated counts are kept
        combi_precs, _ = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs, combi_dcounts = combine_map(combi_recs, combi_dcounts, recs, doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print("Mean reciprocal-rank : %f" % mrr)
        mrr_sum += mrr * evaluated_docs
        mrr_doc_count += evaluated_docs

        # Evaluation 3/3: Mean Average-Precision
        mean_avg_prec = mean_average_prec(expected_links, predicted_link_probs)
        print("Mean Average Precision : %f" % mean_avg_prec)
        map_sum += mean_avg_prec * evaluated_docs
        map_doc_count += evaluated_docs

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

        fold_count += 1
        if fold_count == target_folds:
            break

    print("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall", combi_recs, combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
def train (data, modelState, queryState, trainPlan, query=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Iterations are run in segments of trainPlan.logFrequency; after each
    segment the bound and log-likelihood are recorded and the training
    perplexity is printed. Once more than two measurements exist and more
    than 30 iterations have run, training stops early if the perplexity
    improved by less than 1 over the previous measurement.

    Params:
    data       - the dataset of words, features and links of which only words are
                 used in this model
    modelState - the actual LDA model. In a training run (query = False) its
                 count arrays are mutated in place when the model has been
                 trained before.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. Its count arrays are
                 mutated in place.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
    query      - if True the model's counts are subtracted back out of the
                 query counts at the end and the model counts are left as they
                 were; if False the model counts are updated from the query
                 counts.

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    A tuple of three arrays: the iterations at which measurements were taken,
    the bound at each, and the log-likelihood at each.

    Raises:
    ValueError if any document has fewer than one word.
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    W_list, docLens, q_n_dk, q_n_kt, q_n_k, q_v_dk, q_v_kt, q_v_k, z_dnk = \
        queryState.W_list, queryState.docLens, \
        queryState.n_dk, queryState.n_kt, queryState.n_k, \
        queryState.v_dk, queryState.v_kt, queryState.v_k, queryState.z_dnk
    K, topicPrior_, vocabPrior, m_n_dk, m_n_kt, m_n_k, m_v_dk, m_v_kt, m_v_k = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, \
        modelState.n_dk, modelState.n_kt, modelState.n_k, \
        modelState.v_dk, modelState.v_kt, modelState.v_k
    # The compiled iteration routine is handed a single scalar topic prior
    topicPrior = topicPrior_.mean()

    D_train = 0 if m_n_dk is None else m_n_dk.shape[0]
    D_query = q_n_dk.shape[0]
    W = data.words
    T = W.shape[1]

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError ("Input document-term matrix contains at least one document with no words")

    # Book-keeping for logs. There is always at least one log point (the
    # final one): without the clamp a logFrequency larger than iterations
    # gave zero log points, which made `remainder` below exceed `iterations`.
    logPoints = 1 if logFrequency == 0 else max(1, iterations // logFrequency)
    boundIters = []
    boundValues = []
    likelyValues = []

    # Early stopping check
    finishedTraining = False

    # Add the model counts (essentially the learnt model parameters) to those for
    # the query, assuming the model has been trained previously
    if m_n_dk is not None:
        np.add (q_n_kt, m_n_kt, out=q_n_kt)  # q_n_kt += m_n_kt
        np.add (q_v_kt, m_v_kt, out=q_v_kt)  # q_v_kt += m_v_kt
        np.add (q_n_k, m_n_k, out=q_n_k)     # q_n_k += m_n_k
        np.add (q_v_k, m_v_k, out=q_v_k)     # q_v_k += m_v_k

    # Select the training iterations function appropriate for the dtype
    if debug:
        print ("Starting Training")
    do_iterations = compiled.iterate_f32 \
        if modelState.dtype == np.float32 \
        else compiled.iterate_f64

    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    totalWords = W.sum()  # invariant across segments
    for segment in range(logPoints - 1):
        do_iterations (segIters, D_query, D_train, K, T,
                       W_list, docLens,
                       q_n_dk, q_n_kt, q_n_k,
                       q_v_dk, q_v_kt, q_v_k,
                       z_dnk,
                       topicPrior, vocabPrior)

        # Measure and record the improvement to the bound and log-likely
        boundIters.append (segment * segIters)
        boundValues.append (var_bound_intermediate(data, modelState, queryState, q_n_kt, q_n_k))
        likelyValues.append (log_likely_intermediate(data, modelState, queryState, q_n_kt, q_n_k))

        # Check to see if the improvement in the bound has fallen below the threshold
        perp = perplexity_from_like(likelyValues[-1], totalWords)
        print("Iteration %d : Train Perp = %4.0f Bound = %.3f" % (segment * segIters, perp, boundValues[-1]))

        if len(boundIters) > 2 and (boundIters[-1] > 30):
            lastPerp = perplexity_from_like(likelyValues[-2], totalWords)
            if lastPerp - perp < 1:
                finishedTraining = True
                print("Converged, existing early")
                break

    # Final scheduled batch of iterations if we haven't already converged.
    if not finishedTraining:
        do_iterations (remainder, D_query, D_train, K, T,
                       W_list, docLens,
                       q_n_dk, q_n_kt, q_n_k,
                       q_v_dk, q_v_kt, q_v_k,
                       z_dnk,
                       topicPrior, vocabPrior)

        boundIters.append (iterations - 1)
        boundValues.append (var_bound_intermediate(data, modelState, queryState, q_n_kt, q_n_k))
        likelyValues.append (log_likely_intermediate(data, modelState, queryState, q_n_kt, q_n_k))

    # Now return the results
    if query:  # Model is unchanged, query is changed
        if m_n_dk is not None:
            np.subtract(q_n_kt, m_n_kt, out=q_n_kt)  # q_n_kt -= m_n_kt
            np.subtract(q_v_kt, m_v_kt, out=q_v_kt)  # q_v_kt -= m_v_kt
            np.subtract(q_n_k, m_n_k, out=q_n_k)     # q_n_k -= m_n_k
            np.subtract(q_v_k, m_v_k, out=q_v_k)     # q_v_k -= m_v_k
    else:  # train
        # Model is changed. Query is changed
        if m_n_dk is not None:  # Amend existing
            m_n_dk = np.vstack((m_n_dk, q_n_dk))
            m_n_kt[:,:] = q_n_kt  # Recall we _added_ the m_n_kt counts to the query
            m_n_k[:] = q_n_k      # before training, so now the query-counts contain the
                                  # sum of old and new, and can just be copied across
            m_v_dk = np.vstack((m_v_dk, q_v_dk))
            m_v_kt[:,:] = q_v_kt
            # Previously this wrote q_v_k into m_n_k, clobbering the topic
            # counts copied just above and leaving m_v_k stale
            m_v_k[:] = q_v_k
        else:  # Create from scratch
            m_n_dk = q_n_dk.copy()
            m_n_kt = q_n_kt.copy()
            m_n_k = q_n_k.copy()
            m_v_dk = q_v_dk.copy()
            m_v_kt = q_v_kt.copy()
            m_v_k = q_v_k.copy()

    return ModelState(K, topicPrior, vocabPrior, m_n_dk, m_n_kt, m_n_k, m_v_dk, m_v_kt, m_v_k, modelState.dtype, modelState.name), \
        QueryState(W_list, docLens, q_n_dk, q_n_kt, q_n_k, q_v_dk, q_v_kt, q_v_k, z_dnk), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
def cross_val_and_eval_perplexity(data, mdl, sample_model, train_plan, query_plan, num_folds, fold_run_count=-1, model_dir= None):
    '''
    Uses cross-validation to get the average perplexity.

    If num_folds == 1 a special path is triggered: the model is trained on
    all of the data, likelihood and perplexity are evaluated on that same
    training data, and save_if_necessary is called once with fold index 0.

    Otherwise, for each fold, a copy of the sample model is trained on the
    training split; the query split is then divided by doc_completion_split()
    into an estimation part, which is used to query the model, and an
    evaluation part, on which the query likelihood is measured. Per-fold
    perplexities are printed, followed by totals over all finished folds.

    :param data: the DataSet object with the data
    :param mdl: the module with the train etc. functions
    :param sample_model: a preconfigured model which is cloned at the start of
    each cross-validation run
    :param train_plan: the training plan (number of iterations etc.)
    :param query_plan: the query plan (number of iterations etc.)
    :param num_folds: the number of folds to cross validate
    :param fold_run_count: for debugging, stop early after processing this
    number of folds. Values below 1 mean "run every fold".
    :param model_dir: forwarded to save_if_necessary together with each
    trained model
    :return: the value accumulated across the save_if_necessary calls
    (presumably the list of model files stored)
    '''
    model_files = []
    if fold_run_count < 1:
        fold_run_count = num_folds

    # Single-fold special case: train and evaluate on the very same data
    if num_folds == 1:
        model = mdl.newModelFromExisting(sample_model)
        query = mdl.newQueryState(data, model)

        model, train_tops, train_log = mdl.train(data, model, query, train_plan)
        train_itrs, train_vbs, train_likes = train_log
        likely = mdl.log_likelihood(data, model, train_tops)
        perp = perplexity_from_like(likely, data.word_count)

        print("Train-set Likelihood: %12f" % (likely))
        print("Train-set Perplexity: %12f" % (perp))

        return save_if_necessary(model_files, model_dir, model, data, 0, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)

    # Running totals, used for the likelihood and perplexity of the whole
    # dataset once the folds are done
    train_like_sum = 0
    train_wcount_sum = 0
    query_like_sum = 0
    query_wcount_sum = 0
    folds_finished = 0  # count of folds that finished successfully

    for fold in range(num_folds):
        if folds_finished >= fold_run_count:
            break

        train_data, query_data = data.cross_valid_split(fold, num_folds)

        # Train the model
        print("Duplicating model template... ", end="")
        model = mdl.newModelFromExisting(sample_model)
        print("Done.\nCreating query state...")
        train_tops = mdl.newQueryState(train_data, model)

        print("Starting training")
        model, train_tops, train_log = mdl.train(train_data, model, train_tops, train_plan)
        train_itrs, train_vbs, train_likes = train_log

        train_like = mdl.log_likelihood(train_data, model, train_tops)
        train_word_count = train_data.word_count
        train_perp = perplexity_from_like(train_like, train_word_count)

        print("DEBUG Train perplexity is " + str(train_perp))

        # Query the model - if there are no features we need to split the text
        print("Starting query.")
        query_estim, query_eval = query_data.doc_completion_split()
        query_tops = mdl.newQueryState(query_estim, model)
        model, query_tops = mdl.query(query_estim, model, query_tops, query_plan)

        query_like = mdl.log_likelihood(query_eval, model, query_tops)
        query_word_count = query_eval.word_count
        query_perp = perplexity_from_like(query_like, query_word_count)

        # Keep a record of the cumulative likelihood and word-counts
        train_like_sum += train_like
        train_wcount_sum += train_word_count
        query_like_sum += query_like
        query_wcount_sum += query_word_count
        folds_finished += 1

        # Write out the output
        print("Fold %d: Train-set Perplexity: %12.3f \t Query-set Perplexity: %12.3f" % (fold, train_perp, query_perp))
        print("")

        # Save the model
        model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, query_tops, mdl)

    train_total_perp = perplexity_from_like(train_like_sum, train_wcount_sum)
    print("Total (%d): Train-set Likelihood: %12.3f \t Train-set Perplexity: %12.3f" % (folds_finished, train_like_sum, train_total_perp))
    query_total_perp = perplexity_from_like(query_like_sum, query_wcount_sum)
    print("Total (%d): Query-set Likelihood: %12.3f \t Query-set Perplexity: %12.3f" % (folds_finished, query_like_sum, query_total_perp))

    return model_files