def testCrossValPerplexityOnRealDataWithCtmInc(self):
    dtype = np.float64 # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # Initialise the training and query plans
    trainPlan = ctm.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
    queryPlan = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=False)

    topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    for K in topicCounts:
        trainPerps = []
        queryPerps = []
        for fold in range(1): # range(NumFolds):
            trainData, queryData = data.cross_valid_split(fold, NumFolds)

            model = ctm.newModelAtRandom(trainData, K, dtype=dtype)
            query = ctm.newQueryState(trainData, model)

            # Train the model, then immediately save the result to a file for subsequent inspection
            model, trainResult, (_, _, _) = ctm.train(trainData, model, query, trainPlan)

            like = ctm.log_likelihood(trainData, model, trainResult)
            perp = perplexity_from_like(like, trainData.word_count)
            trainPerps.append(perp)

            # Infer topic assignments for the held-out fold and measure its perplexity
            query = ctm.newQueryState(queryData, model)
            model, queryResult = ctm.query(queryData, model, query, queryPlan)

            like = ctm.log_likelihood(queryData, model, queryResult)
            perp = perplexity_from_like(like, queryData.word_count)
            queryPerps.append(perp)

        # Append the average over the folds actually run (the loop above may be
        # truncated for debugging, so divide by the list length, not NumFolds)
        trainPerps.append(sum(trainPerps) / len(trainPerps))
        queryPerps.append(sum(queryPerps) / len(queryPerps))
        print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
        print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
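# For reference, a minimal sketch of the perplexity helper used above, assuming
# the standard definition perp = exp(-log_likelihood / word_count). The real
# perplexity_from_like is imported from elsewhere in the package; the name
# below is hypothetical and kept distinct to avoid shadowing it.
def _perplexity_from_like_sketch(log_likelihood, word_count):
    # Perplexity is the exponentiated negative per-token log-likelihood:
    # lower values mean the model assigns higher probability to the corpus.
    return np.exp(-log_likelihood / word_count)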
def _testOnModelDerivedExample(self):
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055) # Global init for repeatable test
    D, T, K = 1000, 100, 7 # Document count, vocabulary size ("term count") and topic count
    tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)

    W = W.astype(DTYPE)

    plt.imshow(vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    # Create the cross-validation folds
    folds     = 5
    foldSize  = ceil(D / folds)
    querySize = foldSize
    trainSize = D - querySize

    for useDiagonalPriorCov in [False, True]:
        trainLikely    = []
        trainWordCount = []
        queryLikely    = []
        queryWordCount = []

        for fold in range(folds):
            # Split the dataset into training and query folds, wrapping around
            # the end of the corpus where necessary
            start = fold * foldSize
            end   = start + trainSize

            trainSet = np.arange(start, end) % D
            querySet = np.arange(end, end + querySize) % D

            W_train = W[trainSet, :]
            W_query = W[querySet, :]

            # Train the model
            model      = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
            queryState = ctm.newQueryState(W_train, model)

            plan = ctm.newTrainPlan(iterations=20, logFrequency=1, fastButInaccurate=useDiagonalPriorCov)
            model, queryState, (bndItrs, bndVals) = ctm.train(W_train, None, model, queryState, plan)

            # Plot the evolution of the bound during training, skipping the
            # first few iterations
            plt.plot(bndItrs[5:], bndVals[5:])
            plt.xlabel("Iterations")
            plt.ylabel("Variational Bound")
            plt.show()

            # Plot the topic covariance
            self._plotCov(model)

            # Plot the vocab
            plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
            plt.show()

            # Calculate the training-set likelihood
            trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
            trainWordCount.append(W_train.data.sum())

            # Now query the model on the held-out documents
            plan       = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
            queryState = ctm.newQueryState(W_query, model)
            model, queryState = ctm.query(W_query, None, model, queryState, plan)

            queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
            queryWordCount.append(W_query.data.sum())

        # Print out the likelihood and perplexity for each fold
        print("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
        for fold in range(folds):
            trainPerp = np.exp(-trainLikely[fold] / trainWordCount[fold])
            queryPerp = np.exp(-queryLikely[fold] / queryWordCount[fold])

            print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
            print("          Perplexity:           %12.2f \t Perplexity:           %12.2f" % (trainPerp, queryPerp))

            self.assertTrue(queryPerp < 60.0) # Maximum allowed perplexity
            self.assertTrue(trainPerp < 60.0)

        print("\n\n")
    print("End of Test")
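# A minimal sketch of what the _sampleFromModel helper used above might look
# like, assuming the standard CTM generative story: logistic-normal document
# topic weights and Dirichlet topic-word rows. The real helper is defined on
# the test class; the identity prior covariance, the Dirichlet concentration
# of 0.1, and the Poisson(avgLen) document lengths are illustrative
# assumptions, as is the sketch's name. Relies on the module's existing
# np (numpy) and rd (numpy.random) imports.
def _sampleFromModelSketch(D, T, K, avgLen=250):
    import scipy.sparse as ssp

    # Topic-word distributions: K Dirichlet-distributed rows over the T terms
    vocab = rd.dirichlet(np.ones(T) * 0.1, size=K)

    # Logistic-normal topic proportions: draw eta ~ N(0, I) per document,
    # then map it onto the simplex with a numerically stabilised softmax
    eta  = rd.multivariate_normal(np.zeros(K), np.eye(K), size=D)
    tpcs = np.exp(eta - eta.max(axis=1, keepdims=True))
    tpcs /= tpcs.sum(axis=1, keepdims=True)

    # Sample each document's word counts from its mixture over topics
    docLens = rd.poisson(avgLen, size=D)
    W = ssp.lil_matrix((D, T))
    for d in range(D):
        wordDist = tpcs[d, :] @ vocab          # (K,) @ (K,T) -> (T,)
        wordDist = wordDist / wordDist.sum()   # re-normalise against round-off
        W[d, :] = rd.multinomial(docLens[d], wordDist)

    return tpcs, vocab, docLens, W.tocsr()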