def testPerplexityOnRealDataWithCtm(self):
    """
    Train a 10-topic CTM on the data set read from AclWordPath / AclCitePath,
    plot the training diagnostics, and print the perplexity together with
    the highest-ranked words of every topic.

    The test makes no assertions; it is a visual / console inspection aid.
    It opens matplotlib windows, so `plt.show()` blocks until each window
    is closed.
    """
    dtype = np.float64  # DTYPE
    rd.seed(0xBADB055)  # Fixed seed so that runs are repeatable

    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    # NOTE: unpickling executes arbitrary code, so AclDictPath must point at
    # a trusted file. The loaded object is indexed by word index below.
    with open(AclDictPath, "rb") as f:
        wordDict = pkl.load(f)
    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # Initialise the model
    K = 10  # TopicCount
    model = ctm.newModelAtRandom(data, K, dtype=dtype)
    queryState = ctm.newQueryState(data, model)
    trainPlan = ctm.newTrainPlan(iterations=200, logFrequency=10, fastButInaccurate=False, debug=False)

    # Train the model. Saving the result to a file for subsequent
    # inspection is currently disabled:
    model, query, (bndItrs, bndVals, bndLikes) = ctm.train(data, model, queryState, trainPlan)
    # with open(newModelFileFromModel(model), "wb") as f:
    #     pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound (left axis, blue) and of the
    # likelihood (right axis, red) during training.
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    fig.show()
    plt.show()

    # Show the model's sigT matrix as a greyscale image
    # (presumably the topic covariance -- confirm against the ctm module).
    fig, ax1 = plt.subplots()
    ax1.imshow(model.sigT, interpolation="nearest", cmap=cm.Greys_r)
    fig.show()
    plt.show()

    # Find the most likely topic words
    # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
    vocab = ctm.wordDists(model)
    topWordCount = 10
    kTopWordInds = [self.topWordInds(vocab[k, :], topWordCount) for k in range(K)]

    like = ctm.log_likelihood(data, model, query)
    perp = perplexity_from_like(like, data.word_count)

    print("Perplexity: %f\n\n" % perp)

    # Iterate over the index lists themselves so that the topics printed
    # are exactly the topics whose top words were computed above.
    for k, topInds in enumerate(kTopWordInds):
        print("\nTopic %d\n=============================" % k)
        print("\n".join(
            "%-20s\t%0.4f" % (wordDict[ind], vocab[k][ind])
            for ind in topInds[:topWordCount]))
def testCrossValPerplexityOnRealDataWithCtmInc(self):
    """
    For each topic count in 5, 10, ..., 50, train a CTM on the training
    part of a cross-validation split, query it on the held-out part, and
    print the train and query perplexities as comma-separated lines.

    Each printed line lists the per-fold perplexities followed by their
    mean. The test makes no assertions; its output is meant to be
    inspected or collected by hand.
    """
    dtype = np.float64  # DTYPE
    rd.seed(0xBADB055)  # Fixed seed so that runs are repeatable

    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # Initialise the training and querying plans
    trainPlan = ctm.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
    queryPlan = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=False)

    topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

    # Only the first of the NumFolds splits is evaluated to keep the run
    # time down; set this to NumFolds for a full cross-validation.
    foldsToRun = 1

    for K in topicCounts:
        trainPerps = []
        queryPerps = []
        for fold in range(foldsToRun):
            trainData, queryData = data.cross_valid_split(fold, NumFolds)

            model = ctm.newModelAtRandom(trainData, K, dtype=dtype)
            query = ctm.newQueryState(trainData, model)

            # Train the model and record the training-set perplexity
            model, trainResult, (_, _, _) = ctm.train(trainData, model, query, trainPlan)

            like = ctm.log_likelihood(trainData, model, trainResult)
            perp = perplexity_from_like(like, trainData.word_count)
            trainPerps.append(perp)

            # Query the trained model with the held-out documents
            query = ctm.newQueryState(queryData, model)
            model, queryResult = ctm.query(queryData, model, query, queryPlan)

            like = ctm.log_likelihood(queryData, model, queryResult)
            perp = perplexity_from_like(like, queryData.word_count)
            queryPerps.append(perp)

        # Append the mean as the last column. Divide by the number of folds
        # actually evaluated: dividing by NumFolds understates the mean
        # whenever fewer than NumFolds folds are run.
        trainPerps.append(sum(trainPerps) / len(trainPerps))
        queryPerps.append(sum(queryPerps) / len(queryPerps))

        print("K=%d,Segment=Train,%s" % (K, ",".join(str(p) for p in trainPerps)))
        print("K=%d,Segment=Query,%s" % (K, ",".join(str(p) for p in queryPerps)))
def _testOnModelDerivedExample(self):
    """
    Cross-validate the CTM on data sampled via `self._sampleFromModel`,
    once with `fastButInaccurate=False` and once with it set to True.

    For every fold the model is trained on the training rows and queried
    on the held-out rows; diagnostic plots are shown along the way. The
    likelihood and perplexity of every fold are then printed, and both
    perplexities are asserted to be below 60.

    The leading underscore keeps test runners from collecting this method.
    NOTE(review): `ctm.train` / `ctm.query` are called here with an extra
    `None` argument, and `ctm.train` is unpacked into two bound series,
    unlike the other tests in this file -- this may be a stale API usage;
    confirm before re-enabling.
    """
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055)  # Global init for repeatable test
    D, T, K = 1000, 100, 7  # Document count, vocabulary size ("term count") and topic count
    tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)

    W = W.astype(DTYPE)

    plt.imshow(vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    # Create the cross-validation folds. Integer ceiling division keeps
    # the sizes integral (they feed np.arange index arrays below) and
    # ties the fold size to `folds` instead of a repeated literal.
    folds = 5
    foldSize = -(-D // folds)
    querySize = foldSize
    trainSize = D - querySize

    for useDiagonalPriorCov in [False, True]:
        trainLikely = []
        trainWordCount = []
        queryLikely = []
        queryWordCount = []

        for fold in range(folds):
            # Split the datasets: the training window starts at this
            # fold's offset and wraps around the end of the corpus; the
            # query window is the block of rows that follows it.
            start = fold * foldSize
            end = start + trainSize

            trainSet = np.arange(start, end) % D
            querySet = np.arange(end, end + querySize) % D

            W_train = W[trainSet, :]
            W_query = W[querySet, :]

            # Train the model
            model = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
            queryState = ctm.newQueryState(W_train, model)

            plan = ctm.newTrainPlan(iterations=20, logFrequency=1, fastButInaccurate=useDiagonalPriorCov)
            model, queryState, (bndItrs, bndVals) = ctm.train(W_train, None, model, queryState, plan)

            # Plot the evolution of the bound during training,
            # skipping the first five logged values.
            plt.plot(bndItrs[5:], bndVals[5:])
            plt.xlabel("Iterations")
            plt.ylabel("Variational Bound")
            plt.show()

            # Plot the topic covariance
            self._plotCov(model)

            # Plot the vocab
            plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
            plt.show()

            # Calculating the training set likelihood
            trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
            trainWordCount.append(W_train.data.sum())

            # Now query the model.
            plan = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
            queryState = ctm.newQueryState(W_query, model)
            model, queryState = ctm.query(W_query, None, model, queryState, plan)

            queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
            queryWordCount.append(W_query.data.sum())

        # Print out the likelihood and perplexity for each fold.
        print("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
        for fold in range(folds):
            # Perplexity is exp(-log-likelihood / word count)
            trainPerp = np.exp(-trainLikely[fold] / trainWordCount[fold])
            queryPerp = np.exp(-queryLikely[fold] / queryWordCount[fold])

            print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
            print(" Perplexity: %12.2f \t Perplexity: %12.2f" % (trainPerp, queryPerp))

            self.assertTrue(queryPerp < 60.0)  # Maximum perplexity.
            self.assertTrue(trainPerp < 60.0)
        print("\n\n")

    print("End of Test")