def testCrossValPerplexityOnRealDataWithLdaOldInc(self):
    ActiveFolds = 3
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # Initialise the training and query plans
    trainPlan = lda_old.newTrainPlan(iterations=800, logFrequency=200, fastButInaccurate=False, debug=False)
    queryPlan = lda_old.newTrainPlan(iterations=24, logFrequency=12, fastButInaccurate=False, debug=False)

    topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    for K in topicCounts:
        trainPerps = []
        queryPerps = []
        for fold in range(ActiveFolds):  # range(NumFolds):
            trainData, queryData = data.cross_valid_split(fold, NumFolds)

            model = lda_old.newModelAtRandom(trainData, K, dtype=dtype)
            query = lda_old.newQueryState(trainData, model)

            # Train the model, then immediately measure the training-set perplexity
            model, trainResult, (_, _, _) = lda_old.train(trainData, model, query, trainPlan)

            like = lda_old.log_likelihood(trainData, model, trainResult)
            perp = perplexity_from_like(like, trainData.word_count)
            trainPerps.append(perp)

            # Query the trained model on the held-out fold and measure its perplexity
            query = lda_old.newQueryState(queryData, model)
            model, queryResult = lda_old.query(queryData, model, query, queryPlan)

            like = lda_old.log_likelihood(queryData, model, queryResult)
            perp = perplexity_from_like(like, queryData.word_count)
            queryPerps.append(perp)

        # Append the across-fold mean so it prints as the final column
        trainPerps.append(sum(trainPerps) / ActiveFolds)
        queryPerps.append(sum(queryPerps) / ActiveFolds)
        print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
        print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
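
# A minimal sketch of the relationship the loop above relies on, assuming
# perplexity_from_like implements the standard definition
# perplexity = exp(-log_likelihood / word_count). The name and body below are
# illustrative only, not the project's actual implementation; it reuses the
# module's existing `import numpy as np`.
def perplexity_from_like_sketch(log_likelihood, word_count):
    # exp of the negative mean per-token log-likelihood: lower is better,
    # and it equals the geometric mean of the inverse per-token probabilities
    return np.exp(-log_likelihood / word_count)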
def testOnRealData(self):
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    path = "/Users/bryanfeeney/Desktop/NIPS"
    with open(path + "/ar.pkl", 'rb') as f:
        _, W, _, d = pkl.load(f)
    if len(d) == 1:
        d = d[0]

    if W.dtype != dtype:
        W = W.astype(dtype)

    # Drop any empty documents (rows with no words) from the doc-term matrix
    docLens = np.squeeze(np.asarray(W.sum(axis=1)))
    good_rows = np.where(docLens > 0.5)[0]
    if len(good_rows) < W.shape[0]:
        print("Some rows in the doc-term matrix are empty. These have been removed.")
        W = W[good_rows, :]

    # IDF-style reciprocal-frequency scaling, for when we print out the vocab later
    freq = np.squeeze(np.asarray(W.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    # Initialise the model
    K = 10
    model = lda.newModelAtRandom(W, K, dtype=dtype)
    queryState = lda.newQueryState(W, model)
    trainPlan = lda.newTrainPlan(iterations=40, logFrequency=10, fastButInaccurate=False, debug=True)

    # Train the model; uncomment the block below to save the result to a file
    # for subsequent inspection
    model, query, (bndItrs, bndVals, bndLikes) = lda.train(W, None, model, queryState, trainPlan)
    # with open(newModelFileFromModel(model), "wb") as f:
    #     pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound and the likelihood during training
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    plt.show()

    # Show the topic-word matrix as a grey-scale image
    vocab = lda.vocab(model)
    plt.imshow(vocab, interpolation="nearest", cmap=cm.Greys_r)
    plt.show()

    # Print out the most likely topic words
    topWordCount = 100
    kTopWordInds = [self.topWordInds(d, vocab[k, :] * scale, topWordCount)
                    for k in range(K)]

    print("Prior %s" % (str(model.topicPrior)))
    print("Perplexity: %f\n\n" % lda.perplexity(W, model, query))
    print("\t\t".join(["Topic " + str(k) for k in range(K)]))
    print("\n".join("\t".join(d[kTopWordInds[k][c]] + "\t%0.4f" % vocab[k][kTopWordInds[k][c]]
                              for k in range(K)) for c in range(topWordCount)))
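
# `self.topWordInds` is called above but not defined in this section. Below is
# a minimal sketch of a compatible helper, assuming it simply returns the
# indices of the `count` highest-weighted words. The signature is inferred from
# the call site (the dictionary `d` is accepted only to mirror that call, and
# is unused here); this is an illustrative guess, not the project's actual
# implementation.
def topWordInds(self, d, wordWeights, count=10):
    # Sort descending by weight and keep the first `count` indices
    return np.argsort(-wordWeights)[:count]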