def testOnRealData(self): print ("CTM/Bohning") rd.seed(0xC0FFEE) dtype = np.float64 path = "/Users/bryanfeeney/Desktop/NIPS" with open(path + "/ar.pkl", 'rb') as f: _, W, _, d = pkl.load(f) if len(d) == 1: d = d[0] if W.dtype != dtype: W = W.astype(dtype) docLens = np.squeeze(np.asarray(W.sum(axis=1))) good_rows = (np.where(docLens > 0.5))[0] if len(good_rows) < W.shape[0]: print ("Some rows in the doc-term matrix are empty. These have been removed.") W = W[good_rows, :] # IDF frequency for when we print out the vocab later freq = np.squeeze(np.asarray(W.sum(axis=0))) scale = np.reciprocal(1 + freq) # Initialise the model K = 20 model = ctm.newModelAtRandom(W, K, dtype=dtype) queryState = ctm.newQueryState(W, model) trainPlan = ctm.newTrainPlan(iterations=750, logFrequency=10, fastButInaccurate=False, debug=True) # Train the model, and the immediately save the result to a file for subsequent inspection model, query, (bndItrs, bndVals, bndLikes) = ctm.train (W, None, model, queryState, trainPlan) with open(newModelFile("ctm-bohn-nips-ar", K, None), "wb") as f: pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f) # Plot the bound fig, ax1 = plt.subplots() ax1.plot(bndItrs, bndVals, 'b-') ax1.set_xlabel('Iterations') ax1.set_ylabel('Bound', color='b') ax2 = ax1.twinx() ax2.plot(bndItrs, bndLikes, 'r-') ax2.set_ylabel('Likelihood', color='r') fig.show() fig.suptitle("CTM/Bohning (Identity Cov) on NIPS") plt.show() topWordCount = 100 kTopWordInds = [self.topWordInds(d, model.vocab[k,:] * scale, topWordCount) \ for k in range(K)] print ("Perplexity: %f\n\n" % ctm.perplexity(W, model, query)) print ("\t\t".join (["Topic " + str(k) for k in range(K)])) print ("\n".join ("\t".join (d[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]] for k in range(K)) for c in range(topWordCount)))
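
# `topWordInds` is referenced as a helper on the test class but isn't shown in
# this snippet. A minimal sketch of what it presumably does, assuming it just
# ranks the (scaled) vocabulary distribution for one topic and returns the
# indices of its `count` largest entries, most probable first. The `dic`
# parameter is accepted only to match the call sites above; this sketch
# ranks purely by weight.
def topWordInds(self, dic, vocab_row, count=100):
    return np.argsort(vocab_row)[::-1][:count]
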
def testOnRealData(self):
    rd.seed(0xDAFF0D12)

    # path = "/Users/bryanfeeney/Desktop/NIPS"
    # with open(path + "/ar.pkl", "rb") as f:
    #     X, W, _, dic = pkl.load(f)

    path = "/Users/bryanfeeney/Desktop/SmallerDB-NoCJK-WithFeats-Fixed"
    with open(path + "/all-in-one.pkl", "rb") as f:
        W, X, dic = pkl.load(f)

    if W.dtype != DTYPE:
        W = W.astype(DTYPE)
    if X.dtype != DTYPE:
        X = X.astype(DTYPE)

    # D documents over T terms, with F side-information features per document
    D, T = W.shape
    _, F = X.shape

    # IDF-style scaling for when we print out the vocab later
    freq = np.squeeze(np.asarray(W.sum(axis=0)))
    scale = np.reciprocal(1. + freq)

    # Initialise the model
    K = 10
    P = 5
    model = stm.newModelAtRandom(X, W, P, K, 0.1, 0.1, dtype=DTYPE)
    queryState = stm.newQueryState(W, model)
    trainPlan = stm.newTrainPlan(iterations=50, logFrequency=1, debug=True)

    # Train the model, then immediately save the result to a file for subsequent inspection
    model, query, (bndItrs, bndVals, bndLikes) = stm.train(W, X, model, queryState, trainPlan)
    with open(newModelFile("stm-yv-bohn-nips-ar", K, None), "wb") as f:
        pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound during training.
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')
    plt.show()

    # Print the top words
    topWordCount = 100
    kTopWordInds = [self.topWordInds(dic, model.vocab[k, :] * scale, topWordCount)
                    for k in range(K)]

    print("Perplexity: %f\n\n" % stm.perplexity(W, model, query))
    print("\t\t".join(["Topic " + str(k) for k in range(K)]))
    print("\n".join("\t".join(dic[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]]
                              for k in range(K))
                    for c in range(topWordCount)))
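
# `newModelFile` is a shared test helper that isn't shown in this snippet. A
# minimal sketch, assuming it simply builds a unique pickle path out of the
# model name, the topic count and an optional fold index; the output directory
# and naming pattern here are illustrative guesses, not the project's actual
# convention:
def newModelFile(modelName, K, fold, outDir="/tmp"):
    import time
    foldPart = "all" if fold is None else str(fold)
    return "%s/%s-k-%d-fold-%s-%d.pkl" % (outDir, modelName, K, foldPart, int(time.time()))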