Exemplo n.º 1
0
 def testOnRealData(self):
     """
     Train the model on the dataset pickled at a hard-coded local path, save
     the trained result, plot the bound and likelihood recorded during
     training, then print the perplexity and the top words of every topic.

     The pickle is expected to hold the tuple (X, W, feats_dict, dic). This
     method has no assertions: it only produces a saved file, plots and
     printed output.
     """
     rd.seed(0xDAFF0D12)
     dataDir = "/Users/bryanfeeney/Desktop/NIPS"
     with open(dataDir + "/ar.pkl", "rb") as pickleFile:
         X, W, feats_dict, dic = pkl.load(pickleFile)

     # Only convert on a dtype mismatch; matching inputs are used as loaded.
     W = W.astype(DTYPE) if W.dtype != DTYPE else W
     X = X.astype(DTYPE) if X.dtype != DTYPE else X

     # None of these sizes is used below, but the unpacking does require
     # both matrices to be two-dimensional.
     docCount, wordCount = W.shape
     _, featCount = X.shape

     # scale[t] = 1 / (1 + sum of column t of W)
     columnTotals = W.sum(axis=0)
     freq  = np.squeeze(np.asarray(columnTotals))
     scale = np.reciprocal(1. + freq)

     K = 10
     P = 30
     model      = stm.newModelAtRandom(X, W, P, K, 0.1, 0.1, dtype=DTYPE)
     queryState = stm.newQueryState(W, model)
     trainPlan  = stm.newTrainPlan(iterations=100, logFrequency=1, fastButInaccurate=True, debug=True)

     model, query, (bndItrs, bndVals, bndLikes) = stm.train (W, X, model, queryState, trainPlan)
     trainedResult = (model, query, (bndItrs, bndVals, bndLikes))
     with open(newModelFile("stm-yv-bou-nips-ar", K, None), "wb") as modelFile:
         pkl.dump (trainedResult, modelFile)

     # Bound on the left-hand axis (blue), likelihood on a twin right-hand
     # axis (red), both against the logged iteration numbers.
     fig, boundAxis = plt.subplots()
     boundAxis.plot(bndItrs, bndVals, 'b-')
     boundAxis.set_xlabel('Iterations')
     boundAxis.set_ylabel('Bound', color='b')

     likelyAxis = boundAxis.twinx()
     likelyAxis.plot(bndItrs, bndLikes, 'r-')
     likelyAxis.set_ylabel('Likelihood', color='r')

     fig.show()
     plt.show()

     # Words are ranked per topic on the scaled vocabulary row...
     topWordCount = 100
     kTopWordInds = []
     for k in range(K):
         scaledRow = model.vocab[k,:] * scale
         kTopWordInds.append(self.topWordInds(dic, scaledRow, topWordCount))

     perplexity = ctm.perplexity(W, model, query)
     print ("Perplexity: %f\n\n" % perplexity)
     print ("\t\t".join ("Topic " + str(k) for k in range(K)))

     # ...but the value printed beside each word is the unscaled vocabulary
     # entry. One output line per rank, one word/value pair per topic.
     outputLines = []
     for rank in range(topWordCount):
         cells = []
         for k in range(K):
             wordInd = kTopWordInds[k][rank]
             cells.append(dic[wordInd] + "\t%0.4f" % model.vocab[k][wordInd])
         outputLines.append("\t".join (cells))
     print ("\n".join (outputLines))
Exemplo n.º 2
0
 def _testLikelihoodOnModelDerivedExample(self):
     """
     Cross-validate the model on data drawn from sampleFromModel().

     The D sampled documents are split into `folds` folds. For each fold
     the model is trained on D - foldSize documents and then queried on the
     following foldSize documents (indices wrap around modulo D). The
     log-likelihood and word count of both sets are recorded, and at the end
     the per-word perplexity exp(-logLikelihood / wordCount) of each set is
     printed and asserted to be below a fixed maximum.

     Several blocking plots are shown along the way (the sampled vocabulary,
     then per fold the bound/likelihood curves, the topic covariance and the
     learnt vocabulary).
     """
     print("Cross-validated likelihoods on model-derived example")
     
     rd.seed(0xBADB055) # Global init for repeatable test
     D, T, K, F, P = 200, 100, 10, 12, 8
     tpcs, vocab, docLens, X, W = sampleFromModel(D, T, K, F, P)
     
     plt.imshow(vocab, interpolation="none", cmap = cm.Greys_r)
     plt.show()
     
     W = W.astype(DTYPE)
     X = X.astype(DTYPE)
     
     # Create the cross-validation folds. The fold size is the integer
     # ceiling of D / folds, computed without ceil() so that it is always an
     # int (safe for index arithmetic) and follows `folds` if that changes.
     folds     = 5
     foldSize  = (D + folds - 1) // folds
     querySize = foldSize
     trainSize = D - querySize
     
     maxPerplexity = 60.0
     
     trainLikely = []
     trainWordCount = []
     queryLikely = []
     queryWordCount = []
     
     for fold in range(folds):
         # Split the datasets: a window of trainSize documents starting at
         # this fold's offset, then the next querySize documents, both
         # wrapping around the end of the corpus.
         start = fold * foldSize
         end   = start + trainSize
         
         trainSet = np.arange(start, end) % D
         querySet = np.arange(end, end + querySize) % D
         
         X_train, W_train = X[trainSet,:], W[trainSet,:]
         X_query, W_query = X[querySet,:], W[querySet,:]
         
         # Train the model
         model = stm.newModelAtRandom(X_train, W_train, P, K, 0.1, 0.1, dtype=DTYPE)
         queryState = stm.newQueryState(W_train, model)
         
         plan  = stm.newTrainPlan(iterations=1000, logFrequency=1)
         model, query, (bndItrs, bndVals, bndLikes) = stm.train (W_train, X_train, model, queryState, plan)
         
         # Plot the evolution of the bound during training.
         fig, ax1 = plt.subplots()
         ax1.plot(bndItrs, bndVals, 'b-')
         ax1.set_xlabel('Iterations')
         ax1.set_ylabel('Bound', color='b')
         
         ax2 = ax1.twinx()
         ax2.plot(bndItrs, bndLikes, 'r-')
         ax2.set_ylabel('Likelihood', color='r')
         
         fig.show()
         plt.show()
     
         # Plot the topic covariance
         self._plotCov(model)
         
         # Plot the vocab
         plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
         plt.show()
         
         # Calculating the training set likelihood. Use the state returned
         # by stm.train() rather than the one created before training, so
         # the result does not depend on train() updating its input in place.
         trainLikely.append(stm.log_likelihood(W_train, model, query))
         # NOTE(review): .data implies W is a sparse matrix whose stored
         # entries are the word counts -- confirm against sampleFromModel().
         trainWordCount.append(W_train.data.sum())
         
         # Now query the model.
         plan       = stm.newTrainPlan(iterations=1000)
         queryState = stm.newQueryState(W_query, model)
         model, queryState = stm.query(W_query, X_query, model, queryState, plan)
         
         queryLikely.append(stm.log_likelihood(W_query, model, queryState))
         queryWordCount.append(W_query.data.sum())
         
     # Check and print results.
     foldResults = zip(trainLikely, trainWordCount, queryLikely, queryWordCount)
     for fold, (trainLL, trainWords, queryLL, queryWords) in enumerate(foldResults):
         trainPerp = np.exp(-trainLL / trainWords)
         queryPerp = np.exp(-queryLL / queryWords)
         
         print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLL, queryLL))
         print("                    Perplexity: %12.2f \t           Perplexity: %12.2f" % (trainPerp, queryPerp))
     
         # assertLess reports both values on failure, unlike assertTrue(a < b).
         self.assertLess(queryPerp, maxPerplexity)
         self.assertLess(trainPerp, maxPerplexity)
         
     print("End of Test")