Example #1
    def testCrossValPerplexityOnRealDataWithCtmInc(self):
        dtype = np.float64 # the module-level DTYPE constant, fixed here to double precision

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Initialise the model
        trainPlan = ctm.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
        queryPlan = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=False)

        topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        for K in topicCounts:
            trainPerps = []
            queryPerps = []
            for fold in range(1): # Only one fold to keep the test fast; use range(NumFolds) for full cross-validation
                trainData, queryData = data.cross_valid_split(fold, NumFolds)

                model = ctm.newModelAtRandom(trainData, K, dtype=dtype)
                query = ctm.newQueryState(trainData, model)

                # Train the model, keeping the training result for the likelihood calculation below
                model, trainResult, (_, _, _) = ctm.train(trainData, model, query, trainPlan)

                like = ctm.log_likelihood(trainData, model, trainResult)
                perp = perplexity_from_like(like, trainData.word_count)
                trainPerps.append(perp)

                query = ctm.newQueryState(queryData, model)
                model, queryResult = ctm.query(queryData, model, query, queryPlan)

                like = ctm.log_likelihood(queryData, model, queryResult)
                perp = perplexity_from_like(like, queryData.word_count)
                queryPerps.append(perp)

            # Append the mean over the folds actually run as the final column of each row
            trainPerps.append(sum(trainPerps) / len(trainPerps))
            queryPerps.append(sum(queryPerps) / len(queryPerps))
            print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
            print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
Example #2
 def _testOnModelDerivedExample(self):
     print("Cross-validated likelihoods on model-derived example")
     
     rd.seed(0xBADB055) # Global init for repeatable test
     D, T, K = 1000, 100, 7 # Document count, vocabulary size ("term count") and topic count
     tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)
     
     W = W.astype(DTYPE)
     
     plt.imshow(vocab, interpolation="none", cmap=cm.Greys_r)
     plt.show()
     
     
     # Create the cross-validation folds
     folds     = 5
     foldSize  = ceil(D / folds)
     querySize = foldSize
     trainSize = D - querySize
     
     for useDiagonalPriorCov in [False, True]:
         trainLikely = []
         trainWordCount = []
         queryLikely = []
         queryWordCount = []
         
         for fold in range(folds):
             # Split the datasets
             start = fold * foldSize
             end   = start + trainSize
             
             trainSet = np.arange(start,end) % D
             querySet = np.arange(end, end + querySize) % D
             
             W_train = W[trainSet,:]
             W_query = W[querySet,:]
             
             # Train the model
             model = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
             queryState = ctm.newQueryState(W_train, model)
             
             plan  = ctm.newTrainPlan(iterations=20, logFrequency=1, fastButInaccurate=useDiagonalPriorCov)
             model, queryState, (bndItrs, bndVals) = ctm.train(W_train, None, model, queryState, plan)
                 
             # Plot the evolution of the bound during training.
             plt.plot(bndItrs[5:], bndVals[5:])
             plt.xlabel("Iterations")
             plt.ylabel("Variational Bound")
             plt.show()
         
             # Plot the topic covariance
             self._plotCov(model)
             
             # Plot the vocab
             plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
             plt.show()
             
             # Calculate the training-set likelihood
             trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
             trainWordCount.append(W_train.data.sum())
             
             # Now query the model.
             plan       = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
             queryState = ctm.newQueryState(W_query, model)
             model, queryState = ctm.query(W_query, None, model, queryState, plan)
             
             queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
             queryWordCount.append(W_query.data.sum())
          
         # Print out the likelihood and perplexity for each fold.   
         print ("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
         for fold in range(folds):
             trainPerp = np.exp(-trainLikely[fold]/trainWordCount[fold])
             queryPerp = np.exp(-queryLikely[fold]/queryWordCount[fold])
             
             print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
             print("                    Perplexity: %12.2f \t           Perplexity: %12.2f" % (trainPerp, queryPerp))
     
             self.assertTrue(queryPerp < 60.0) # Maximum perplexity.
             self.assertTrue(trainPerp < 60.0)
         print ("\n\n")
         
     print("End of Test")