Пример #1
0
 def _doTest (self, W, X, model, queryState, trainPlan):
     """
     Train the model on (W, X), printing and plotting diagnostics as it goes.

     Prints the initial variational bound and reconstruction error, runs
     ``stm.train``, plots the bound and likelihood values recorded during
     training, displays every row of ``model.vocab`` as a 3x3 image, and
     finally prints the reconstruction error after training.

     Params:
     W          - the data matrix to reconstruct; must provide ``shape`` and
                  ``todense()`` (presumably a scipy sparse matrix)
     X          - the second data matrix, passed through to ``stm.train``
     model      - the initial model; its ``vocab`` rows must hold 9 values
                  each, as they are reshaped to 3x3 for display
     queryState - the initial query state; ``queryState.means`` is used for
                  the initial reconstruction
     trainPlan  - the training plan handed to ``stm.train``
     """
     D,_ = W.shape

     # Densify once up front: both reconstruction errors need the dense matrix.
     denseW = np.asarray(W.todense())

     def reconsError(means, vocab):
         # Sum of squared residuals of W against means.dot(vocab), divided by D.
         residual = denseW - means.dot(vocab)
         return 1./D * np.sum(residual * residual)

     reconsErr = reconsError(queryState.means, model.vocab)

     print ("Initial bound is %f\n\n" % ctm.var_bound(W, model, queryState))
     print ("Initial reconstruction error is %f\n\n" % reconsErr)

     model, query, (bndItrs, bndVals, bndLikes) = stm.train (W, X, model, queryState, trainPlan)

     # Plot the evolution of the bound during training.
     fig, ax1 = plt.subplots()
     ax1.plot(bndItrs, bndVals, 'b-')
     ax1.set_xlabel('Iterations')
     ax1.set_ylabel('Bound', color='b')

     ax2 = ax1.twinx()
     ax2.plot(bndItrs, bndLikes, 'r-')
     ax2.set_ylabel('Likelihood', color='r')

     fig.show()
     plt.show()

     # Plot the vocabulary, one 3x3 tile per topic. The grid is at least 2x3
     # and gains extra rows whenever there are more than six topics.
     ones = np.ones((3,3))
     gridCols = 3
     gridRows = max(2, (model.K + gridCols - 1) // gridCols)
     for k in range(model.K):
         # Subplot positions are 1-based, so topic k goes in slot k + 1.
         plt.subplot(gridRows, gridCols, k + 1)
         plt.imshow(ones - model.vocab[k,:].reshape((3,3)), interpolation="none", cmap = cm.Greys_r)
     plt.show()

     # Use the query state returned by training, so that the figure reported
     # reflects the trained state even if stm.train does not update
     # queryState in place.
     # NOTE(review): assumes the returned query exposes ``means`` like queryState - confirm
     reconsErr = reconsError(query.means, model.vocab)
     print ("Final reconstruction error is %f\n\n" % reconsErr)
Пример #2
0
    def testOnRealData(self):
        """
        Fit a randomly initialised model to a pickled dataset and report on it.

        Loads (W, X, dic) from a fixed path on disk, casts both matrices to
        DTYPE, trains for 50 iterations, pickles the training output, plots
        the bound and likelihood recorded during training, and prints the
        perplexity followed by a table of the top 100 word indices per topic
        as returned by ``self.topWordInds``.
        """
        rd.seed(0xDAFF0D12)

        # An earlier variant read (X, W, _, dic) from
        # "/Users/bryanfeeney/Desktop/NIPS/ar.pkl" instead.
        dataDir = "/Users/bryanfeeney/Desktop/SmallerDB-NoCJK-WithFeats-Fixed"
        with open(dataDir + "/all-in-one.pkl", "rb") as dataFile:
            W, X, dic = pkl.load(dataFile)

        # Only copy the matrices when their dtype actually differs
        W = W if W.dtype == DTYPE else W.astype(DTYPE)
        X = X if X.dtype == DTYPE else X.astype(DTYPE)

        # Not used further down; the unpacking still fails fast on non-2-D input
        docCount, wordCount = W.shape
        _, featCount        = X.shape

        # Per-column weights 1 / (1 + column sum of W), used to rank words below
        columnTotals = np.squeeze(np.asarray(W.sum(axis=0)))
        invFreq      = np.reciprocal(1. + columnTotals)

        K, P = 10, 5
        model      = stm.newModelAtRandom(X, W, P, K, 0.1, 0.1, dtype=DTYPE)
        queryState = stm.newQueryState(W, model)
        trainPlan  = stm.newTrainPlan(iterations=50, logFrequency=1, debug=True)

        model, query, (bndItrs, bndVals, bndLikes) = stm.train (W, X, model, queryState, trainPlan)

        # Persist the trained model and the bound history
        trainOutput = (model, query, (bndItrs, bndVals, bndLikes))
        with open(newModelFile("stm-yv-bohn-nips-ar", K, None), "wb") as modelFile:
            pkl.dump (trainOutput, modelFile)

        # Bound (blue, left axis) and likelihood (red, right axis) per iteration
        fig, boundAxis = plt.subplots()
        boundAxis.plot(bndItrs, bndVals, 'b-')
        boundAxis.set_xlabel('Iterations')
        boundAxis.set_ylabel('Bound', color='b')

        likelyAxis = boundAxis.twinx()
        likelyAxis.plot(bndItrs, bndLikes, 'r-')
        likelyAxis.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        # Rank the words of every topic by their frequency-weighted vocabulary row
        topWordCount = 100
        kTopWordInds = []
        for k in range(K):
            weighted = model.vocab[k,:] * invFreq
            kTopWordInds.append(self.topWordInds(dic, weighted, topWordCount))

        print ("Perplexity: %f\n\n" % ctm.perplexity(W, model, query))
        print ("\t\t".join ("Topic " + str(k) for k in range(K)))

        # One output row per rank, one "word<TAB>probability" cell per topic
        tableRows = []
        for c in range(topWordCount):
            cells = []
            for k in range(K):
                wordInd = kTopWordInds[k][c]
                cells.append(dic[wordInd] + "\t%0.4f" % model.vocab[k][wordInd])
            tableRows.append("\t".join (cells))
        print ("\n".join (tableRows))
Пример #3
0
    def _testLikelihoodOnModelDerivedExample(self):
        """
        Cross-validate the model on data drawn by ``self._sampleFromModel()``.

        The documents are split into ``folds`` rotating train/query partitions.
        For each fold a new random model is trained on the training part
        (plotting the bound, the topic covariance and the vocabulary), then
        the held-out part is queried. Log-likelihoods and word counts are
        collected for both parts, and perplexity is computed per fold as
        ``exp(-likelihood / wordCount)``. The test asserts that both the
        training and query perplexity of every fold are below 60.
        """
        print("Cross-validated likelihoods on model-derived example")

        rd.seed(0xBADB055) # Global init for repeatable test
        # NOTE(review): D is presumably the number of rows that
        # _sampleFromModel() generates by default - confirm against that helper
        D, K, P = 200, 10, 8
        _, vocab, _, X, W = self._sampleFromModel()

        plt.imshow(vocab, interpolation="none", cmap = cm.Greys_r)
        plt.show()

        W = W.astype(DTYPE)
        X = X.astype(DTYPE)

        # Create the cross-validation folds. The fold size is the ceiling of
        # D / folds, computed with integers so it stays a valid array index
        # bound whichever ceil() and division semantics are in play.
        folds     = 5
        foldSize  = (D + folds - 1) // folds
        querySize = foldSize
        trainSize = D - querySize

        trainLikely = []
        trainWordCount = []
        queryLikely = []
        queryWordCount = []

        for fold in range(folds):
            # Split the datasets: a window of trainSize rows followed by
            # querySize rows, both wrapping around the end of the data
            start = fold * foldSize
            end   = start + trainSize

            trainSet = np.arange(start,end) % D
            querySet = np.arange(end, end + querySize) % D

            X_train, W_train = X[trainSet,:], W[trainSet,:]
            X_query, W_query = X[querySet,:], W[querySet,:]

            # Train the model
            model = stm.newModelAtRandom(X_train, W_train, P, K, 0.1, 0.1, dtype=DTYPE)
            queryState = stm.newQueryState(W_train, model)

            plan  = stm.newTrainPlan(iterations=100, logFrequency=1)
            model, trainedState, (bndItrs, bndVals, bndLikes) = stm.train (W_train, X_train, model, queryState, plan)

            # Plot the evolution of the bound during training.
            fig, ax1 = plt.subplots()
            ax1.plot(bndItrs, bndVals, 'b-')
            ax1.set_xlabel('Iterations')
            ax1.set_ylabel('Bound', color='b')

            ax2 = ax1.twinx()
            ax2.plot(bndItrs, bndLikes, 'r-')
            ax2.set_ylabel('Likelihood', color='r')

            fig.show()
            plt.show()

            # Plot the topic covariance
            self._plotCov(model)

            # Plot the vocab
            plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
            plt.show()

            # Calculating the training set likelihood. This uses the state
            # returned by stm.train, which is right whether or not training
            # updates the initial query state in place.
            trainLikely.append(stm.log_likelihood(W_train, model, trainedState))
            trainWordCount.append(W_train.data.sum())

            # Now query the model.
            plan       = stm.newTrainPlan(iterations=100)
            queryState = stm.newQueryState(W_query, model)
            model, queryState = stm.query(W_query, X_query, model, queryState, plan)

            queryLikely.append(stm.log_likelihood(W_query, model, queryState))
            queryWordCount.append(W_query.data.sum())

        maxPerplexity = 60.0 # Maximum perplexity.
        for fold in range(folds):
            trainPerp = np.exp(-trainLikely[fold]/trainWordCount[fold])
            queryPerp = np.exp(-queryLikely[fold]/queryWordCount[fold])

            print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
            print("                    Perplexity: %12.2f \t           Perplexity: %12.2f" % (trainPerp, queryPerp))

            # assertLess reports both operands when the check fails
            self.assertLess(queryPerp, maxPerplexity)
            self.assertLess(trainPerp, maxPerplexity)

        print("End of Test")