Exemplo n.º 1
0
 def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
     print ("Fully handcrafted example, K=Q")
     rd.seed(0xC0FFEE) # Global init for repeatable test
     
     T = 100 # Vocabulary size, the number of "terms". Must be a square number
     Q = 6   # Topics: This cannot be changed without changing the code that generates the vocabulary
     K = 6   # Observed topics
     P = 8   # Features
     F = 12  # Observed features
     D = 200 # Sample documents (each with associated features) 
     
     avgWordsPerDoc = 500
     
     # The vocabulary. Presented graphically there are two with horizontal bands
     # (upper lower); two with vertical bands (left, right);  and two with 
     # horizontal bands (inside, outside)
     vocab = makeSixTopicVocab(T)
     
     # Create our (sparse) features X, then our topic proportions ("tpcs")
     # then our word counts W
     lmda = np.zeros((D,K))
     X    = np.zeros((D,F))
     for d in range(D):
         for _ in range(3):
             lmda[d,rd.randint(K)] += 1./3
         for _ in range(int(F/3)):
             X[d,rd.randint(F)] += 1
     
     A = rd.random((K,F))
     X = lmda.dot(la.pinv(A).T)
     X = ssp.csr_matrix(X)
     
     tpcs = lmda
     
     docLens = rd.poisson(avgWordsPerDoc, (D,))
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers
     W = ssp.csr_matrix(W)
     
     #
     # Now finally try to train the model
     #
     modelState = newVbModelState(K, Q, F, P, T)
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, iterations=1)
     tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     priorReconsError = np.sum(np.square(W - W_inf)) / D
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, plotInterval = 100, iterations=130)
     tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     
     print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
     print ("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.square(W - W_inf)) / D,))
     
     print("End of Test")
Exemplo n.º 2
0
 def testLikelihoodOnModelDerivedExample(self):
     print("Cross-validated likelihoods on model-derived example")
     
     rd.seed(0xBADB055) # Global init for repeatable test
     modelState, _, _, _, X, W = self._sampleFromModel()
     D, T, K, Q, F, P = X.shape[0], modelState.T, modelState.K, modelState.Q, modelState.F, modelState.P
     
     # Create the cross-validation folds
     folds     = 5
     foldSize  = ceil(D / 5)
     querySize = foldSize
     trainSize = D - querySize
     
     for fold in range(folds):
         start = fold * foldSize
         end   = start + trainSize
         
         trainSet = np.arange(start,end) % D
         querySet = np.arange(end, end + querySize) % D
         
         X_train, W_train = X[trainSet,:], W[trainSet,:]
         X_query, W_query = X[querySet,:], W[querySet,:]
         
         modelState = newVbModelState(K, Q, F, P, T)
         modelState, queryState = train(modelState, X_train, W_train, iterations=100, logInterval=10, plotInterval=100)
         trainSetLikely = log_likelihood(modelState, X_train, W_train, queryState)
         
         queryState = query(modelState, X_query, W_query, iterations=50, epsilon=0.001, logInterval = 10, plotInterval = 100)
         querySetLikely = log_likelihood(modelState, X_query, W_query, queryState)
         
         print("Fold %d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainSetLikely, querySetLikely))
         print("")
         
     print("End of Test")
Exemplo n.º 3
0
 def _testInferenceFromHandcraftedExample(self):
     print ("Partially hand-crafted example")
     rd.seed(0xC0FFEE) # Global init for repeatable test
     
     T = 100 # Vocabulary size, the number of "terms". Must be a square number
     Q = 6   # Topics: This cannot be changed without changing the code that generates the vocabulary
     K = 10  # Observed topics
     P = 8   # Features
     F = 12  # Observed features
     D = 200 # Sample documents (each with associated features) 
     
     avgWordsPerDoc = 500
     
     # Determine what A, U, Y and V should be
     U = rd.random((K,Q))
     Y = rd.random((Q,P))
     V = rd.random((F,P))
     A = U.dot(Y).dot(V.T)
     
     # The vocabulary. Presented graphically there are two with horizontal bands
     # (upper lower); two with vertical bands (left, right);  and two with 
     # horizontal bands (inside, outside)
     vocab = makeSixTopicVocab(T)
     
     # Create our (sparse) features X, then our topic proportions ("tpcs")
     # then our word counts W
     X_low = np.array([1 if rd.random() < 0.3 else 0 for _ in range(D*P)]).reshape(D,P)
     X     = ssp.csr_matrix(X_low.dot(V.T))
     
     lmda_low = X_low.dot(Y.T)
     print ("lmda_low.mean() = %f" % (lmda_low.mean()))
     tpcs = rowwise_softmax (lmda_low)
     
     docLens = rd.poisson(avgWordsPerDoc, (D,))
     W = tpcs.dot(vocab)
     W *= docLens[:, np.newaxis]
     W = np.array(W, dtype=np.int32) # truncate word counts to integers
     W = ssp.csr_matrix(W)
     
     #
     # Now finally try to train the model
     #
     modelState = newVbModelState(K, Q, F, P, T)
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, plotInterval = 10, iterations=10)
     
     tpcs_inf = rowwise_softmax(queryState.lmda)
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
             
     print("Handcrafted Test-Case")
     print("=====================================================================")
     print("Average, squared, per-element difference between true and estimated:")
     print("    Topic Distribution:    %f" % (np.sum(np.square(tpcs.dot(U.T) - tpcs_inf)) / len(tpcs),))
     print("    Vocab Distribution:    %f" % (np.sum(np.square(U.dot(vocab) - trainedState.vocab)) / len(vocab),))
     print("Average absolute difference between true and reconstructed documents")
     print("    Documents:             %f" % (np.sum(np.abs(W.todense() - W_inf)) / np.sum(W.todense()),))
     
     
     print("End of Test")
Exemplo n.º 4
0
    def _testInferenceOnModelDerivedData(self):
        print("Model derived example")

        rd.seed(0xBADB055)  # Global init for repeatable test
        modelState, tpcs, _, _, X, W = self._sampleFromModel()
        D = X.shape[0]

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           iterations=1)
        tpcs_inf = rowwise_softmax(np.log(queryState.expLmda))  # why safe?
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)
        priorReconsError = np.sum(np.square(W - W_inf)) / D

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           plotInterval=50,
                                           iterations=200)
        tpcs_inf = rowwise_softmax(np.log(queryState.expLmda))
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)

        print("Model Driven: Prior Reconstruction Error: %f" %
              (priorReconsError, ))
        print("Model Driven: Final Reconstruction Error: %f" %
              (np.sum(np.square(W - W_inf)) / D, ))

        print(str(tpcs[4, :]))
        print(str(tpcs_inf[4, :]))

        print("End of Test")
Exemplo n.º 5
0
    def testLikelihoodOnModelDerivedExample(self):
        print("Cross-validated likelihoods on model-derived example")

        rd.seed(0xBADB055)  # Global init for repeatable test
        modelState, _, _, _, X, W = self._sampleFromModel()
        D, T, K, Q, F, P = X.shape[
            0], modelState.T, modelState.K, modelState.Q, modelState.F, modelState.P

        # Create the cross-validation folds
        folds = 5
        foldSize = ceil(D / 5)
        querySize = foldSize
        trainSize = D - querySize

        for fold in range(folds):
            start = fold * foldSize
            end = start + trainSize

            trainSet = np.arange(start, end) % D
            querySet = np.arange(end, end + querySize) % D

            X_train, W_train = X[trainSet, :], W[trainSet, :]
            X_query, W_query = X[querySet, :], W[querySet, :]

            modelState = newVbModelState(K, Q, F, P, T)
            modelState, queryState = train(modelState,
                                           X_train,
                                           W_train,
                                           iterations=100,
                                           logInterval=10,
                                           plotInterval=100)
            trainSetLikely = log_likelihood(modelState, X_train, W_train,
                                            queryState)

            queryState = query(modelState,
                               X_query,
                               W_query,
                               iterations=50,
                               epsilon=0.001,
                               logInterval=10,
                               plotInterval=100)
            querySetLikely = log_likelihood(modelState, X_query, W_query,
                                            queryState)

            print(
                "Fold %d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f"
                % (fold, trainSetLikely, querySetLikely))
            print("")

        print("End of Test")
Exemplo n.º 6
0
 def _testInferenceOnModelDerivedData(self):
     print("Model derived example")
     
     rd.seed(0xBADB055) # Global init for repeatable test
     modelState, tpcs, _, _, X, W = self._sampleFromModel()
     D = X.shape[0]
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, iterations=1)
     tpcs_inf = rowwise_softmax(np.log(queryState.expLmda)) # why safe?
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     priorReconsError = np.sum(np.square(W - W_inf)) / D
     
     (trainedState, queryState) = train (modelState, X, W, logInterval=1, plotInterval = 50, iterations=200)
     tpcs_inf = rowwise_softmax(np.log(queryState.expLmda))
     W_inf    = np.array(tpcs_inf.dot(trainedState.vocab) * queryState.docLen[:,np.newaxis], dtype=np.int32)
     
     print ("Model Driven: Prior Reconstruction Error: %f" % (priorReconsError,))
     print ("Model Driven: Final Reconstruction Error: %f" % (np.sum(np.square(W - W_inf)) / D,))
     
     print (str(tpcs[4,:]))
     print (str(tpcs_inf[4,:]))
     
     print("End of Test")       
Exemplo n.º 7
0
    def _testInferenceFromHandcraftedExampleWithKEqualingQ(self):
        print("Fully handcrafted example, K=Q")
        rd.seed(0xC0FFEE)  # Global init for repeatable test

        T = 100  # Vocabulary size, the number of "terms". Must be a square number
        Q = 6  # Topics: This cannot be changed without changing the code that generates the vocabulary
        K = 6  # Observed topics
        P = 8  # Features
        F = 12  # Observed features
        D = 200  # Sample documents (each with associated features)

        avgWordsPerDoc = 500

        # The vocabulary. Presented graphically there are two with horizontal bands
        # (upper lower); two with vertical bands (left, right);  and two with
        # horizontal bands (inside, outside)
        vocab = makeSixTopicVocab(T)

        # Create our (sparse) features X, then our topic proportions ("tpcs")
        # then our word counts W
        lmda = np.zeros((D, K))
        X = np.zeros((D, F))
        for d in range(D):
            for _ in range(3):
                lmda[d, rd.randint(K)] += 1. / 3
            for _ in range(int(F / 3)):
                X[d, rd.randint(F)] += 1

        A = rd.random((K, F))
        X = lmda.dot(la.pinv(A).T)
        X = ssp.csr_matrix(X)

        tpcs = lmda

        docLens = rd.poisson(avgWordsPerDoc, (D, ))
        W = tpcs.dot(vocab)
        W *= docLens[:, np.newaxis]
        W = np.array(W, dtype=np.int32)  # truncate word counts to integers
        W = ssp.csr_matrix(W)

        #
        # Now finally try to train the model
        #
        modelState = newVbModelState(K, Q, F, P, T)

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           iterations=1)
        tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)
        priorReconsError = np.sum(np.square(W - W_inf)) / D

        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           plotInterval=100,
                                           iterations=130)
        tpcs_inf = rowwise_softmax(safe_log(queryState.expLmda))
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)

        print("Model Driven: Prior Reconstruction Error: %f" %
              (priorReconsError, ))
        print("Model Driven: Final Reconstruction Error: %f" %
              (np.sum(np.square(W - W_inf)) / D, ))

        print("End of Test")
Exemplo n.º 8
0
    def _testInferenceFromHandcraftedExample(self):
        print("Partially hand-crafted example")
        rd.seed(0xC0FFEE)  # Global init for repeatable test

        T = 100  # Vocabulary size, the number of "terms". Must be a square number
        Q = 6  # Topics: This cannot be changed without changing the code that generates the vocabulary
        K = 10  # Observed topics
        P = 8  # Features
        F = 12  # Observed features
        D = 200  # Sample documents (each with associated features)

        avgWordsPerDoc = 500

        # Determine what A, U, Y and V should be
        U = rd.random((K, Q))
        Y = rd.random((Q, P))
        V = rd.random((F, P))
        A = U.dot(Y).dot(V.T)

        # The vocabulary. Presented graphically there are two with horizontal bands
        # (upper lower); two with vertical bands (left, right);  and two with
        # horizontal bands (inside, outside)
        vocab = makeSixTopicVocab(T)

        # Create our (sparse) features X, then our topic proportions ("tpcs")
        # then our word counts W
        X_low = np.array([1 if rd.random() < 0.3 else 0
                          for _ in range(D * P)]).reshape(D, P)
        X = ssp.csr_matrix(X_low.dot(V.T))

        lmda_low = X_low.dot(Y.T)
        print("lmda_low.mean() = %f" % (lmda_low.mean()))
        tpcs = rowwise_softmax(lmda_low)

        docLens = rd.poisson(avgWordsPerDoc, (D, ))
        W = tpcs.dot(vocab)
        W *= docLens[:, np.newaxis]
        W = np.array(W, dtype=np.int32)  # truncate word counts to integers
        W = ssp.csr_matrix(W)

        #
        # Now finally try to train the model
        #
        modelState = newVbModelState(K, Q, F, P, T)
        (trainedState, queryState) = train(modelState,
                                           X,
                                           W,
                                           logInterval=1,
                                           plotInterval=10,
                                           iterations=10)

        tpcs_inf = rowwise_softmax(queryState.lmda)
        W_inf = np.array(tpcs_inf.dot(trainedState.vocab) *
                         queryState.docLen[:, np.newaxis],
                         dtype=np.int32)

        print("Handcrafted Test-Case")
        print(
            "====================================================================="
        )
        print(
            "Average, squared, per-element difference between true and estimated:"
        )
        print("    Topic Distribution:    %f" %
              (np.sum(np.square(tpcs.dot(U.T) - tpcs_inf)) / len(tpcs), ))
        print("    Vocab Distribution:    %f" %
              (np.sum(np.square(U.dot(vocab) - trainedState.vocab)) /
               len(vocab), ))
        print(
            "Average absolute difference between true and reconstructed documents"
        )
        print("    Documents:             %f" %
              (np.sum(np.abs(W.todense() - W_inf)) / np.sum(W.todense()), ))

        print("End of Test")