Exemplo n.º 1
0
def runDeltaLDA(data, numsamp, randseed, binarizedOnly):
    """Run the deltaLDA analysis on 'data' based on data generated from
    runsinfo.mat and return the result.

    ARGUMENTS:
    * data - mapping providing 'bindocs', 'alpha', 'beta' and 'Fvector'
      (plus 'docs' unless binarizedOnly is true); the values are handed
      to deltaLDA unchanged
    * numsamp - specifies how many samples to take from the Gibbs sampler
    * randseed - is used to initialize the Gibbs sampler random number generator
    * binarizedOnly - when true, skip the run on data['docs'] and only
      analyse data['bindocs']

    RETURNS:
    A dict holding 'phi_bin', 'theta_bin', 'sample_bin', 'prz_bin' and
    'prz_w_bin'.  When binarizedOnly is false it additionally holds the
    same quantities for data['docs'] under 'phi', 'theta', 'sample', 'prz'
    and 'prz_w'.
    """

    def normalize(array, axis):
        """Divide the entries of 'array' by their sums along 'axis'."""
        # Re-insert the summed axis with length 1 so that the division
        # broadcasts; this works for arrays of any rank, not just 2D.
        totals = numpy.expand_dims(numpy.sum(array, axis), axis)
        return numpy.divide(array, totals)

    def getprzAndprzw(theta, phi):
        """Derive (prz, prz_w) from one theta/phi pair.

        prz is the vector of column sums of theta, scaled to sum to one and
        returned as a column.  prz_w is the transpose of phi with each row
        of phi weighted by the matching (unscaled) column sum of theta, and
        then every row of the transposed matrix scaled to sum to one.
        """
        prz = numpy.sum(theta, 0)
        prz = prz.reshape((prz.shape[0], 1))
        # The column vector broadcasts across the columns of phi.
        prz_w = numpy.multiply(phi, prz).transpose()
        return prz / numpy.sum(prz), normalize(prz_w, 1)

    def sample(docs):
        """Run deltaLDA on 'docs' with the shared hyperparameters."""
        return deltaLDA(docs,
                        data['alpha'],
                        data['beta'],
                        numsamp,
                        randseed,
                        f=data['Fvector'])

    result = {}
    if not binarizedOnly:
        (result['phi'], result['theta'],
         result['sample']) = sample(data['docs'])

    (result['phi_bin'], result['theta_bin'],
     result['sample_bin']) = sample(data['bindocs'])

    if not binarizedOnly:
        result['prz'], result['prz_w'] = getprzAndprzw(result['theta'],
                                                       result['phi'])
    result['prz_bin'], result['prz_w_bin'] = getprzAndprzw(
        result['theta_bin'], result['phi_bin'])

    return result
Exemplo n.º 2
0
    def testDelta(self):
        """
        Test DeltaLDA with base data/params + f-values

        Docs 4 and 5 carry the label f=1; presumably that selects the second
        alpha row, the only one giving non-zero weight to topic 2.
        """
        our_f = [0, 0, 0, 0, 1, 1]
        alpha = array([[.1, .1, 0],[.1, .1, .1]])

        (phi,theta,sample) = deltaLDA(self.docs,alpha,self.beta,
                                      self.numsamp,self.randseed,f=our_f)

        # theta should assign special topic to docs [4,5]
        maxtheta = argmax(theta,axis=1)
        self.assertEqual(maxtheta[4], 2)
        self.assertEqual(maxtheta[5], 2)
        # theta valid prob matrix
        self.assertTrue(self.matProb(theta))

        # theta rows should sum to 1
        # (array-level check: needs neither reduce(), which is not a builtin
        # on Python 3, nor a particular binding of the name all())
        row_error = abs(theta.sum(axis=1) - float(1))
        self.assertTrue((row_error < self.tol).all())

        # phi for special topic should emph [0]
        maxphi = argmax(phi,axis=1)
        self.assertEqual(maxphi[2], 0)
        # phi valid prob matrix
        self.assertTrue(self.matProb(phi))
Exemplo n.º 3
0
    def testDelta(self):
        """
        Test DeltaLDA with base data/params + f-values

        The last two documents are labelled f=1 while the rest are f=0.
        Only the second alpha row has non-zero mass on topic 2, so (assuming
        f indexes the alpha rows) topic 2 is reserved for docs 4 and 5.
        """
        our_f = [0, 0, 0, 0, 1, 1]
        alpha = array([[.1, .1, 0], [.1, .1, .1]])

        (phi, theta, sample) = deltaLDA(self.docs,
                                        alpha,
                                        self.beta,
                                        self.numsamp,
                                        self.randseed,
                                        f=our_f)

        # theta should assign special topic to docs [4,5]
        maxtheta = argmax(theta, axis=1)
        self.assertEqual(maxtheta[4], 2)
        self.assertEqual(maxtheta[5], 2)
        # theta valid prob matrix
        self.assertTrue(self.matProb(theta))

        # theta rows should sum to 1
        # Checked on the whole array at once instead of folding with
        # reduce(), which is not a builtin on Python 3.
        deviation = abs(theta.sum(axis=1) - float(1))
        self.assertTrue((deviation < self.tol).all())

        # phi for special topic should emph [0]
        maxphi = argmax(phi, axis=1)
        self.assertEqual(maxphi[2], 0)
        # phi valid prob matrix
        self.assertTrue(self.matProb(phi))
Exemplo n.º 4
0
    def testInit(self):
        """
        Test standard LDA with init from previous sample
        (this doesn't test how the init could affect behavior,
        just checks that using an init doesn't fail completely...)
        """
        # Give stupid init
        (phi, theta, sample) = deltaLDA(self.docs,
                                        self.alpha,
                                        self.beta,
                                        self.numsamp,
                                        self.randseed,
                                        init=self.init)

        # theta should clust docs [0,1], [2,3], [4,5]
        maxtheta = argmax(theta, axis=1)
        for first, second in ((0, 1), (2, 3), (4, 5)):
            self.assertEqual(maxtheta[first], maxtheta[second])

        # corresponding phi should emph [1,2], [3,4], [0]
        maxphi = argmax(phi, axis=1)
        for doc, word in ((0, 1), (2, 3), (4, 0)):
            # topic that dominates this doc must have 'word' as its top word
            self.assertEqual(maxphi[maxtheta[doc]], word)
Exemplo n.º 5
0
    def testSanity1(self):
        """ Test no constraints vs deltaLDA implementation

        Runs deltaLDA and a DirichletForest without any constraints on the
        same random documents, sample count and seed, then requires that
        their phi and theta matrices agree and are valid probability
        matrices (as judged by self.matAgree / self.matProb).
        """
        # Don't even try unless deltaLDA module present
        if not hasDelta:
            return

        # Randomly generated docs
        docs = [[random.randint(self.W) for i in range(1000)]
                for j in range(100)]

        # Set beta for standard LDA
        ldabeta = self.df.beta * ones((self.T, self.W))

        # Run standard LDA
        (sphi, stheta, ssample) = deltaLDA(docs, self.ldaalpha, ldabeta,
                                           self.numsamp, self.randseed)

        # Run Interactive LDA with empty constraint set
        df = DF.DirichletForest(self.alpha, self.beta, self.eta, self.T,
                                self.W)
        df.inference(docs, self.numsamp, self.randseed)

        # Assert matrix agreement, valid prob dists
        self.assertTrue(self.matAgree(df.phi, sphi))
        self.assertTrue(self.matProb(df.phi))
        self.assertTrue(self.matProb(sphi))

        self.assertTrue(self.matAgree(df.theta, stheta))
        self.assertTrue(self.matProb(df.theta))
        self.assertTrue(self.matProb(stheta))
Exemplo n.º 6
0
    def testSanity1(self):
        """ Test no constraints vs deltaLDA implementation

        deltaLDA and an unconstrained DirichletForest are run on identical
        random documents with the same sample count and seed; their phi and
        theta results must agree and be valid probability matrices
        according to self.matAgree / self.matProb.
        """
        # Don't even try unless deltaLDA module present
        if not hasDelta:
            return

        # Randomly generated docs
        docs = [[random.randint(self.W) for i in range(1000)]
                for j in range(100)]

        # Set beta for standard LDA
        ldabeta = self.df.beta * ones((self.T,self.W))

        # Run standard LDA
        (sphi,stheta,ssample) = deltaLDA(docs,self.ldaalpha,
                                         ldabeta,self.numsamp,self.randseed)

        # Run Interactive LDA with empty constraint set
        # (this line used to be indented with a tab, which Python 3 rejects
        # when mixed with space indentation)
        df = DF.DirichletForest(self.alpha,self.beta,self.eta,
                                self.T,self.W)
        df.inference(docs,self.numsamp,self.randseed)

        # Assert matrix agreement, valid prob dists
        self.assertTrue(self.matAgree(df.phi,sphi))
        self.assertTrue(self.matProb(df.phi))
        self.assertTrue(self.matProb(sphi))

        self.assertTrue(self.matAgree(df.theta,stheta))
        self.assertTrue(self.matProb(df.theta))
        self.assertTrue(self.matProb(stheta))
Exemplo n.º 7
0
    def testSanity(self):
        """
        Sanity check online Gibbs init scheme against deltaLDA

        An initial gamma is built by hand from a deltaLDA run with
        numsamp = 0 and handed to cvbLDA; a second cvbLDA run receives only
        the same random seed.  Both runs must produce agreeing, valid
        phi/theta matrices.
        """
        # Don't even try unless deltaLDA module present
        if not hasDelta:
            return

        randseed = 194582

        # Use 1 'online' Gibbs sample to build initial gamma
        gibbs_docs = [[1,1,2],
                      [1,1,1,1,2],
                      [3,3,3,4],
                      [3,3,3,3,4,4],
                      [0,0,0,0,0],
                      [0,0,0,0]]
        numsamp = 0
        (phi,theta,sample) = deltaLDA(gibbs_docs,self.alpha,self.beta,
                                      numsamp,randseed)
        gamma_init = []
        for (di,d) in enumerate(self.docs_w):
            # one column of topic weights per entry of this document
            gamma = zeros((self.T,len(d)))
            for (i,w) in enumerate(d):
                gamma[:,i] = theta[di,:] * phi[:,w]
                # normalize
                gamma[:,i] = gamma[:,i] / gamma[:,i].sum()
            # save
            gamma_init.append(gamma)

        # Run cvbLDA with this gamma
        (gphi,gtheta,gamma) = cvbLDA(self.docs_w,self.docs_c,
                                     self.alpha,self.beta,
                                     gamma_init=gamma_init,
                                     maxiter=self.maxiter,
                                     convtol=self.convtol)

        # Run cvbLDA no init gamma, same randseed
        (phi,theta,gamma) = cvbLDA(self.docs_w,self.docs_c,
                                   self.alpha,self.beta,
                                   randseed=randseed,
                                   maxiter=self.maxiter,
                                   convtol=self.convtol)

        self.assertTrue(self.matAgree(phi,gphi))
        self.assertTrue(self.matProb(phi))
        self.assertTrue(self.matProb(gphi))

        self.assertTrue(self.matAgree(theta,gtheta))
        self.assertTrue(self.matProb(theta))
        self.assertTrue(self.matProb(gtheta))
Exemplo n.º 8
0
    def testStandard(self):
        """ Test no constraints mode

        Runs deltaLDA and a DirichletForest without constraints on
        self.docs, checks that the forest recovers the expected topic
        structure, and finally checks that both implementations agree.
        """
        # Don't even try unless deltaLDA module present
        if not hasDelta:
            return

        # Temporarily shrink vocab
        W = 5

        # Set beta for standard LDA
        ldabeta = self.beta * ones((self.T,W))

        # Run standard LDA
        (sphi,stheta,ssample) = deltaLDA(self.docs,self.ldaalpha,ldabeta,
                                         self.numsamp,self.randseed)

        # Run Interactive LDA with empty constraint set
        df = DF.DirichletForest(self.alpha,self.beta,self.eta,
                                self.T,W)
        df.inference(self.docs,self.numsamp,self.randseed)

        #
        # First, validate correctness of recovered topics
        #

        # theta should clust docs [0,1], [2,3], [4,5]
        maxtheta = argmax(df.theta,axis=1)
        self.assertEqual(maxtheta[0], maxtheta[1])
        self.assertEqual(maxtheta[2], maxtheta[3])
        self.assertEqual(maxtheta[4], maxtheta[5])

        # corresponding phi should emph [1,2], [3,4], [0]
        maxphi = argmax(df.phi,axis=1)
        self.assertEqual(maxphi[maxtheta[0]], 1)
        self.assertEqual(maxphi[maxtheta[2]], 3)
        self.assertEqual(maxphi[maxtheta[4]], 0)

        # Assert matrix agreement, valid prob dists
        self.assertTrue(self.matAgree(df.phi,sphi))
        self.assertTrue(self.matProb(df.phi))
        self.assertTrue(self.matProb(sphi))

        self.assertTrue(self.matAgree(df.theta,stheta))
        self.assertTrue(self.matProb(df.theta))
        self.assertTrue(self.matProb(stheta))
Exemplo n.º 9
0
    def testStandard(self):
        """ Test no constraints mode

        An unconstrained DirichletForest and deltaLDA are run on self.docs
        with a vocabulary of 5 words.  The forest's topics must show the
        expected document clustering, and the two implementations must
        return agreeing, valid probability matrices.
        """
        # Don't even try unless deltaLDA module present
        if not hasDelta:
            return

        # Temporarily shrink vocab
        W = 5

        # Set beta for standard LDA
        ldabeta = self.beta * ones((self.T, W))

        # Run standard LDA
        (sphi, stheta, ssample) = deltaLDA(self.docs, self.ldaalpha, ldabeta,
                                           self.numsamp, self.randseed)

        # Run Interactive LDA with empty constraint set
        df = DF.DirichletForest(self.alpha, self.beta, self.eta, self.T, W)
        df.inference(self.docs, self.numsamp, self.randseed)

        #
        # First, validate correctness of recovered topics
        #

        # theta should clust docs [0,1], [2,3], [4,5]
        maxtheta = argmax(df.theta, axis=1)
        for first, second in ((0, 1), (2, 3), (4, 5)):
            self.assertEqual(maxtheta[first], maxtheta[second])

        # corresponding phi should emph [1,2], [3,4], [0]
        maxphi = argmax(df.phi, axis=1)
        for doc, word in ((0, 1), (2, 3), (4, 0)):
            self.assertEqual(maxphi[maxtheta[doc]], word)

        # Assert matrix agreement, valid prob dists
        self.assertTrue(self.matAgree(df.phi, sphi))
        self.assertTrue(self.matProb(df.phi))
        self.assertTrue(self.matProb(sphi))

        self.assertTrue(self.matAgree(df.theta, stheta))
        self.assertTrue(self.matProb(df.theta))
        self.assertTrue(self.matProb(stheta))
Exemplo n.º 10
0
def start_delta_lda( good_doc_list, bad_doc_list, next_index ):
    """Run deltaLDA over the good documents followed by the bad ones.

    Every document from good_doc_list is labelled 0 and every document
    from bad_doc_list is labelled 1; the labels are passed to deltaLDA as
    'f'.  next_index is used as the number of columns of the all-ones beta
    matrix.  Returns the (phi, theta, sample) triple produced by deltaLDA.
    """
    docs = good_doc_list + bad_doc_list

    # One label per document, in the same order as 'docs'
    labels = [0] * len(good_doc_list) + [1] * len(bad_doc_list)

    # Two alpha rows; only the second has non-zero weight on the third topic
    alpha_rows = array([[.1, .1, 0],[.1, .1, .1]])
    word_prior = ones((3,next_index))

    sample_count = 200
    seed = 194582

    (phi,theta,sample) = deltaLDA(docs,alpha_rows,word_prior,
                                  sample_count,seed,f=labels)
    return phi,theta,sample
Exemplo n.º 11
0
    def testInit(self):
        """
        Test standard LDA with init from previous sample
        (this doesn't test how the init could affect behavior,
        just checks that using an init doesn't fail completely...)
        """
        # Give stupid init
        (phi,theta,sample) = deltaLDA(self.docs,self.alpha,self.beta,
                                      self.numsamp,self.randseed,init=self.init)

        # theta should clust docs [0,1], [2,3], [4,5]
        maxtheta = argmax(theta,axis=1)
        self.assertEqual(maxtheta[0], maxtheta[1])
        self.assertEqual(maxtheta[2], maxtheta[3])
        self.assertEqual(maxtheta[4], maxtheta[5])

        # corresponding phi should emph [1,2], [3,4], [0]
        maxphi = argmax(phi,axis=1)
        self.assertEqual(maxphi[maxtheta[0]], 1)
        self.assertEqual(maxphi[maxtheta[2]], 3)
        self.assertEqual(maxphi[maxtheta[4]], 0)
Exemplo n.º 12
0
    def testStandard(self):
        """
        Test standard LDA with base data/params

        Checks the document clustering in theta, the top word of each
        cluster's topic in phi, and that both are valid probability
        matrices according to self.matProb.
        """
        (phi, theta, sample) = deltaLDA(self.docs, self.alpha, self.beta,
                                        self.numsamp, self.randseed)
        # theta should clust docs [0,1], [2,3], [4,5]
        maxtheta = argmax(theta, axis=1)
        for first, second in ((0, 1), (2, 3), (4, 5)):
            self.assertEqual(maxtheta[first], maxtheta[second])
        # theta valid prob matrix
        self.assertTrue(self.matProb(theta))

        # corresponding phi should emph [1,2], [3,4], [0]
        maxphi = argmax(phi, axis=1)
        for doc, word in ((0, 1), (2, 3), (4, 0)):
            self.assertEqual(maxphi[maxtheta[doc]], word)
        # phi valid prob matrix
        self.assertTrue(self.matProb(phi))
Exemplo n.º 13
0
    def testStandard(self):
        """
        Test standard LDA with base data/params

        Verifies that documents pair up as expected in theta, that each
        pair's dominant topic has the expected top word in phi, and that
        self.matProb accepts both matrices.
        """
        (phi,theta,sample) = deltaLDA(self.docs,self.alpha,self.beta,
                                      self.numsamp,self.randseed)
        # theta should clust docs [0,1], [2,3], [4,5]
        maxtheta = argmax(theta,axis=1)
        self.assertEqual(maxtheta[0], maxtheta[1])
        self.assertEqual(maxtheta[2], maxtheta[3])
        self.assertEqual(maxtheta[4], maxtheta[5])
        # theta valid prob matrix
        self.assertTrue(self.matProb(theta))

        # corresponding phi should emph [1,2], [3,4], [0]
        maxphi = argmax(phi,axis=1)
        self.assertEqual(maxphi[maxtheta[0]], 1)
        self.assertEqual(maxphi[maxtheta[2]], 3)
        self.assertEqual(maxphi[maxtheta[4]], 0)
        # phi valid prob matrix
        self.assertTrue(self.matProb(phi))
Exemplo n.º 14
0
    def testSanity3(self):
        """ Test beta*eta=X vs deltaLDA with beta=X

        deltaLDA is run with beta = X on random two-word documents, and a
        DirichletForest is run with beta = 1, eta = X and the two words
        merged.  Both must return agreeing, valid phi/theta matrices.
        """
        # Don't even try unless deltaLDA module present
        if not hasDelta:
            return

        # 'magic' X parameter
        X = 50

        # Randomly generated docs
        W = 2
        docs = [[random.randint(W) for i in range(1000)]
                for j in range(100)]

        # Set beta for standard LDA
        ldabeta = X * ones((self.T,W))

        # Run standard LDA
        (sphi,stheta,ssample) = deltaLDA(docs,self.ldaalpha,
                                         ldabeta,self.numsamp,self.randseed)

        # Run Interactive LDA with both words merged, choosing beta and eta
        # so that beta * eta equals the X used for standard LDA above
        eta = X
        beta = 1
        df = DF.DirichletForest(self.alpha,beta,eta,
                                self.T,W)
        df.merge([0],[1])
        df.inference(docs,self.numsamp,self.randseed)

        # Assert matrix agreement, valid prob dists
        self.assertTrue(self.matAgree(df.phi,sphi))
        self.assertTrue(self.matProb(df.phi))
        self.assertTrue(self.matProb(sphi))

        self.assertTrue(self.matAgree(df.theta,stheta))
        self.assertTrue(self.matProb(df.theta))
        self.assertTrue(self.matProb(stheta))
Exemplo n.º 15
0
    def testSanity2(self):
        """ Test eta=1 against deltaLDA implementation

        A DirichletForest with split/merge constraints but constraint
        strength eta = 1 is compared with plain deltaLDA on the same random
        documents; phi and theta must agree and be valid probability
        matrices.
        """
        # Don't even try unless deltaLDA module present
        if not hasDelta:
            return

        # Randomly generated docs
        docs = [[random.randint(self.W) for i in range(1000)]
                for j in range(100)]

        # Set beta for standard LDA
        ldabeta = self.beta * ones((self.T,self.W))

        # Run standard LDA
        # (the wall-clock timing that used to wrap this call was never read)
        (sphi,stheta,ssample) = deltaLDA(docs,self.ldaalpha,
                                         ldabeta,self.numsamp,self.randseed)

        # Run Interactive LDA with the baseline (full-tree) constraints,
        # but temporarily set constraint strength to 1 to build tree
        eta = 1
        df = DF.DirichletForest(self.alpha,self.beta,eta,
                                self.T,self.W)
        df.split([1,2],[3])
        df.split([0],[3])
        df.merge([4],[5])
        df.inference(docs,self.numsamp,self.randseed)

        # Assert matrix agreement, valid prob dists
        self.assertTrue(self.matAgree(df.phi,sphi))
        self.assertTrue(self.matProb(df.phi))
        self.assertTrue(self.matProb(sphi))

        self.assertTrue(self.matAgree(df.theta,stheta))
        self.assertTrue(self.matProb(df.theta))
        self.assertTrue(self.matProb(stheta))
Exemplo n.º 16
0
def start_delta_lda(good_doc_list, bad_doc_list, next_index,
                    numsamp=200, randseed=194582):
    """Run deltaLDA over the good documents followed by the bad ones.

    Arguments:
    * good_doc_list - documents labelled f=0
    * bad_doc_list - documents labelled f=1
    * next_index - number of columns of the all-ones beta matrix
      (presumably the vocabulary size - confirm against the caller)
    * numsamp - number of Gibbs samples handed to deltaLDA (default 200)
    * randseed - random seed handed to deltaLDA (default 194582)

    Returns the (phi, theta, sample) triple produced by deltaLDA.
    """
    docs = good_doc_list + bad_doc_list

    # One label per document, in the same order as 'docs'
    delta_f = [0] * len(good_doc_list) + [1] * len(bad_doc_list)

    # Two alpha rows; only the second gives weight to the third topic
    delta_alpha = array([[.1, .1, 0], [.1, .1, .1]])
    beta = ones((3, next_index))

    (phi, theta, sample) = deltaLDA(docs,
                                    delta_alpha,
                                    beta,
                                    numsamp,
                                    randseed,
                                    f=delta_f)

    return phi, theta, sample
Exemplo n.º 17
0
    def testSanity2(self):
        """ Test eta=1 against deltaLDA implementation

        Plain deltaLDA is compared with a DirichletForest that carries
        split/merge constraints at strength eta = 1, on the same random
        documents, sample count and seed.  The phi and theta matrices must
        agree and be valid probability matrices.
        """
        # Don't even try unless deltaLDA module present
        if not hasDelta:
            return

        # Randomly generated docs
        docs = [[random.randint(self.W) for i in range(1000)]
                for j in range(100)]

        # Set beta for standard LDA
        ldabeta = self.beta * ones((self.T, self.W))

        # Run standard LDA
        # (previously timed with time.time(), but the result was unused)
        (sphi, stheta, ssample) = deltaLDA(docs, self.ldaalpha, ldabeta,
                                           self.numsamp, self.randseed)

        # Run Interactive LDA with the baseline (full-tree) constraints,
        # but temporarily set constraint strength to 1 to build tree
        eta = 1
        df = DF.DirichletForest(self.alpha, self.beta, eta, self.T, self.W)
        df.split([1, 2], [3])
        df.split([0], [3])
        df.merge([4], [5])
        df.inference(docs, self.numsamp, self.randseed)

        # Assert matrix agreement, valid prob dists
        self.assertTrue(self.matAgree(df.phi, sphi))
        self.assertTrue(self.matProb(df.phi))
        self.assertTrue(self.matProb(sphi))

        self.assertTrue(self.matAgree(df.theta, stheta))
        self.assertTrue(self.matProb(df.theta))
        self.assertTrue(self.matProb(stheta))
Exemplo n.º 18
0
    def testSanity3(self):
        """ Test beta*eta=X vs deltaLDA with beta=X

        On random two-word documents, deltaLDA with beta = X is compared
        with a DirichletForest using beta = 1 and eta = X with both words
        merged.  The resulting phi and theta matrices must agree and be
        valid probability matrices.
        """
        # Don't even try unless deltaLDA module present
        if not hasDelta:
            return

        # 'magic' X parameter
        X = 50

        # Randomly generated docs
        W = 2
        docs = [[random.randint(W) for i in range(1000)] for j in range(100)]

        # Set beta for standard LDA
        ldabeta = X * ones((self.T, W))

        # Run standard LDA
        (sphi, stheta, ssample) = deltaLDA(docs, self.ldaalpha, ldabeta,
                                           self.numsamp, self.randseed)

        # Run Interactive LDA with the two words merged; beta and eta are
        # picked so that beta * eta matches the X given to standard LDA
        eta = X
        beta = 1
        df = DF.DirichletForest(self.alpha, beta, eta, self.T, W)
        df.merge([0], [1])
        df.inference(docs, self.numsamp, self.randseed)

        # Assert matrix agreement, valid prob dists
        self.assertTrue(self.matAgree(df.phi, sphi))
        self.assertTrue(self.matProb(df.phi))
        self.assertTrue(self.matProb(sphi))

        self.assertTrue(self.matAgree(df.theta, stheta))
        self.assertTrue(self.matProb(df.theta))
        self.assertTrue(self.matProb(stheta))
Exemplo n.º 19
0
# This command will initialize the Gibbs sampler from a user-supplied sample
#
#(phi,theta,sample) = deltaLDA(docs,alpha,beta,numsamp,randseed,init=sample)

# This command will run standard LDA, but show Gibbs sampler output
# ("Gibbs sample X of Y")
#
#(phi,theta,sample) = deltaLDA(docs,alpha,beta,numsamp,randseed,verbose=1)

# These commands will run deltaLDA
# (use different alpha vectors for different docs, depending on value of f)
#
# NOTE(review): delta_f presumably needs one entry per document - confirm
# that 'docs' holds exactly two documents at this point.
delta_f = [0, 1]
delta_alpha = array([[.1,.1, 0],[.1, .1, .1]])
(phi,theta,sample) = deltaLDA(docs,delta_alpha,beta,numsamp,randseed,f=delta_f)




# theta is the matrix of document-topic probabilities
# (estimated from final sample)
# 
# theta = D x T
# theta[di,zj] = P(z=zj | d=di)
#
# Single-argument print(...) calls produce the same output under Python 2
# and Python 3, unlike the print statement.
print('')
print('Theta - P(z|d)')
print(str(theta))
print('')
Exemplo n.º 20
0
def lda(documents_dist,topic_local_to_universal,alpha,beta):
    """ Runs LDA over a set of documents, saving results over a set of predefined topics

    Arguments:
    * documents_dist - iterable of objects exposing `.distribution` (a
      comma-separated string of hexadecimal numbers) and `.document.id`
    * topic_local_to_universal - indexable by local topic number, giving
      the topic id to store; its length is the number of topics
    * alpha, beta - scalars used to fill the alpha and beta matrices that
      are handed to deltaLDA

    Side effects: inserts one batch of rows per document into
    application_documenttopic (committing after each), writes the
    topic/word weights to /tmp/application_topicword.txt and then calls
    load_data_in_file().

    Returns True.  Raises Exception when no document matrix could be built.
    """
    cursor = connection.cursor()
    n_topics = len(topic_local_to_universal)

    word_local_to_universal = {}
    word_universal_to_local = {}

    print("Getting document matrix...")

    # [:-1] drops the last character of the distribution string
    # (presumably a trailing comma) before it is split into hex numbers.
    dic = [word_mapper([int(str(x), 16)
                        for x in document_dist.distribution[:-1].split(',')],
                       word_local_to_universal,
                       word_universal_to_local)
           for document_dist in documents_dist]
    document_local_to_universal = dict(enumerate([document_dist.document.id for document_dist in documents_dist]))

    n_documents = len(dic)
    n_words = len(word_local_to_universal)

    print("Numero de documentos: "+str(n_documents))
    print("Numero de palabras: "+str(n_words))

    if n_documents == 0:
        raise Exception('LDAmodel has no documents assigned or the documents had only irrelevant words. No document matrix founded.')

    f_label = 1
    numsamp = 50
    randseed = 194582

    alpha_vector = alpha * ones((f_label,n_topics))
    beta_vector = beta * ones((n_topics,n_words))

    print("Calculating LDA using...")
    print("   beta: "+str(beta))
    print("   alpha: "+str(alpha))
    print("   ntopics: "+str(n_topics))

    (phi,theta,sample) = deltaLDA(dic,alpha_vector,beta_vector,numsamp,randseed)
    print("Saving Results...")

    ########################
    #    document_topic
    ########################

    print("Saving Document and topic correlation...")
    goal = 0
    current = 0
    theta_len = len(theta)
    for document_local_id, d in enumerate(theta):
        goal, current = avance(current, theta_len, goal)
        # The statement is assembled from ids and weights produced inside
        # this function, not from user-supplied text.
        document_id = str(document_local_to_universal[document_local_id])
        values = ",".join(
            "("+document_id+","+str(topic_local_to_universal[topic_local_id])+","+str(document_weight)+")"
            for topic_local_id, document_weight in enumerate(d))
        cursor.execute("INSERT INTO application_documenttopic (document_id, topic_id, value) VALUES " + values + ";")
        cursor.execute("COMMIT")

    #####################
    #    topic_word
    #####################

    print("Saving topics and word correlation to file")
    goal = 0
    current = 0
    phi_len = len(phi)

    # NOTE(review): a fixed, world-writable path under /tmp can be
    # pre-created or replaced by other local users - confirm this is
    # acceptable for the deployment.  Kept as best-effort shell calls so a
    # failing chmod does not abort the export.
    os.system("touch /tmp/application_topicword.txt")
    os.system("chmod 777 /tmp/application_topicword.txt")
    FILE = '/tmp/application_topicword.txt'
    print('Opening %s' % FILE)

    # 'with' guarantees the file is flushed and closed even if a lookup or
    # write below raises.
    with open(FILE, 'w') as fw:
        for topic_local_id, t in enumerate(phi):
            goal, current = avance(current, phi_len, goal)
            topic_id = str(topic_local_to_universal[topic_local_id])
            for word_local_id, word_weight in enumerate(t):
                fw.write(topic_id+';'+str(word_local_to_universal[word_local_id])+';'+str(word_weight)+'\n')

    load_data_in_file()

    return True
Exemplo n.º 21
0
# Toy corpus of six documents, each given as a list of integers
# (presumably word indices - confirm against the deltaLDA documentation)
docs = [
    [1, 1, 2],
    [1, 1, 1, 1, 2],
    [3, 3, 3, 4],
    [3, 3, 4, 4, 3, 3],
    [0, 0, 0, 0, 0],
    [0, 0, 0, 0],
]

# Number of samples to draw from the Gibbs sampler
numsamp = 50

# Seed for the Gibbs sampler's random number generator
randseed = 194582

# Standard LDA: no f labels and no initial sample are supplied
phi, theta, sample = deltaLDA(docs, alpha, beta, numsamp, randseed)

# This command will initialize the Gibbs sampler from a user-supplied sample
#
#(phi,theta,sample) = deltaLDA(docs,alpha,beta,numsamp,randseed,init=sample)

# This command will run standard LDA, but show Gibbs sampler output
# ("Gibbs sample X of Y")
#
#(phi,theta,sample) = deltaLDA(docs,alpha,beta,numsamp,randseed,verbose=1)

# These commands will run deltaLDA
# (use different alpha vectors for different docs, depending on value of f)
#
#delta_f = [0, 0, 0, 0, 1, 1]
#delta_alpha = array([[.1, .1, 0],[.1, .1, .1]])
Exemplo n.º 22
0
#
#(phi,theta,sample) = deltaLDA(docs,alpha,beta,numsamp,randseed,init=sample)

# This command will run standard LDA, but show Gibbs sampler output
# ("Gibbs sample X of Y")
#
#(phi,theta,sample) = deltaLDA(docs,alpha,beta,numsamp,randseed,verbose=1)

# These commands will run deltaLDA
# (use different alpha vectors for different docs, depending on value of f)
#
# NOTE(review): delta_f presumably needs one entry per document - confirm
# that 'docs' holds exactly two documents at this point.
delta_f = [0, 1]
delta_alpha = array([[.1, .1, 0], [.1, .1, .1]])
(phi, theta, sample) = deltaLDA(docs,
                                delta_alpha,
                                beta,
                                numsamp,
                                randseed,
                                f=delta_f)

# theta is the matrix of document-topic probabilities
# (estimated from final sample)
#
# theta = D x T
# theta[di,zj] = P(z=zj | d=di)
#
# Single-argument print(...) calls behave identically under Python 2 and
# Python 3, unlike the print statement.
print('')
print('Theta - P(z|d)')
print(str(theta))
print('')

# phi is the matrix of topic-word probabilities
Exemplo n.º 23
0
# Read the corpus from the file 'd': one document per line, each line a
# space-separated list of integers.  The 'with' block closes the file as
# soon as it has been read; previously the handle was never closed.
with open('d', 'r') as f:
    content = f.readlines()

docs = []
for line in content:
    l = [int(item) for item in line.split(' ')]
    docs.append(l)

# numsamp specifies how many samples to take from the Gibbs sampler
numsamp = 50

# randseed is used to initialize the Gibbs sampler random number generator
randseed = 194582

# This command will run the standard LDA model
#
(phi, theta, sample) = deltaLDA(docs, alpha, beta, numsamp, randseed)

# This command will initialize the Gibbs sampler from a user-supplied sample
#
#(phi,theta,sample) = deltaLDA(docs,alpha,beta,numsamp,randseed,init=sample)

# This command will run standard LDA, but show Gibbs sampler output
# ("Gibbs sample X of Y")
#
#(phi,theta,sample) = deltaLDA(docs,alpha,beta,numsamp,randseed,verbose=1)

# These commands will run deltaLDA
# (use different alpha vectors for different docs, depending on value of f)
#
#delta_f = [0, 0, 0, 0, 1, 1]
#delta_alpha = array([[.1, .1, 0],[.1, .1, .1]])