Exemplo n.º 1
0
def main():
    """
    Analyze scraped HTML pages with online variational Bayes LDA.

    Reads the vocabulary from ``./vocab``, loads every ``*.html`` file found
    in the current working directory, runs a single
    ``onlineldavb.OnlineLDA.update_lambda`` step over the whole document set
    and stores the terms, topics, documents, per-document topic weights and
    per-topic term weights through the ``db`` module.

    Side effects: prints progress and a held-out perplexity estimate, and
    writes ``lambda-1.dat`` and ``gamma-1.dat`` to the working directory.
    """

    # The number of topics
    K = 10
    # no of documents (passed to OnlineLDA and used to scale the bound below)
    D = 300

    # Our vocabulary, de-duplicated. set() leaves the term order arbitrary.
    # The context manager guarantees the vocabulary file is closed.
    with open('./vocab') as vocab_file:
        vocab = list(set(vocab_file.readlines()))

    # Add terms and topics to the DB
    db.init()
    db.add_terms(vocab)
    db.add_topics(K)

    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)

    # grab documents
    ### Load your scraped pages, re-tokenize, and vectorize result.
    docset, docnames = [], []
    for filename in os.listdir(os.getcwd()):
        if not filename.endswith('.html'):
            continue

        # Use the charset declared in a <meta> tag, defaulting to utf-8.
        tree = html.parse(filename)
        try:
            encoding = tree.xpath('//meta/@charset')[0]
        except IndexError:
            encoding = 'utf-8'

        with open(filename) as page:
            rawtext = page.read()

        # Pages whose decode raises TypeError are skipped (best effort).
        # encoding issues, see http://stackoverflow.com/questions/19527279/python-unicode-to-ascii-conversion
        try:
            rawtext = rawtext.decode(encoding, errors='backslashreplace')
        except TypeError:
            continue

        docset.append(clean_html(rawtext))
        docnames.append(filename[:-5])  # drop the '.html' suffix
        if not len(docset) % 10:
            print("loaded " + str(len(docset)) + " documents")

    # Give them to online LDA
    # `bound` is used below for an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    (gamma, bound) = olda.update_lambda(wordids, wordcts)

    # (name, text) pairs for the batch insert; materialized as a list so the
    # DB layer can take its length or iterate it more than once.
    doc_array = list(zip(docnames, docset))

    # Add a batch of docs to the DB; this is the one DB task that is not in
    # the separate DB write thread since later tasks depend on having doc ids.
    # Since writes take so long, this also balances the two threads time-wise.
    doc_ids = db.add_docs(doc_array)

    # One row per (document, topic): the raw gamma weight and the weight
    # divided by len() of the cleaned document text.
    # NOTE(review): an empty cleaned document gives doc_size == 0 here.
    doc_topic_array = []
    for d, doc_gamma in enumerate(gamma):
        doc_size = len(docset[d])
        for k, weight in enumerate(doc_gamma):
            doc_topic_array.append((doc_ids[d], k, weight, weight / doc_size))
    db.add_doc_topics(doc_topic_array)

    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print('%d:  rho_t = %f,  held-out perplexity estimate = %f' %
          (1, olda._rhot, numpy.exp(-perwordbound)))

    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.
    numpy.savetxt('lambda-%d.dat' % 1, olda._lambda)
    numpy.savetxt('gamma-%d.dat' % 1, gamma)

    # One row per (topic, term): the lambda entry normalized by its row sum.
    topic_terms_array = []
    for topic, topic_lambda in enumerate(olda._lambda):
        lambda_sum = sum(topic_lambda)
        for term, term_weight in enumerate(topic_lambda):
            topic_terms_array.append((topic, term, term_weight / lambda_sum))
    db.update_topic_terms(K, topic_terms_array)

    gc.collect()  # probably not necessary, but precautionary for long runs
    db.print_task_update()

    # The DB thread ends only when it has both run out of tasks and it has been
    # signaled that it will not be receiving any more tasks
    db.increment_batch_count()
    db.signal_end()
Exemplo n.º 2
0
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.

    Reads the vocabulary from ``./dictnostops.txt``, then repeatedly fetches
    a batch of random articles, feeds it to ``onlineldavb.OnlineLDA`` and
    stores the documents and their topic weights through the ``db`` module.
    Every tenth iteration it also writes ``lambda-<i>.dat`` and
    ``gamma-<i>.dat`` and updates the per-topic term weights in the DB.

    The optional first command-line argument is the number of iterations
    (batches) to run; without it the loop runs ``int(D / batchsize)`` times.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100

    # How many batches to run (used as the iteration count below)
    if len(sys.argv) < 2:
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary. The context manager guarantees the file is closed.
    with open('./dictnostops.txt') as vocab_file:
        vocab = vocab_file.readlines()

    # Add terms and topics to the DB
    db.init()
    db.add_terms(vocab)
    db.add_topics(K)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)

        # Word counts are needed for the per-word bound computed below
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)

        # (name, text) pairs for adding this batch of docs to the DB
        doc_array = [(name, docset[d]) for d, name in enumerate(articlenames)]

        # Add a batch of docs to the DB; this is the one DB task that is not in
        # the separate DB write thread since later tasks depend on having doc ids.
        # Since writes take so long, this also balances the two threads time-wise.
        doc_ids = db.add_docs(doc_array)

        # One row per (document, topic): the raw gamma weight and the weight
        # divided by len() of the document.
        # NOTE(review): an empty document gives doc_size == 0 here.
        doc_topic_array = []
        for d, doc_gamma in enumerate(gamma):
            doc_size = len(docset[d])
            for k, weight in enumerate(doc_gamma):
                doc_topic_array.append(
                    (doc_ids[d], k, weight, weight / doc_size))
        db.add_doc_topics(doc_topic_array)

        # Compute an estimate of held-out perplexity
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print('%d:  rho_t = %f,  held-out perplexity estimate = %f' %
              (iteration, olda._rhot, numpy.exp(-perwordbound)))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if iteration % 10 == 0:
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)

            # One row per (topic, term): the lambda entry normalized by its
            # row sum.
            topic_terms_array = []
            for topic, topic_lambda in enumerate(olda._lambda):
                lambda_sum = sum(topic_lambda)
                for term, term_weight in enumerate(topic_lambda):
                    topic_terms_array.append(
                        (topic, term, term_weight / lambda_sum))
            db.update_topic_terms(K, topic_terms_array)

            gc.collect()  # probably not necessary, but precautionary for long runs
            db.print_task_update()
        db.increment_batch_count()

    # The DB thread ends only when it has both run out of tasks and it has been
    # signaled that it will not be receiving any more tasks
    db.signal_end()
Exemplo n.º 3
0
    db.add_terms(vocab)
    print "adding",K,"topics"
    db.add_topics(K)


    # write out the final topics to the db
    print "writing out final topics to tmv db"
    for topic in range(len(olda._lambda)):
        topic_terms_array = []
        lambda_sum = sum(olda._lambda[topic])

        for term in range(len(olda._lambda[topic])):
            topic_terms_array.append((term, \
                olda._lambda[topic][term]/lambda_sum))

        db.update_topic_terms(topic, topic_terms_array)


    # do a final pass over all documents
    print "doing a final E step over all documents"
    per_time = dict()
    i = 0
    import time
    s = time.time()
    D = 1850000 #TODO: this should be read in from settings
    for filename, alltxt, title, subtitle in docgen:
        length = 0
        for word in alltxt.split():
            if word in vocab:
                length += 1