Пример #1
0
def make_corpus():
    """Build a ``MyCorpus`` over all interests and run its pipeline.

    After initializing ``utils``, the full list of interests is handed to
    ``MyCorpus`` and its three steps are invoked in order:
    ``build_interests_to_articles``, ``build_dict`` and ``write_corpus``.
    """
    utils.init()
    builder = MyCorpus(list(utils.get_all_interests()))

    # The call order is kept as-is; presumably each step relies on state
    # prepared by the previous one.
    builder.build_interests_to_articles()
    builder.build_dict()
    builder.write_corpus()
Пример #2
0
def main():
    """Compute the article adjacency matrix for all interests and write it.

    Initializes ``utils``, passes every interest to
    ``build_article_adjacencies``, hands the result to ``write_matrix`` and
    finally calls ``write_ids_to_indexes``.
    """
    utils.init()

    # For a quick run on a subset, use this instead:
    #   set(list(utils.get_all_interests())[:50])
    all_interests = utils.get_all_interests()

    adjacency = build_article_adjacencies(all_interests)
    write_matrix(adjacency)
    write_ids_to_indexes()
Пример #3
0
def main():
    """Write the article adjacency matrix built from every interest.

    The steps are: initialize ``utils``, fetch all interests, build the
    matrix with ``build_article_adjacencies``, then call ``write_matrix`` on
    it followed by ``write_ids_to_indexes``.
    """
    utils.init()

    # Debugging aid: restrict the run to the first 50 interests with
    #   set(list(utils.get_all_interests())[:50])
    matrix = build_article_adjacencies(utils.get_all_interests())

    write_matrix(matrix)
    write_ids_to_indexes()
Пример #4
0
def print_interest_subclusters():
    """Print each interest followed by the contents of its graph's 'map'.

    For every interest, ``make_interest_graph`` is called and the ``'map'``
    entry of its result is dumped: each key on its own line, indented by two
    tabs and followed by a colon, and each element stored under that key on
    a line indented by three tabs. A blank line follows every interest.
    """
    for interest in utils.get_all_interests():
        print(interest)
        cluster_map = make_interest_graph(interest)['map']
        for label in cluster_map:
            print('\t\t%s:' % label)
            for member in cluster_map[label]:
                print('\t\t\t%s' % member)
        # Blank separator line. print('') emits the same single newline as a
        # bare print statement, whereas print() would print "()" on Python 2.
        print('')
Пример #5
0
def test_sample_interest_graph():
    """Run ``make_full_interest_graph`` on a random sample of interests.

    At most 100 interests are sampled. Each one is introduced by an
    80-character ``=`` rule, a blank line and a "results for" header, and is
    followed by three blank lines.
    """
    # random.sample raises ValueError when asked for more items than the
    # population holds, so the interests are turned into a list and the
    # sample size is capped at the number available.
    population = list(utils.get_all_interests())
    sample_size = min(100, len(population))

    for interest in random.sample(population, sample_size):
        print("=" * 80)
        print('')
        # The double space after "for" keeps the header text unchanged from
        # what this function has always printed.
        print("results for  %s" % (interest,))
        make_full_interest_graph(interest)
        # Three blank lines: two embedded newlines plus print's own.
        print('\n\n')
Пример #6
0
def test_sample_interest_graph():
    """Call ``make_full_interest_graph`` for up to 100 randomly chosen interests.

    Output per interest: an 80-character ``=`` rule, a blank line, a
    "results for" header, whatever ``make_full_interest_graph`` prints, and
    then three blank lines.
    """
    # Build a list first and never request more items than exist:
    # random.sample raises ValueError when the sample size exceeds the
    # population size.
    candidates = list(utils.get_all_interests())
    chosen = random.sample(candidates, min(100, len(candidates)))

    for interest in chosen:
        print('=' * 80)
        print('')
        # Header text is unchanged, including the double space after "for".
        print('results for  %s' % (interest,))
        make_full_interest_graph(interest)
        # Two embedded newlines plus print's own newline give three blank lines.
        print('\n\n')
Пример #7
0
def print_interest_subclusters():
    """Dump the 'map' of every interest's graph to stdout.

    Each interest is printed on its own line. The ``'map'`` entry returned
    by ``make_interest_graph`` is then listed beneath it: keys are indented
    by two tabs and end with a colon, and the elements under each key are
    indented by three tabs. One blank line separates consecutive interests.
    """
    for interest in utils.get_all_interests():
        print(interest)
        groups = make_interest_graph(interest)['map']
        for group_key in groups:
            print('\t\t%s:' % group_key)
            for entry in groups[group_key]:
                print('\t\t\t%s' % entry)
        # print('') writes just a newline, the same as a bare print
        # statement; print() would print "()" on Python 2.
        print('')
Пример #8
0
def describe_lda():
    """Print the LDA topics of a random sample of interests.

    Loads the LDA model from ``svd/lda.txt`` and samples up to 50 interests.
    Interests without an article id are skipped. For each remaining interest
    this writes to stdout a header line, then one line per topic (highest
    score first) holding the topic score, the topic id and the names of the
    topic's 10 highest-weighted articles.
    """
    utils.init()
    model = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')

    def article_name(article_id):
        """Return the article's name reduced to ASCII, or 'unknown' if it has none."""
        name = utils.get_article_name(article_id)
        return name.encode('ascii', 'ignore') if name else 'unknown'

    # Disabled debugging aid: dump the top articles of 50 random topics.
    #    print 'information about topics:'
    #    for i in random.sample(range(model.num_topics), 50):
    #        print 'topic %d:' % i
    #        topic = model.state.get_lambda()[i]
    #        topic = topic / topic.sum() # normalize to probability dist
    #        for term_id in numpy.argsort(topic)[::-1][:10]:
    #            score = topic[term_id]
    #            article_id = model.id2word[term_id]
    #            print '\t%.6f: %s' % (score, article_name(article_id))

    dictionary = model.id2word
    # Nothing in this function updates the model, so the topic/term matrix
    # is fetched once rather than once per printed topic.
    topic_term = model.state.get_lambda()

    interests = list(utils.get_all_interests())
    # random.sample raises ValueError when asked for more items than exist,
    # so the sample size is capped at the number of interests.
    for interest in random.sample(interests, min(50, len(interests))):
        interest_article_id = utils.get_article_id_for_interest(interest)
        if not interest_article_id:
            continue
        doc = make_doc(interest, dictionary)

        doc_lda = model[doc]
        # Highest-scoring topics first.
        doc_lda.sort(key=lambda pair: pair[1], reverse=True)
        sys.stdout.write('topics for %s (article %s):\n'
                         % (interest.text, article_name(interest_article_id)))
        for (topic_id, topic_score) in doc_lda:
            sys.stdout.write('\t%.6f topic %d:' % (topic_score, topic_id))
            weights = topic_term[topic_id]
            weights = weights / weights.sum()  # normalize to probability dist
            # argsort is ascending: reverse it and keep the 10 largest weights.
            for term_id in numpy.argsort(weights)[::-1][:10]:
                sys.stdout.write(', ' + article_name(dictionary[term_id]))
            sys.stdout.write('\n')
Пример #9
0
import collections
import logging
import math
import pymongo
import random
import re
import sys

import users
import utils

logging.basicConfig(level=logging.INFO)

utils.init()

# Pairs whose similarity is below this value are not printed.
MIN_SIMILARITY = 0.003

# To experiment on a subset, use instead:
#   interests = set(list(utils.get_all_interests())[:250])
interests = utils.get_all_interests()
sims = utils.get_correlation_matrix5(interests)

# One output line per (interest, interest) pair that reaches the threshold:
# "<id>=<text> <id>=<text> <similarity>".
for source, neighbours in sims.items():
    for target, score in neighbours.items():
        if score >= MIN_SIMILARITY:
            print('%s=%s %s=%s %s' % (
                source.id, source.text, target.id, target.text, score))
Пример #10
0
import collections
import logging
import math
import pymongo
import random
import re
import sys


import users
import utils

logging.basicConfig(level=logging.INFO)

utils.init()

# Minimum similarity a pair needs in order to be reported.
SIMILARITY_CUTOFF = 0.003

# For a smaller trial run, use instead:
#   interests = set(list(utils.get_all_interests())[:250])
interests = utils.get_all_interests()
sims = utils.get_correlation_matrix5(interests)

# Report every pair at or above the cutoff as
# "<id>=<text> <id>=<text> <similarity>".
for left, row in sims.items():
    for right, similarity in row.items():
        if similarity >= SIMILARITY_CUTOFF:
            fields = (left.id, left.text, right.id, right.text, similarity)
            print('%s=%s %s=%s %s' % fields)