예제 #1
0
def make_doc(interest, dictionary):
    """Build a sparse ``(token_id, weight)`` document for an interest.

    The interest is resolved to an article id through
    ``utils.get_article_id_for_interest``. Every article returned by
    ``utils.get_article_similarity_ranks`` whose id is a key of
    ``dictionary.token2id`` contributes one entry.

    Args:
        interest: The interest to look up.
        dictionary: Object exposing a ``token2id`` mapping keyed by
            article id.

    Returns:
        ``None`` when the interest has no (truthy) article id, otherwise
        a list of ``(token_id, weight)`` tuples with
        ``weight = 1 / log2(rank + 5)``.
    """
    source_article = utils.get_article_id_for_interest(interest)
    if not source_article:
        return None

    # NOTE(review): 2000 is presumably the maximum number of similar
    # articles to fetch -- confirm against utils.
    neighbours = utils.get_article_similarity_ranks(source_article, 2000)

    # log(x) / log(2) is the base-2 logarithm, so the weight shrinks as
    # the rank value grows.
    return [
        (
            dictionary.token2id[other_article],
            1.0 / (math.log(rank + 5) / math.log(2)),
        )
        for (other_article, rank) in neighbours.items()
        if other_article in dictionary.token2id
    ]
예제 #2
0
def build_article_adjacencies(interests):
    """Map each interest's article index to its similar articles, by rank.

    Interests for which ``utils.get_article_id_for_interest`` returns a
    falsy value are skipped. For the others, the similar article ids are
    appended in ascending order of their rank value.

    Args:
        interests: Iterable of interests to process.

    Returns:
        A ``collections.defaultdict(list)`` keyed by
        ``id_to_index(article_id)`` whose values are lists of similar
        article ids. An index only appears once at least one similar
        article has been appended for it.
    """
    article_sims = collections.defaultdict(list)
    for interest in interests:
        article_id = utils.get_article_id_for_interest(interest)
        if not article_id:
            continue
        index1 = id_to_index(article_id)
        # NOTE(review): 2000 is presumably the maximum number of similar
        # articles to fetch -- confirm against utils.
        ranks = utils.get_article_similarity_ranks(article_id, 2000)
        # sorted() accepts any iterable, so this also works where
        # .items() returns a view that has no .sort() method (Python 3).
        # Both sorts are stable, so the resulting order is unchanged.
        ordered = sorted(ranks.items(), key=lambda pair: pair[1])
        for (article_id2, _rank) in ordered:
            article_sims[index1].append(article_id2)

    return article_sims
예제 #3
0
def build_article_adjacencies(interests):
    """Map each interest's article index to its similar articles, by rank.

    Interests for which ``utils.get_article_id_for_interest`` returns a
    falsy value are skipped. For the others, the similar article ids are
    appended in ascending order of their rank value.

    Args:
        interests: Iterable of interests to process.

    Returns:
        A ``collections.defaultdict(list)`` keyed by
        ``id_to_index(article_id)`` whose values are lists of similar
        article ids. An index only appears once at least one similar
        article has been appended for it.
    """
    article_sims = collections.defaultdict(list)
    for interest in interests:
        article_id = utils.get_article_id_for_interest(interest)
        if not article_id:
            continue
        index1 = id_to_index(article_id)
        # NOTE(review): 2000 is presumably the maximum number of similar
        # articles to fetch -- confirm against utils.
        ranks = utils.get_article_similarity_ranks(article_id, 2000)
        # sorted() accepts any iterable, so this also works where
        # .items() returns a view that has no .sort() method (Python 3).
        # Both sorts are stable, so the resulting order is unchanged.
        ordered = sorted(ranks.items(), key=lambda pair: pair[1])
        for (article_id2, _rank) in ordered:
            article_sims[index1].append(article_id2)

    return article_sims
예제 #4
0
def describe_lda():
    """Print the LDA topic mixture of up to 50 randomly sampled interests.

    Initialises ``utils``, loads the LDA model stored at ``svd/lda.txt``
    and, for each sampled interest that resolves to an article, writes to
    stdout the interest's topics ordered by descending score. Each topic
    line lists the names of the 10 articles with the largest weight in
    that topic.

    Returns:
        None. All output goes to ``sys.stdout``.
    """
    utils.init()
    model = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')

    def article_name(article_id):
        # Drop non-ASCII characters from the name; fall back to a
        # placeholder when the lookup returns nothing.
        name = utils.get_article_name(article_id)
        return name.encode('ascii', 'ignore') if name else 'unknown'

    # Disabled debug output: top 10 articles for 50 random topics.
#    print 'information about topics:'
#    for i in random.sample(range(model.num_topics), 50):
#        print 'topic %d:' % i
#        topic = model.state.get_lambda()[i]
#        topic = topic / topic.sum() # normalize to probability dist
#        for id in numpy.argsort(topic)[::-1][:10]:
#            score = topic[id]
#            article_id = model.id2word[id]
#            print '\t%.6f: %s' % (score, article_name(article_id))

    dictionary = model.id2word
    interests = list(utils.get_all_interests())
    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more than are available.
    sample_size = min(50, len(interests))
    for interest in random.sample(interests, sample_size):
        article_id1 = utils.get_article_id_for_interest(interest)
        if not article_id1:
            continue
        doc = make_doc(interest, dictionary)

        doc_lda = model[doc]
        # Ascending sort followed by reverse yields descending score.
        # Kept as two steps: sort(reverse=True) would order ties differently.
        doc_lda.sort(key=lambda pair: pair[1])
        doc_lda.reverse()
        sys.stdout.write('topics for %s (article %s):\n' % (interest.text, article_name(article_id1)))
        for (topic_id, topic_score) in doc_lda:
            sys.stdout.write('\t%.6f topic %d:' % (topic_score, topic_id))
            topic = model.state.get_lambda()[topic_id]
            topic = topic / topic.sum()  # normalize to probability dist
            # argsort is ascending; reverse it and keep the 10 largest.
            for word_id in numpy.argsort(topic)[::-1][:10]:
                article_id = model.id2word[word_id]
                sys.stdout.write(', ' + article_name(article_id))
            sys.stdout.write('\n')
예제 #5
0
 def build_interests_to_articles(self):
     """Record an ``(interest, article_id)`` pair for each mappable interest.

     Walks ``self.interests`` and appends a tuple to
     ``self.mapped_interests`` for every interest that
     ``utils.get_article_id_for_interest`` resolves to a truthy article
     id; the others are skipped.
     """
     for interest in self.interests:
         resolved_id = utils.get_article_id_for_interest(interest)
         if not resolved_id:
             continue
         self.mapped_interests.append((interest, resolved_id))