Exemplo n.º 1
0
    def __init__(self, word_id2tag_id):
        """Load the id mappings, create the result store and tally the counts.

        word_id2tag_id is iterated once; every item must unpack into a
        (word_id_list, tag_id_list) pair.  For each pair, every tag id has
        its entry in ``self.topic_id_title_count`` incremented, and every
        (word id, tag id) combination has
        ``self.word_topic_count[word_id][tag_id]`` incremented.

        NOTE(review): presumably each pair describes one title -- confirm
        against the caller that produces word_id2tag_id.
        """
        # tag id -> number of pairs whose tag list contains that tag.
        self.topic_id_title_count = defaultdict(int)
        # word id -> {tag id -> number of co-occurrences}.
        self.word_topic_count = defaultdict(lambda: defaultdict(int))

        # Both mappings are read from files located under DATA_DIR.
        self.tag2id = WordId().fromfile(join(DATA_DIR, 'tag2id'))
        self.word2id = WordId().fromfile(join(DATA_DIR, 'word2id'))
        self.db = DbKyoto('test.kch')

        titles_per_topic = self.topic_id_title_count
        topics_per_word = self.word_topic_count
        for words, tags in word_id2tag_id:
            for tag in tags:
                titles_per_topic[tag] += 1
                for word in words:
                    topics_per_word[word][tag] += 1
Exemplo n.º 2
0
    def __init__(self, word_id2tag_id):
        """Prepare the counters, the id mappings and the output store.

        Each item of word_id2tag_id must unpack into
        (word_id_list, tag_id_list).  The constructor walks these pairs and
        records, per tag id, how many pairs mention it
        (``self.topic_id_title_count``) and, per word id, how often it
        appears together with each tag id (``self.word_topic_count``).
        """
        tag_totals = defaultdict(int)
        word_tag_totals = defaultdict(lambda: defaultdict(int))
        self.topic_id_title_count = tag_totals
        self.word_topic_count = word_tag_totals

        # The tag and word id mappings are both loaded from DATA_DIR.
        tag_path = join(DATA_DIR, "tag2id")
        self.tag2id = WordId().fromfile(tag_path)
        word_path = join(DATA_DIR, "word2id")
        self.word2id = WordId().fromfile(word_path)
        self.db = DbKyoto("test.kch")

        for word_ids, tag_ids in word_id2tag_id:
            for tag_id in tag_ids:
                # One more pair carries this tag ...
                tag_totals[tag_id] += 1
                # ... and each of the pair's words co-occurs with it once more.
                for word_id in word_ids:
                    word_tag_totals[word_id][tag_id] += 1
Exemplo n.º 3
0
class BayesRank(object):
    """Tally word/tag co-occurrences and store per-word topic scores.

    The constructor builds two counters from the training pairs; ``rank``
    turns them into smoothed, normalised scores and writes one record per
    word through ``self.db.set``.
    """

    def __init__(self, word_id2tag_id):
        """Load the id mappings, create the result store and tally counts.

        word_id2tag_id is iterated once; every item must unpack into a
        (word_id_list, tag_id_list) pair.
        """
        # tag id -> number of pairs whose tag list contains that tag.
        topic_id_title_count = self.topic_id_title_count = defaultdict(int)
        # word id -> {tag id -> number of co-occurrences}.
        word_topic_count = self.word_topic_count = defaultdict(lambda: defaultdict(int))
        self.tag2id = WordId().fromfile(join(DATA_DIR, 'tag2id'))
        self.word2id = WordId().fromfile(join(DATA_DIR, 'word2id'))
        self.db = DbKyoto('test.kch')

        for word_id_list, tag_id_list in word_id2tag_id:
            for tag_id in tag_id_list:
                topic_id_title_count[tag_id] += 1
                for word_id in word_id_list:
                    word_topic_count[word_id][tag_id] += 1

    def rank(self):
        """Compute a score for every (word, topic) pair and store it.

        For each word, topics that occur in fewer than 10 pairs are dropped.
        The remaining topics get a raw score of
        ``(count + 1) / (topic_total + word_doc_count) - 1 / word_doc_count``,
        or ``1`` when the id comparison below matches.  Raw scores are then
        shifted by ``1 / word_doc_count`` and divided by a common
        denominator before ``self.db.set((word, scores))`` is called.

        Words with no (or a zero) entry in WORD_DOC_COUNT are skipped,
        because the smoothing term ``1 / word_doc_count`` is undefined for
        them; previously such a word aborted the whole run with a
        TypeError or ZeroDivisionError.

        Returns None.
        """
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print('Ranking')
        topic_id_title_count = self.topic_id_title_count
        word_topic_count = self.word_topic_count

        # Iterating the keys directly stays lazy on both Python 2 and 3.
        for word in word_topic_count:
            topic_count = word_topic_count[word]
            word_doc_count = WORD_DOC_COUNT.get(word)
            if not word_doc_count:
                # Missing or zero document count: nothing sensible to compute.
                continue
            x = 1 / float(word_doc_count)

            word_topic_freq = {}
            for topic_id in topic_count:
                count = topic_count[topic_id]

                # Id that word2id assigns to this topic's own name.
                word_topic_id = self.word2id.get_id_by_tag(self.tag2id.get_word_by_id(topic_id))

                topic2title = topic_id_title_count[topic_id]
                if topic2title < 10:
                    continue
                # NOTE(review): this compares a word2id id with a tag id;
                # comparing against `word` may have been intended -- confirm.
                if word_topic_id != topic_id:
                    word_topic_freq[topic_id] = (count + 1) / float(topic2title + word_doc_count) - x
                else:
                    word_topic_freq[topic_id] = 1

            total = sum(word_topic_freq.values())
            # NOTE(review): the denominator counts every topic of the word,
            # including those dropped above, so the stored scores need not
            # sum to 1 -- kept as in the original; confirm this is intended.
            denominator = total + x * len(topic_count)
            self.db.set((word, [(k, (v + x) / denominator) for k, v in word_topic_freq.items()]))
Exemplo n.º 4
0
class BayesRank(object):
    """Tally word/tag co-occurrences and store per-word topic scores.

    ``__init__`` fills two counters from the training pairs; ``rank``
    converts them into smoothed, normalised scores and hands one record per
    word to ``self.db.set``.
    """

    def __init__(self, word_id2tag_id):
        """Load the id mappings, create the result store and tally counts.

        word_id2tag_id is iterated once; every item must unpack into a
        (word_id_list, tag_id_list) pair.
        """
        # tag id -> number of pairs whose tag list contains that tag.
        topic_id_title_count = self.topic_id_title_count = defaultdict(int)
        # word id -> {tag id -> number of co-occurrences}.
        word_topic_count = self.word_topic_count = defaultdict(lambda: defaultdict(int))
        self.tag2id = WordId().fromfile(join(DATA_DIR, "tag2id"))
        self.word2id = WordId().fromfile(join(DATA_DIR, "word2id"))
        self.db = DbKyoto("test.kch")

        for word_id_list, tag_id_list in word_id2tag_id:
            for tag_id in tag_id_list:
                topic_id_title_count[tag_id] += 1
                for word_id in word_id_list:
                    word_topic_count[word_id][tag_id] += 1

    def rank(self):
        """Compute a score for every (word, topic) pair and store it.

        Per word, topics seen in fewer than 10 pairs are ignored.  Each
        remaining topic receives the raw score
        ``(count + 1) / (topic_total + word_doc_count) - 1 / word_doc_count``,
        or ``1`` when the id comparison below matches.  The raw scores are
        shifted by ``1 / word_doc_count``, divided by a shared denominator
        and passed to ``self.db.set((word, scores))``.

        A word whose WORD_DOC_COUNT entry is missing or zero is skipped:
        the smoothing term cannot be computed for it, and it used to abort
        the entire run with a TypeError or ZeroDivisionError.

        Returns None.
        """
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print("Ranking")
        topic_id_title_count = self.topic_id_title_count
        word_topic_count = self.word_topic_count

        # Iterating the keys directly stays lazy on both Python 2 and 3.
        for word in word_topic_count:
            topic_count = word_topic_count[word]
            word_doc_count = WORD_DOC_COUNT.get(word)
            if not word_doc_count:
                # Missing or zero document count: nothing sensible to compute.
                continue
            x = 1 / float(word_doc_count)

            word_topic_freq = {}
            for topic_id in topic_count:
                count = topic_count[topic_id]

                # Id that word2id assigns to this topic's own name.
                word_topic_id = self.word2id.get_id_by_tag(self.tag2id.get_word_by_id(topic_id))

                topic2title = topic_id_title_count[topic_id]
                if topic2title < 10:
                    continue
                # NOTE(review): this compares a word2id id with a tag id;
                # comparing against `word` may have been intended -- confirm.
                if word_topic_id != topic_id:
                    word_topic_freq[topic_id] = (count + 1) / float(topic2title + word_doc_count) - x
                else:
                    word_topic_freq[topic_id] = 1

            total = sum(word_topic_freq.values())
            # NOTE(review): the denominator counts every topic of the word,
            # including those dropped above, so the stored scores need not
            # sum to 1 -- kept as in the original; confirm this is intended.
            denominator = total + x * len(topic_count)
            self.db.set((word, [(k, (v + x) / denominator) for k, v in word_topic_freq.items()]))