def __init__(self, word_id2tag_id):
    """Tally tag and word/tag occurrences from ``word_id2tag_id``.

    ``word_id2tag_id`` is iterated once; every item is unpacked as a
    ``(word_id_list, tag_id_list)`` pair.  For each tag id in a pair the
    tag's entry in ``self.topic_id_title_count`` is incremented, and for
    each word id of the same pair the entry
    ``self.word_topic_count[word_id][tag_id]`` is incremented.

    The constructor also loads the ``tag2id`` and ``word2id`` tables from
    ``DATA_DIR`` through ``WordId().fromfile`` and creates the
    ``DbKyoto('test.kch')`` handle stored in ``self.db``.
    """
    self.topic_id_title_count = defaultdict(int)
    self.word_topic_count = defaultdict(lambda: defaultdict(int))

    self.tag2id = WordId().fromfile(join(DATA_DIR, 'tag2id'))
    self.word2id = WordId().fromfile(join(DATA_DIR, 'word2id'))
    self.db = DbKyoto('test.kch')

    # Local aliases keep the counting loops free of attribute lookups.
    per_tag = self.topic_id_title_count
    per_word = self.word_topic_count

    for words, tags in word_id2tag_id:
        for tag in tags:
            per_tag[tag] += 1
            for word in words:
                per_word[word][tag] += 1
def __init__(self, word_id2tag_id):
    """Tally tag and word/tag occurrences from ``word_id2tag_id``.

    ``word_id2tag_id`` is iterated once; every item is unpacked as a
    ``(word_id_list, tag_id_list)`` pair.  For each tag id in a pair the
    tag's entry in ``self.topic_id_title_count`` is incremented, and for
    each word id of the same pair the entry
    ``self.word_topic_count[word_id][tag_id]`` is incremented.

    The constructor also loads the ``tag2id`` and ``word2id`` tables from
    ``DATA_DIR`` through ``WordId().fromfile`` and creates the
    ``DbKyoto("test.kch")`` handle stored in ``self.db``.
    """
    self.topic_id_title_count = defaultdict(int)
    self.word_topic_count = defaultdict(lambda: defaultdict(int))

    self.tag2id = WordId().fromfile(join(DATA_DIR, "tag2id"))
    self.word2id = WordId().fromfile(join(DATA_DIR, "word2id"))
    self.db = DbKyoto("test.kch")

    # Local aliases keep the counting loops free of attribute lookups.
    tag_totals = self.topic_id_title_count
    pair_totals = self.word_topic_count

    for words, tags in word_id2tag_id:
        for tag in tags:
            tag_totals[tag] += 1
            for word in words:
                pair_totals[word][tag] += 1
class BayesRank(object):
    """Count word/tag co-occurrences and store per-word topic scores.

    ``__init__`` builds the counts; ``rank`` turns them into normalised
    scores and writes one record per word through ``self.db.set``.
    """

    def __init__(self, word_id2tag_id):
        """Tally tag and word/tag occurrences from ``word_id2tag_id``.

        ``word_id2tag_id`` is iterated once; every item is unpacked as a
        ``(word_id_list, tag_id_list)`` pair.  Each tag id increments
        ``self.topic_id_title_count[tag_id]``, and each word id of the same
        pair increments ``self.word_topic_count[word_id][tag_id]``.

        Also loads the ``tag2id`` / ``word2id`` tables from ``DATA_DIR`` and
        creates the ``DbKyoto('test.kch')`` handle stored in ``self.db``.
        """
        topic_id_title_count = self.topic_id_title_count = defaultdict(int)
        word_topic_count = self.word_topic_count = defaultdict(lambda: defaultdict(int))

        self.tag2id = WordId().fromfile(join(DATA_DIR, 'tag2id'))
        self.word2id = WordId().fromfile(join(DATA_DIR, 'word2id'))
        self.db = DbKyoto('test.kch')

        for word_id_list, tag_id_list in word_id2tag_id:
            for tag_id in tag_id_list:
                topic_id_title_count[tag_id] += 1
                for word_id in word_id_list:
                    word_topic_count[word_id][tag_id] += 1

    def rank(self):
        """Compute a normalised score per (word, topic) and store it.

        For every word in ``self.word_topic_count``:

        * the word's document count is read from ``WORD_DOC_COUNT``; words
          whose count is missing or zero are skipped, because the smoothing
          term ``1 / word_doc_count`` cannot be computed for them
          (previously this raised ``TypeError`` / ``ZeroDivisionError`` and
          aborted the whole run);
        * topics whose total in ``self.topic_id_title_count`` is below 10
          are ignored;
        * the remaining topics get a raw score, which is then normalised
          and written as ``self.db.set((word, [(topic_id, score), ...]))``.

        Returns ``None``; the only outputs are the ``db.set`` calls and the
        ``Ranking`` line printed at the start.
        """
        print('Ranking')
        topic_id_title_count = self.topic_id_title_count
        word_topic_count = self.word_topic_count

        for word, topic_count in word_topic_count.iteritems():
            word_doc_count = WORD_DOC_COUNT.get(word)
            if not word_doc_count:
                # No usable document count: 1 / word_doc_count is undefined.
                continue
            x = 1 / float(word_doc_count)

            word_topic_freq = {}
            for topic_id, count in topic_count.iteritems():
                # Word id registered for this topic's own name.
                # NOTE(review): the result is compared with topic_id, not
                # with `word` -- confirm that this is the intended check.
                word_topic_id = self.word2id.get_id_by_tag(
                    self.tag2id.get_word_by_id(topic_id)
                )
                topic2title = topic_id_title_count[topic_id]
                if topic2title < 10:
                    continue
                if word_topic_id != topic_id:
                    word_topic_freq[topic_id] = (
                        (count + 1) / float(topic2title + word_doc_count) - x
                    )
                else:
                    word_topic_freq[topic_id] = 1

            # The denominator is the same for every topic of this word.  It
            # uses len(topic_count), i.e. it also counts the topics that
            # were skipped above, exactly as the original expression did.
            total = sum(word_topic_freq.itervalues())
            denominator = total + x * len(topic_count)
            self.db.set((
                word,
                [(k, (v + x) / denominator) for k, v in word_topic_freq.iteritems()],
            ))
class BayesRank(object):
    """Count word/tag co-occurrences and store per-word topic scores.

    ``__init__`` builds the counts; ``rank`` turns them into normalised
    scores and writes one record per word through ``self.db.set``.
    """

    def __init__(self, word_id2tag_id):
        """Tally tag and word/tag occurrences from ``word_id2tag_id``.

        ``word_id2tag_id`` is iterated once; every item is unpacked as a
        ``(word_id_list, tag_id_list)`` pair.  Each tag id increments
        ``self.topic_id_title_count[tag_id]``, and each word id of the same
        pair increments ``self.word_topic_count[word_id][tag_id]``.

        Also loads the ``tag2id`` / ``word2id`` tables from ``DATA_DIR`` and
        creates the ``DbKyoto("test.kch")`` handle stored in ``self.db``.
        """
        topic_id_title_count = self.topic_id_title_count = defaultdict(int)
        word_topic_count = self.word_topic_count = defaultdict(lambda: defaultdict(int))

        self.tag2id = WordId().fromfile(join(DATA_DIR, "tag2id"))
        self.word2id = WordId().fromfile(join(DATA_DIR, "word2id"))
        self.db = DbKyoto("test.kch")

        for word_id_list, tag_id_list in word_id2tag_id:
            for tag_id in tag_id_list:
                topic_id_title_count[tag_id] += 1
                for word_id in word_id_list:
                    word_topic_count[word_id][tag_id] += 1

    def rank(self):
        """Compute a normalised score per (word, topic) and store it.

        For every word in ``self.word_topic_count``:

        * the word's document count is read from ``WORD_DOC_COUNT``; words
          whose count is missing or zero are skipped, because the smoothing
          term ``1 / word_doc_count`` cannot be computed for them
          (previously this raised ``TypeError`` / ``ZeroDivisionError`` and
          aborted the whole run);
        * topics whose total in ``self.topic_id_title_count`` is below 10
          are ignored;
        * the remaining topics get a raw score, which is then normalised
          and written as ``self.db.set((word, [(topic_id, score), ...]))``.

        Returns ``None``; the only outputs are the ``db.set`` calls and the
        ``Ranking`` line printed at the start.
        """
        print("Ranking")
        topic_id_title_count = self.topic_id_title_count
        word_topic_count = self.word_topic_count

        for word, topic_count in word_topic_count.iteritems():
            word_doc_count = WORD_DOC_COUNT.get(word)
            if not word_doc_count:
                # No usable document count: 1 / word_doc_count is undefined.
                continue
            x = 1 / float(word_doc_count)

            word_topic_freq = {}
            for topic_id, count in topic_count.iteritems():
                # Word id registered for this topic's own name.
                # NOTE(review): the result is compared with topic_id, not
                # with `word` -- confirm that this is the intended check.
                word_topic_id = self.word2id.get_id_by_tag(
                    self.tag2id.get_word_by_id(topic_id)
                )
                topic2title = topic_id_title_count[topic_id]
                if topic2title < 10:
                    continue
                if word_topic_id != topic_id:
                    word_topic_freq[topic_id] = (
                        (count + 1) / float(topic2title + word_doc_count) - x
                    )
                else:
                    word_topic_freq[topic_id] = 1

            # The denominator is the same for every topic of this word.  It
            # uses len(topic_count), i.e. it also counts the topics that
            # were skipped above, exactly as the original expression did.
            total = sum(word_topic_freq.itervalues())
            denominator = total + x * len(topic_count)
            self.db.set(
                (
                    word,
                    [(k, (v + x) / denominator) for k, v in word_topic_freq.iteritems()],
                )
            )