def _calc_word_genre_counts(training_coll_cls):
    """
    Accumulate per-word and per-genre word counts from the training set.

    For every object in ``training_coll_cls.objects`` this:

    * increments ``count`` on the ``WordCount_training`` document of each
      word in the object's ``bow`` (creating the document if needed), and
    * increments, in the raw ``GenreCount_training`` collection of the
      ``MUTUAL_INFO_DB`` database, the ``bow.<word>`` fields and the
      ``count`` field of the document keyed by the object's
      ``short_genre`` (again creating it if needed).

    Words of 100 characters or more are skipped in both places.

    :param training_coll_cls: collection class to read from; must be
        ``TrainSetBow`` (enforced with an ``assert``).
    :return: None
    """
    assert training_coll_cls == TrainSetBow

    mutual_info_db = connection.get_db(MUTUAL_INFO_DB)
    bow_field = "bow.{}"

    for seen, doc in enumerate(training_coll_cls.objects):
        # progress report every 1000 training objects
        if seen % 1000 == 0:
            print("count is at {}".format(seen))

        doc_bow = doc.bow
        increments = {}

        for token, occurrences in doc_bow.items():
            # overly long tokens are ignored entirely
            if len(token) >= 100:
                continue

            # global (genre-independent) count of this word
            WordCount_training.objects(word=token).update(
                upsert=True, inc__count=occurrences
            )
            # per-genre count, addressed as a dotted sub-field of "bow"
            increments[bow_field.format(token)] = occurrences

        # one more training object seen for this genre
        increments["count"] = 1

        mutual_info_db.GenreCount_training.update_one(
            {"genre": doc.short_genre},
            {"$inc": increments},
            upsert=True,
        )
def _caculate_top_X_of_each_genre(top_x=1000):
    """
    Score every word of every genre and store the results.

    The score is derived from the mutual information term
    P(f|c)P(c)log(N*P(f|c)/f). Because words are only compared within one
    class, P(c) is dropped, and after eliminating further constant terms
    the value actually computed is::

        f_c * log(N * f_c / f)

    where ``f_c`` is the count of the word in the genre's ``bow``, ``f`` is
    the ``count`` of the word's ``WordCount_training`` document, and ``N``
    is the number of documents in ``WordCount_training``.

    All existing ``TopWordGenre`` and ``MutualInformationGenres`` documents
    are deleted first. Then, for each ``GenreCount_training`` document, the
    ``top_x`` highest-scoring words are saved as a ``TopWordGenre`` and the
    complete score mapping is saved as a ``MutualInformationGenres``.

    :param top_x: number of top-scoring words kept per genre, default 1000
    :return: None
    """
    print("Removing top word genre")
    TopWordGenre.objects().delete()

    print("Removing mutualinformationgenre")
    MutualInformationGenres.objects().delete()

    # N in the formula above
    vocabulary_size = WordCount_training.objects().count()

    for genre_doc in GenreCount_training.objects:
        print("Current at {}".format(genre_doc.genre))

        genre_bow = genre_doc.bow
        scores = {}

        for position, (token, freq_in_genre) in enumerate(genre_bow.items()):
            # progress report every 10000 words
            if position % 10000 == 0:
                print("Count is at {}".format(position))

            # overall count of the word across all genres
            overall_freq = WordCount_training.objects.get(word=token).count

            ratio = vocabulary_size * freq_in_genre / overall_freq
            scores[token] = freq_in_genre * math.log(ratio)

        # highest score first, then keep only the first top_x entries
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        best_words = dict(itertools.islice(ranked, 0, top_x))

        # the top-scoring subset
        TopWordGenre(genre=genre_doc.genre, bow=best_words).save()

        # the complete score table
        MutualInformationGenres(genre=genre_doc.genre, bow=scores).save()