Пример #1
0
def _calc_word_genre_counts(training_coll_cls):
    """
    Tally word frequencies from the training set, both globally and per genre.

    For every object in ``training_coll_cls.objects``, each bag-of-words entry whose word is shorter
    than 100 characters:

    * increments that word's ``count`` in WordCount_training (upserting the record), and
    * is added as ``bow.<word>`` to one ``$inc`` update for the object's genre.

    That ``$inc`` update, which also bumps the genre's ``count`` by one, is applied with upsert to
    the GenreCount_training collection of the MUTUAL_INFO_DB database.

    :param training_coll_cls: training collection class; asserted to be TrainSetBow
    :return:
    """
    assert training_coll_cls == TrainSetBow

    mutual_info_db = connection.get_db(MUTUAL_INFO_DB)
    bow_field_fmt = "bow.{}"

    for index, doc in enumerate(training_coll_cls.objects):
        # progress report every 1000 training objects
        if index % 1000 == 0:
            print("count is at {}".format(index))

        genre_increments = {}
        for term, freq in doc.bow.items():
            # words of 100+ characters are left out of both tallies
            if len(term) >= 100:
                continue

            # global per-word tally
            WordCount_training.objects(word=term).update(upsert=True, inc__count=freq)

            # NOTE(review): a term containing "." would presumably be read by MongoDB as a nested
            # path here -- confirm bow keys never contain dots
            genre_increments[bow_field_fmt.format(term)] = freq

        # every training object counts once towards its genre
        genre_increments["count"] = 1
        mutual_info_db.GenreCount_training.update_one(
            {"genre": doc.short_genre}, {"$inc": genre_increments}, upsert=True
        )
Пример #2
0
def _caculate_top_X_of_each_genre(top_x=1000):
    """
    Score every word of every genre with a mutual-information measure and store the results.

    Get the bow of each genre from GenreCount_training. Use mutual information calculation:
    P(f|c)P(c)log(N*P(f|c)/f)

    Since the comparison is intraclass, we can eliminate P(c), giving P(f|c)log(N*P(f|c)/f)

    Eventually, eliminating more terms, we get f_c * log(f_c * N / f), where

    * f_c is the count of word f in class c (the value stored in the genre's bow),
    * f is the ``count`` of the word's WordCount_training record, and
    * N is the number of WordCount_training records.

    This gives a relative measure of each word for each class.

    All existing TopWordGenre and MutualInformationGenres records are deleted first. Then, for each
    genre, the top X scoring words are stored in TopWordGenre and the complete score table is stored
    in MutualInformationGenres.

    :param top_x: top X words chosen from each genre, default is 1000
    :return:
    """
    print("Removing top word genre")
    TopWordGenre.objects().delete()
    print("Removing mutualinformationgenre")
    MutualInformationGenres.objects().delete()

    total_word_count = WordCount_training.objects().count()
    for genre_count_obj in GenreCount_training.objects:
        genre = genre_count_obj.genre
        print("Current at {}".format(genre))

        mi_genre_dict = {}
        for count, (word, word_freq_genre) in enumerate(genre_count_obj.bow.items()):
            # progress report every 10000 words
            if count % 10000 == 0:
                print("Count is at {}".format(count))

            # f: the word's overall count, looked up once per word of the genre
            word_count = WordCount_training.objects.get(word=word).count

            # f_c * log(f_c * N / f)
            mi_genre_dict[word] = word_freq_genre * math.log(total_word_count * word_freq_genre / word_count)

        # highest scores first, then keep only the first top_x entries
        ranked = sorted(mi_genre_dict.items(), key=operator.itemgetter(1), reverse=True)
        top_words = dict(itertools.islice(ranked, top_x))

        # store the top scores
        TopWordGenre(genre=genre, bow=top_words).save()

        # store the complete score table as well
        MutualInformationGenres(genre=genre, bow=mi_genre_dict).save()