예제 #1
0
def mutual_information_similarity(file_name):
    """
    Rank every pair of short genres by the cosine similarity of their MI vectors.

    Each record yielded by ``MutualInformation().iterable()`` contributes one
    row: its ``bow`` mapping (presumably word -> mutual-information value) is
    vectorized with ``DictVectorizer`` after NaN and infinite values have been
    replaced by 0. The pairwise cosine similarities of those rows are sorted
    in descending order and written to ``file_name``, one line per pair, as
    ``"<g1>, <g2> value: <score>"``.

    :param file_name: path of the output file. It is opened in append mode
        with latin-1 encoding; characters that cannot be encoded are dropped.
    :return: None
    """
    from sklearn.metrics.pairwise import cosine_similarity as cos_sim
    import math

    SimilarityScore = collections.namedtuple("SimilarityScore", ("g1", "g2", "score"))  # a type

    mi_coll = MutualInformation()

    # genres[i] is the label of row i of bow_matrix. One entry is recorded per
    # record (no de-duplication) so that labels and matrix rows stay aligned
    # even if a short genre appears more than once in the collection.
    genres = []

    dv = DictVectorizer()

    def extract_bow_add_to_genres(genre, bow):
        """Record *genre* as the next row label and return a NaN/inf-free copy of *bow*."""
        genres.append(genre)

        new_bow = {}
        for k, curr in bow.items():
            if math.isnan(curr) or math.isinf(curr):
                # Non-finite values would poison the cosine similarity.
                new_bow[k] = 0
                print("Eliminated element")
            else:
                new_bow[k] = curr

        return new_bow

    # The generator is consumed by fit_transform, which fills `genres` as a
    # side effect in the same order as the rows of the resulting matrix.
    bow_matrix = dv.fit_transform(
        extract_bow_add_to_genres(mi_obj.short_genre, mi_obj.bow) for mi_obj in mi_coll.iterable()
    )

    print("Done with making vector")
    similarity_matrix = cos_sim(bow_matrix)

    print("Done with similarity calculation")
    # every unordered pair of rows, without repeats
    scores = [
        SimilarityScore(genres[x], genres[y], similarity_matrix[x][y])
        for x, y in itertools.combinations(range(len(genres)), 2)
    ]
    # most similar pairs first
    scores.sort(key=operator.itemgetter(2), reverse=True)

    print("printing file")
    with open(file_name, mode="a", errors="ignore", encoding="latin-1") as file:
        for entry in scores:
            file.write("{}, {} value: {}\n".format(entry.g1, entry.g2, entry.score))
예제 #2
0
def get_all_mi_and_plot(reversed=False):
    """
    Plot and save one graph per record of the MutualInformation collection.

    For every record, the entries of its ``bow`` mapping whose key consists
    only of digits are dropped, the remainder is handed to
    ``graphics.plot_word_frequency`` and the result is saved through
    ``graphics.save_fig`` as ``graphs/<genre>.pdf`` (``/`` in the genre is
    replaced by ``_``). The genre is printed after each save.

    :param reversed: forwarded to ``graphics.plot_word_frequency``; when
        truthy the output file name is additionally prefixed with ``reversed_``.
    :return: None
    """
    name_prefix = "reversed_" if reversed else ""
    collection = MutualInformation()

    for record in collection.iterable():
        short_genre = record["short_genre"]
        raw_bow = record["bow"]

        # keep only entries whose key is not purely numeric
        non_numeric_bow = {
            word: value for word, value in raw_bow.items() if not word.isdigit()
        }

        figure = graphics.plot_word_frequency(short_genre, non_numeric_bow, reversed=reversed)

        # "/" would be read as a directory separator in the output path
        target = "graphs/{}.pdf".format(name_prefix + short_genre.replace("/", "_"))
        graphics.save_fig(target, figure)

        print(short_genre)