def mutual_information_similarity(file_name):
    """
    Rank every pair of short genres by the cosine similarity of their MI vectors.

    Each record yielded by ``MutualInformation().iterable()`` supplies a genre
    label (``short_genre``) and a word -> value mapping (``bow``). The mappings
    are vectorized with ``DictVectorizer``, the pairwise cosine similarity is
    computed, and all unordered genre pairs are written out sorted from the
    highest score to the lowest.

    :param file_name: path of the output file. It is opened in append mode
        with latin-1 encoding (unencodable characters are dropped) and gets
        one ``"g1, g2 value: score"`` line per genre pair.
    :return: None
    """
    from sklearn.metrics.pairwise import cosine_similarity as cos_sim
    import math

    SimilarityScore = collections.namedtuple("SimilarityScore", ("g1", "g2", "score"))

    mi_coll = MutualInformation()
    dv = DictVectorizer()

    # genres[i] is the label of row i of the vectorized matrix.
    genres = []

    def finite_bow(bow):
        # NaN/inf values would poison the cosine computation, so zero them out.
        cleaned = {}
        for word, value in bow.items():
            if math.isfinite(value):
                cleaned[word] = value
            else:
                print("Eliminated element")
                cleaned[word] = 0
        return cleaned

    def unique_genre_rows():
        # A repeated genre must not add a second matrix row: the labels in
        # `genres` are looked up by row index, so an extra row would shift
        # every later genre onto the wrong similarity scores.
        seen = set()
        for mi_obj in mi_coll.iterable():
            genre = mi_obj.short_genre
            if genre in seen:
                continue
            seen.add(genre)
            genres.append(genre)
            yield finite_bow(mi_obj.bow)

    bow_matrix = dv.fit_transform(unique_genre_rows())
    print("Done with making vector")

    similarity_matrix = cos_sim(bow_matrix)
    print("Done with similarity calculation")

    # One entry per unordered pair of distinct rows, best match first.
    scored_pairs = [
        SimilarityScore(genres[x], genres[y], similarity_matrix[x][y])
        for x, y in itertools.combinations(range(len(genres)), 2)
    ]
    scored_pairs.sort(key=operator.attrgetter("score"), reverse=True)

    print("printing file")
    with open(file_name, mode="a", errors="ignore", encoding="latin-1") as out:
        for pair in scored_pairs:
            out.write("{}, {} value: {}\n".format(pair.g1, pair.g2, pair.score))
def get_all_mi_and_plot(reversed=False):
    """
    Plot and save the word values of every record in MutualInformation.

    For each record the ``bow`` mapping is stripped of purely numeric keys,
    passed to ``graphics.plot_word_frequency`` and the resulting figure is
    saved under ``graphs/`` as a PDF named after the genre (prefixed with
    ``reversed_`` when ``reversed`` is truthy). The genre is printed after
    each figure is saved.

    :param reversed: forwarded to ``graphics.plot_word_frequency``; also
        selects the ``reversed_`` file-name prefix.
    :return: None
    """
    prefix = "reversed_" if reversed else ""

    for record in MutualInformation().iterable():
        genre = record["short_genre"]

        # Keys made up only of digits are left out of the plot.
        words_only = {
            word: value
            for word, value in record["bow"].items()
            if not word.isdigit()
        }

        figure = graphics.plot_word_frequency(genre, words_only, reversed=reversed)

        # "/" would be read as a directory separator in the output path.
        target = "graphs/{}.pdf".format(prefix + genre.replace("/", "_"))
        graphics.save_fig(target, figure)

        print(genre)