def compute_distances(dataframe, period, gir_dict, mont_dict, gir_mont_diff):
    """Tabulate cosine distances from each tf-idf vector to the group vectors.

    Args:
        dataframe: For ``period`` in ``'month'``, ``'date'`` or ``'speaker'``
            an object indexable by column name that provides the label
            column (``'Year-Month'``, ``'Date'`` or ``'Speaker'``) and a
            ``'tfidf'`` column.  For any other ``period`` it is iterated
            directly and each item is treated as one tf-idf vector.
        period: Selects the label column; also used as the name of the
            first column of the result.
        gir_dict: Vector compared against every item ("distance to gir").
        mont_dict: Vector compared against every item ("distance to mont").
        gir_mont_diff: Vector compared against every item
            ("distance to diff").

    Returns:
        A DataFrame with the columns ``period``, ``'distance to gir'``,
        ``'distance to mont'`` and ``'distance to diff'``.  The first two
        distances are ``1 - cosine_similarity``; the last column holds the
        raw cosine similarity.
    """
    # Pick the column that labels each row; None means the fixed periods.
    if period == 'month':
        label_column = 'Year-Month'
    elif period == 'date':
        label_column = 'Date'
    elif period == 'speaker':
        label_column = 'Speaker'
    else:
        label_column = None

    if label_column is None:
        labels = pd.Series(
            ["Before convention", "Convention", "After convention"])
        # In this mode the input is iterated as-is, so it has to be a plain
        # sequence of tf-idf vectors rather than a pandas dataframe.
        vectors = dataframe
    else:
        labels = pd.Series(dataframe[label_column].tolist())
        vectors = dataframe['tfidf'].tolist()

    # One (gir, mont, diff) triple per vector, in iteration order.
    triples = []
    for raw_vector in vectors:
        candidate = convert_keys_to_string(raw_vector)
        if len(candidate) > 0:
            to_gir = 1 - cosine_similarity(gir_dict, candidate)
            to_mont = 1 - cosine_similarity(mont_dict, candidate)
            to_diff = cosine_similarity(gir_mont_diff, candidate)
            triples.append((to_gir, to_mont, to_diff))
        else:
            # An empty vector (a speaker with no tf-idf scores was seen in
            # the data) gets the default values instead of a computation.
            triples.append((1, 1, 0))

    # Each series becomes one row; transposing turns them into columns.
    result = pd.DataFrame([
        labels,
        pd.Series([triple[0] for triple in triples]),
        pd.Series([triple[1] for triple in triples]),
        pd.Series([triple[2] for triple in triples]),
    ]).transpose()
    result.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff'
    ]
    return result
def distance_analysis():
    """Write every other speaker's cosine distance to Robespierre to a CSV.

    Reads the per-speaker counts from ``byspeaker.pickle``, keeps only the
    entries whose count is at least 2, converts them with ``compute_tfidf``
    (using the module-level ``num_speeches`` and ``doc_freq``) and writes one
    ``speaker, distance`` row per speaker to
    ``dist_to_robespierre_withlimit.csv``, where the distance is
    ``1 - cosine_similarity`` against Robespierre's vector.

    Raises:
        KeyError: If the pickle has no entry under the reference name.
    """
    reference_name = "Maximilien-Francois-Marie-Isidore-Joseph de Robespierre"
    # The reference vector is fetched under the unaccented spelling, so that
    # spelling has to be skipped to keep Robespierre out of his own
    # comparison.  The accented spelling is skipped as well in case the data
    # carries it.
    skipped_names = (
        reference_name,
        "Maximilien-François-Marie-Isidore-Joseph de Robespierre",
    )

    with open("byspeaker.pickle", "rb") as handle:
        by_speaker = pickle.load(handle)

    speaker_tfidf = {}
    for speaker in by_speaker:
        counter = convert_keys_to_string(by_speaker[speaker])
        # A threshold of 3 was tried, but some speakers never use any bigram
        # that often, so 2 is used instead.
        freq = {k: v for k, v in counter.items() if v >= 2}
        speaker_tfidf[speaker] = compute_tfidf(freq, num_speeches, doc_freq)

    robespierre = speaker_tfidf[reference_name]

    speaker_dist = {}
    for speaker in speaker_tfidf:
        if str(speaker) in skipped_names:
            continue
        print(speaker_tfidf[speaker])
        speaker_dist[speaker] = 1 - cosine_similarity(
            robespierre, speaker_tfidf[speaker])

    # The context manager flushes and closes the file once all rows are out.
    with open("dist_to_robespierre_withlimit.csv", "w") as out_file:
        writer = csv.writer(out_file)
        for key, val in speaker_dist.items():
            writer.writerow([key, val])
def compute_distances(dataframe, period, gir_dict, mont_dict, gir_mont_diff):
    """Tabulate cosine distances from each entry to the group vectors.

    The meaning of ``dataframe`` depends on ``period``:

    * ``'speaker'``: a mapping keyed by speaker.  For every speaker the group
      totals are reloaded from ``Girondins.pickle`` / ``Montagnards.pickle``,
      the speaker's own value is subtracted from the total of the party
      listed for them, and ``gir_dict``, ``mont_dict`` and ``gir_mont_diff``
      are recomputed from those totals.  The passed-in values of these three
      arguments are therefore not used in this mode.  The recomputed
      difference vector is also written to ``gir_mont_diff.csv`` (overwritten
      for every speaker).
    * ``'aggregation'``: a mapping whose values are compared as they are.
    * anything else: an iterable whose items are compared as they are; the
      rows are labelled "Before convention", "Convention" and
      "After convention".

    Args:
        dataframe: Mapping or iterable of vectors, see above.
        period: Mode selector; also the name of the first result column.
        gir_dict: Vector for the "distance to gir" column.
        mont_dict: Vector for the "distance to mont" column.
        gir_mont_diff: Vector for the "distance to diff" column.

    Returns:
        A DataFrame with the columns ``period``, ``'distance to gir'``,
        ``'distance to mont'`` and ``'distance to diff'``.  The first two
        distances are ``1 - cosine_similarity``; the last column holds the
        raw cosine similarity.
    """
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = pd.Series(list(dataframe.keys()))
    else:
        period_vector = pd.Series(
            ["Before convention", "Convention", "After convention"])

    # Party lookup table.  It is only read, so it is loaded on first use in
    # 'speaker' mode and then shared by all remaining speakers.
    speakers_to_analyze = None

    gir_dist = []
    mont_dist = []
    gir_mont_diff_dist = []

    for element in dataframe:
        if period == 'speaker':
            # The group totals are reloaded for every speaker so that each
            # one starts from the stored totals.
            # NOTE(review): these loads could be hoisted out of the loop if
            # compute_tfidf is known not to modify its input - confirm.
            with open("Girondins.pickle", "rb") as handle:
                gir = pickle.load(handle)
            with open("Montagnards.pickle", "rb") as handle:
                mont = pickle.load(handle)

            if speakers_to_analyze is None:
                speakers_to_analyze = load_list(
                    "Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]
            print(element)
            print(party)
            print(type(dataframe[element]))

            # Take the speaker out of their own party's total so they are
            # not compared against a vector that contains their own data.
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                mont = mont - dataframe[element]

            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)

            # Rewritten on every iteration; after the loop the file holds
            # the difference vector computed for the last speaker.
            with open("gir_mont_diff.csv", "w") as diff_file:
                writer = csv.writer(diff_file)
                for key, val in gir_mont_diff.items():
                    writer.writerow([key, val])

            tfidf_speaker = compute_tfidf(
                dataframe[element], num_speeches, doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)

        # An empty vector (a speaker with no tf-idf scores was seen in the
        # data) gets the default values instead of a computation.
        if len(to_compare) > 0:
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Each series becomes one row; transposing turns them into columns.
    comp_df = pd.DataFrame([
        period_vector,
        pd.Series(gir_dist),
        pd.Series(mont_dist),
        pd.Series(gir_mont_diff_dist),
    ]).transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff'
    ]
    return comp_df
def compute_distances(dataframe, period, gir_dict, mont_dict, plein_dict,
                      gir_mont_diff):
    """Tabulate cosine distances from each entry to the group vectors.

    The meaning of ``dataframe`` depends on ``period``:

    * ``'speaker'``: a mapping keyed by speaker.  For every speaker the group
      totals are reloaded from ``Girondins_withplein.pickle`` /
      ``Montagnards_withplein.pickle``, the speaker's own value is subtracted
      from the total of the party listed for them, and ``gir_dict``,
      ``mont_dict`` and ``gir_mont_diff`` are recomputed from those totals.
      The passed-in values of these three arguments are therefore not used
      in this mode; ``plein_dict`` is always used as passed.
    * ``'aggregation'``: a mapping whose values are compared as they are.
    * anything else: an iterable whose items are compared as they are; the
      rows are labelled "Before convention", "Convention" and
      "After convention".

    Args:
        dataframe: Mapping or iterable of vectors, see above.
        period: Mode selector; also the name of the first result column.
        gir_dict: Vector for the "distance to gir" column.
        mont_dict: Vector for the "distance to mont" column.
        plein_dict: Vector for the "distance to plein" column.
        gir_mont_diff: Vector for the "distance to diff" column.

    Returns:
        A DataFrame with the columns ``period``, ``'distance to gir'``,
        ``'distance to mont'``, ``'distance to diff'`` and
        ``'distance to plein'``.  All distances are
        ``1 - cosine_similarity`` except ``'distance to diff'``, which holds
        the raw cosine similarity.
    """
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = pd.Series(list(dataframe.keys()))
    else:
        period_vector = pd.Series(
            ["Before convention", "Convention", "After convention"])

    # Party lookup table.  It is only read, so it is loaded on first use in
    # 'speaker' mode and then shared by all remaining speakers.
    speakers_to_analyze = None

    gir_dist = []
    mont_dist = []
    plein_dist = []
    gir_mont_diff_dist = []

    for element in dataframe:
        print(element)
        if period == 'speaker':
            # The group totals are reloaded for every speaker so that each
            # one starts from the stored totals.
            # NOTE(review): these loads could be hoisted out of the loop if
            # compute_tfidf is known not to modify its input - confirm.
            with open("Girondins_withplein.pickle", "rb") as handle:
                gir = pickle.load(handle)
            with open("Montagnards_withplein.pickle", "rb") as handle:
                mont = pickle.load(handle)

            if speakers_to_analyze is None:
                speakers_to_analyze = load_list(
                    "Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]

            # Take the speaker out of their own party's total so they are
            # not compared against a vector that contains their own data.
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                print("here")
                mont = mont - dataframe[element]

            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)

            tfidf_speaker = compute_tfidf(
                dataframe[element], num_speeches, doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)

        # An empty vector (a speaker with no tf-idf scores was seen in the
        # data) gets the default values instead of a computation.
        if len(to_compare) > 0:
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            plein_dist.append(1 - cosine_similarity(plein_dict, to_compare))
            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            plein_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Each series becomes one row; transposing turns them into columns.
    comp_df = pd.DataFrame([
        period_vector,
        pd.Series(gir_dist),
        pd.Series(mont_dist),
        pd.Series(gir_mont_diff_dist),
        pd.Series(plein_dist),
    ]).transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff',
        'distance to plein'
    ]
    return comp_df