def firststep(): byspeaker = {} speakerdict = {} byspeaker_allspeakers = {} speakerdict_allspeakers = {} ngrams = {} speakers_to_consider = [] raw_speeches = pickle.load(open("raw_speeches.pickle", "rb")) # dataframe = pd.DataFrame.from_dict(raw_speeches, orient = "index") # dataframe.columns = ['Speeches'] speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb")) # file = open('num_speeches.txt', 'r') # num_speeches = int(file.read()) # doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb")) speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx") for speaker in speakers_to_analyze.index.values: speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8')) for speechid in raw_speeches: fulldate = speechid[0:10] if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"): speech_bigrams = compute_ngrams(raw_speeches[speechid], 2) speaker = speechid_to_speaker[speechid] print speaker if speaker in byspeaker_allspeakers: byspeaker_allspeakers[ speaker] = byspeaker_allspeakers[speaker] + speech_bigrams else: byspeaker_allspeakers[speaker] = speech_bigrams speech_bigrams = None with open("byspeaker_allspeakers.pickle", "wb") as handle: pickle.dump(byspeaker_allspeakers, handle, protocol=0) w = csv.writer(open("byspeaker_allspeakers.csv", "w")) for key, val in byspeaker.items(): w.writerow([key, val]) """byspeaker_allspeakers = pd.DataFrame.from_dict(byspeaker_allspeakers, orient = "index")
def aggregate_by_speaker():
    """Aggregate per-speaker bigram counts and word counts for the Convention.

    Word counts are tracked for every speaker whose speech falls in the
    1792-09-20 .. 1793-06-02 window; bigram aggregation is restricted to the
    speakers listed in the Girondins/Montagnards spreadsheet. Results are
    persisted via the project's write_to_csv / store_to_pickle helpers.
    """
    byspeaker = {}
    speakers_to_consider = []
    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    speaker_num_words = {}
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speechid in raw_speeches:
        # Speech ids begin with an ISO date, so lexicographic comparison
        # selects the Convention period directly.
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            num_words = len(raw_speeches[speechid].split())
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)
            speaker = speechid_to_speaker[speechid]
            if speaker in speaker_num_words:
                speaker_num_words[speaker] += num_words
            else:
                speaker_num_words[speaker] = num_words
            # Bigrams are only aggregated for the analyzed party members.
            if speaker in speakers_to_consider:
                if speaker in byspeaker:
                    byspeaker[speaker] = byspeaker[speaker] + speech_bigrams
                else:
                    byspeaker[speaker] = speech_bigrams
            speech_bigrams = None
    write_to_csv(byspeaker)
    store_to_pickle(byspeaker)
    write_to_csv(speaker_num_words)
    store_to_pickle(speaker_num_words)
speaker_name == speechid_to_speaker[identity]): indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2) for bigram in indv_speech_bigram: if bigram in speaker_bigram_frequencies: #speechid_frequencies = speaker_bigram_frequencies[bigram] #speechid_frequencies[speechid] = indv_speech_bigram[bigram] speaker_bigram_frequencies[bigram][ identity] = indv_speech_bigram[bigram] else: speaker_bigram_frequencies[bigram] = {} speaker_bigram_frequencies[bigram][ identity] = indv_speech_bigram[bigram] filename_pickle = "" + speaker_name + "bigram_frequencies.pickle" with open(filename_pickle, 'wb') as handle: pickle.dump(speaker_bigram_frequencies, handle, protocol=0) filename_csv = "" + speaker_name + "bigram_frequencies.csv" w = csv.writer(open(filename_csv, "w")) for key, val in speaker_bigram_frequencies.items(): w.writerow([key, val]) if __name__ == '__main__': import sys raw_speeches = pickle.load(open("raw_speeches.pickle", "rb")) speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb")) speakers_to_analyze = load_list( "Girondins and Montagnards New Mod Limit.xlsx") aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker)
def track_murmures_applaudissements(raw_speeches, speechid_to_speaker):
    """Count 'murmure' and 'applaudissement' reactions recorded in speeches.

    Tallies overall totals, per-party counts (Girondins vs. everyone else in
    the analyzed list, counted as Montagnards), per-date counts, and
    per-speaker counts, then writes the results to text files, pickles, and
    CSVs.

    raw_speeches        -- dict mapping speech id -> speech text
    speechid_to_speaker -- dict mapping speech id -> speaker name
    """
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    speakers_to_consider = []
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    murmures = []
    applaudissements = []
    Girondins_murmures = 0
    Montagnards_murmures = 0
    Girondins_applaudissements = 0
    Montagnards_applaudissements = 0
    murmures_by_date = {}
    applaudissements_by_date = {}
    total_murmures = 0
    total_applaudissements = 0
    murmures_speakers = {}
    applaudissements_speakers = {}
    for speechid, speech in raw_speeches.items():
        speaker_name = speechid_to_speaker[speechid]
        if speaker_name in speakers_to_consider:
            # BUG FIX: was speechid[0:9], which truncated the ISO date
            # (YYYY-MM-DD is 10 characters) and merged distinct days.
            date = speechid[0:10]
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            if "murmure" in speech:
                total_murmures += 1
                murmures.append(speechid)
                if party == "Girondins":
                    Girondins_murmures += 1
                else:
                    Montagnards_murmures += 1
                # BUG FIX: the first occurrence used to be recorded as 0,
                # undercounting every date and speaker by one.
                if date in murmures_by_date:
                    murmures_by_date[date] += 1
                else:
                    murmures_by_date[date] = 1
                if speaker_name in murmures_speakers:
                    murmures_speakers[speaker_name] += 1
                else:
                    murmures_speakers[speaker_name] = 1
            if "applaudissement" in speech:
                total_applaudissements += 1
                applaudissements.append(speechid)
                if party == "Girondins":
                    Girondins_applaudissements += 1
                else:
                    Montagnards_applaudissements += 1
                # BUG FIX: same off-by-one as the murmures tallies above.
                if date in applaudissements_by_date:
                    applaudissements_by_date[date] += 1
                else:
                    applaudissements_by_date[date] = 1
                if speaker_name in applaudissements_speakers:
                    applaudissements_speakers[speaker_name] += 1
                else:
                    applaudissements_speakers[speaker_name] = 1
        else:
            # Speakers outside the analyzed list still count toward totals.
            if "murmure" in speech:
                total_murmures += 1
            if "applaudissement" in speech:
                total_applaudissements += 1
    with open('gir_murmures.txt', 'w') as f:
        f.write('%d' % Girondins_murmures)
    with open('mont_murmures.txt', 'w') as f:
        f.write('%d' % Montagnards_murmures)
    print(Montagnards_murmures + Girondins_murmures)
    with open('total_murmures.txt', 'w') as f:
        f.write('%d' % total_murmures)
    with open('total_applaudissements.txt', 'w') as f:
        f.write('%d' % total_applaudissements)
    with open('gir_applaudissements.txt', 'w') as f:
        f.write('%d' % Girondins_applaudissements)
    with open('mont_applaudissements.txt', 'w') as f:
        f.write('%d' % Montagnards_applaudissements)
    print(Montagnards_applaudissements + Girondins_applaudissements)
    with open('murmures_by_date.pickle', 'wb') as handle:
        pickle.dump(murmures_by_date, handle, protocol=0)
    with open('applaudissements_by_date.pickle', 'wb') as handle:
        pickle.dump(applaudissements_by_date, handle, protocol=0)
    w = csv.writer(open("murmures_by_date.csv", "w"))
    for key, val in murmures_by_date.items():
        w.writerow([key, val])
    w = csv.writer(open("applaudissements_by_date.csv", "w"))
    for key, val in applaudissements_by_date.items():
        w.writerow([key, val])
    w = csv.writer(open("murmures_speakers.csv", "w"))
    for key, val in murmures_speakers.items():
        w.writerow([key, val])
    w = csv.writer(open("applaudissements_speakers.csv", "w"))
    for key, val in applaudissements_speakers.items():
        w.writerow([key, val])
def compute_distances(dataframe, period, gir_dict, mont_dict, gir_mont_diff):
    """Build a dataframe of cosine distances to the party baseline vectors.

    For each entry in `dataframe`, computes 1 - cosine_similarity against the
    Girondins and Montagnards tf-idf vectors, and the cosine similarity
    against their difference vector. When period == 'speaker', the speaker's
    own counts are first subtracted from their party aggregate so the speaker
    is not compared against a vector that contains themselves.

    dataframe     -- dict keyed by speaker/aggregation name, or a list of
                     tf-idf vectors for the three convention periods
    period        -- 'speaker', 'aggregation', or anything else (periods)
    gir_dict, mont_dict, gir_mont_diff
                  -- baseline vectors; recomputed per element when
                     period == 'speaker'
    Returns a pandas DataFrame with one row per element and columns
    [period, 'distance to gir', 'distance to mont', 'distance to diff'].

    NOTE(review): relies on module-level `num_speeches` and `doc_freq` being
    defined before this is called — confirm at the call site.
    """
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = pd.Series(list(dataframe.keys()))
        speaker_num_speeches = pickle.load(
            open("speaker_num_speeches.pickle", "rb"))
    else:
        # Assumes the per-period tf-idf scores arrive as a plain list.
        period_vector = pd.Series(
            ["Before convention", "Convention", "After convention"])
    gir_dist = []
    mont_dist = []
    gir_mont_diff_dist = []
    if period == 'speaker':
        # Loop-invariant: load the party roster once, not once per speaker.
        speakers_to_analyze = load_list(
            "Girondins and Montagnards New Mod.xlsx")
    for element in dataframe:
        if period == 'speaker':
            # Reload the full party aggregates on every iteration because the
            # current speaker's counts are subtracted from them below.
            gir = pickle.load(open("Girondins.pickle", "rb"))
            mont = pickle.load(open("Montagnards.pickle", "rb"))
            # FIX: use context managers so the handles are closed (the
            # original leaked them and shadowed the Py2 builtin `file`).
            # Counts kept for the currently-disabled per-speech normalization.
            with open('gir_speeches_noplein.txt', 'r') as fh:
                gir_num_speeches = int(fh.read())
            with open('mont_speeches_noplein.txt', 'r') as fh:
                mont_num_speeches = int(fh.read())
            party = speakers_to_analyze.loc[element, "Party"]
            print(element)
            print(party)
            print(type(dataframe[element]))
            # Remove the speaker's own counts from their party vector so the
            # distance is not biased by self-similarity.
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                mont = mont - dataframe[element]
            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)
            w = csv.writer(open("gir_mont_diff.csv", "w"))
            for key, val in gir_mont_diff.items():
                w.writerow([key, val])
            tfidf_speaker = compute_tfidf(dataframe[element], num_speeches,
                                          doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)
        # Guard against an empty tf-idf vector (observed for one speaker):
        # fall back to maximal distance / zero similarity.
        if len(to_compare) > 0:
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            gir_mont_diff_dist.append(0)
    # Merge the distance lists into one dataframe, one row per element.
    comp_df = pd.DataFrame([period_vector,
                            pd.Series(gir_dist),
                            pd.Series(mont_dist),
                            pd.Series(gir_mont_diff_dist)])
    comp_df = comp_df.transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff'
    ]
    return comp_df
def compute_distances(dataframe, period, gir_dict, mont_dict, plein_dict,
                      gir_mont_diff):
    """Build a dataframe of cosine distances including the Plaine baseline.

    Like the no-plein variant, but also computes 1 - cosine_similarity
    against `plein_dict` and uses the *_withplein party aggregates. When
    period == 'speaker', the speaker's counts are subtracted from their own
    party's aggregate before the baselines are re-derived.

    dataframe     -- dict keyed by speaker/aggregation name, or a list of
                     tf-idf vectors for the three convention periods
    period        -- 'speaker', 'aggregation', or anything else (periods)
    gir_dict, mont_dict, plein_dict, gir_mont_diff
                  -- baseline vectors; gir/mont/diff are recomputed per
                     element when period == 'speaker'
    Returns a pandas DataFrame with columns [period, 'distance to gir',
    'distance to mont', 'distance to diff', 'distance to plein'].

    NOTE(review): relies on module-level `num_speeches` and `doc_freq` being
    defined before this is called — confirm at the call site.
    """
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = pd.Series(list(dataframe.keys()))
    else:
        # Assumes the per-period tf-idf scores arrive as a plain list.
        period_vector = pd.Series(
            ["Before convention", "Convention", "After convention"])
    gir_dist = []
    mont_dist = []
    plein_dist = []
    gir_mont_diff_dist = []
    if period == 'speaker':
        # Loop-invariant: load the party roster once, not once per speaker.
        speakers_to_analyze = load_list(
            "Girondins and Montagnards New Mod.xlsx")
    for element in dataframe:
        print(element)
        if period == 'speaker':
            # Reload the full aggregates each iteration because the current
            # speaker's counts are subtracted from them below.
            gir = pickle.load(open("Girondins_withplein.pickle", "rb"))
            mont = pickle.load(open("Montagnards_withplein.pickle", "rb"))
            party = speakers_to_analyze.loc[element, "Party"]
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                print("here")
                mont = mont - dataframe[element]
            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)
            tfidf_speaker = compute_tfidf(dataframe[element], num_speeches,
                                          doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)
        # Checks if the tfidf_scores vector exists. If it doesn't, default
        # values are assigned for the distance. This was particularly
        # relevant as there was a speaker with tfidf_scores of length 0.
        if len(to_compare) > 0:
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            plein_dist.append(1 - cosine_similarity(plein_dict, to_compare))
            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            plein_dist.append(1)
            gir_mont_diff_dist.append(0)
    # Merge the distance lists into one dataframe, one row per element.
    comp_df = pd.DataFrame([period_vector,
                            pd.Series(gir_dist),
                            pd.Series(mont_dist),
                            pd.Series(gir_mont_diff_dist),
                            pd.Series(plein_dist)])
    comp_df = comp_df.transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff',
        'distance to plein'
    ]
    return comp_df