# Shared imports for the functions below. num_speeches, doc_freq, and
# date_regex are assumed to be module-level globals defined elsewhere in the
# repo, alongside helpers such as compute_ngrams, compute_tfidf,
# cosine_similarity, write_to_excel, and remove_diacritic.
import collections
import csv
import pickle
import re
from collections import Counter

import pandas as pd


def aggregate_by_period(dataframe):
    before_convention = Counter()
    convention = Counter()
    after_convention = Counter()
    for i, time in enumerate(dataframe['Full Date']):
        # Convert time to a string to do the string comparisons that determine
        # which period the row belongs to. Dates must be zero-padded
        # ("1792-06-10", not "1792-6-10") for lexicographic comparison to work.
        time = str(time)
        if (time >= "1792-06-10") and (time <= "1792-08-10"):
            before_convention = before_convention + dataframe['ngrams'].iloc[i]
        if (time >= "1792-09-20") and (time < "1793-06-02"):
            convention = convention + dataframe['ngrams'].iloc[i]
        if (time >= "1793-06-02") and (time <= "1793-08-02"):
            after_convention = after_convention + dataframe['ngrams'].iloc[i]

    before_convention_tfidf = compute_tfidf(before_convention, num_speeches, doc_freq)
    convention_tfidf = compute_tfidf(convention, num_speeches, doc_freq)
    after_convention_tfidf = compute_tfidf(after_convention, num_speeches, doc_freq)

    before_convention_df = pd.DataFrame.from_dict(before_convention_tfidf, orient="index")
    convention_df = pd.DataFrame.from_dict(convention_tfidf, orient="index")
    after_convention_df = pd.DataFrame.from_dict(after_convention_tfidf, orient="index")

    #period_df = pd.DataFrame([before_convention, convention, after_convention])
    #write_to_excel(period_df, 'periods.xlsx')

    period_df = [before_convention_tfidf, convention_tfidf, after_convention_tfidf]
    return period_df
def aggregate_by_period(dataframe):
    # Variant that takes a {date_string: Counter} mapping rather than a
    # pandas DataFrame.
    before_convention = Counter()
    convention = Counter()
    after_convention = Counter()
    for time in dataframe:
        if (time >= "1792-06-10") and (time <= "1792-08-10"):
            before_convention = before_convention + dataframe[time]
        if (time >= "1792-09-20") and (time < "1793-06-02"):
            convention = convention + dataframe[time]
        if (time >= "1793-06-02") and (time <= "1793-08-02"):
            after_convention = after_convention + dataframe[time]

    before_convention_tfidf = compute_tfidf(before_convention, num_speeches, doc_freq)
    convention_tfidf = compute_tfidf(convention, num_speeches, doc_freq)
    after_convention_tfidf = compute_tfidf(after_convention, num_speeches, doc_freq)

    before_convention_df = pd.DataFrame.from_dict(before_convention_tfidf, orient="index")
    convention_df = pd.DataFrame.from_dict(convention_tfidf, orient="index")
    after_convention_df = pd.DataFrame.from_dict(after_convention_tfidf, orient="index")

    period_df = [before_convention_tfidf, convention_tfidf, after_convention_tfidf]
    return period_df
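# compute_tfidf is called throughout this file but defined elsewhere in the
# repo. The sketch below (suffixed _sketch so it does not shadow the real
# helper) shows the assumed behavior: weight each term's raw count by a log
# inverse document frequency over the speech corpus. The exact weighting in
# the real helper may differ.
import math

def compute_tfidf_sketch(term_counts, num_docs, doc_freq):
    tfidf = {}
    for term, count in term_counts.items():
        # Guard against terms missing from the document-frequency table
        df = doc_freq.get(term, 1)
        tfidf[term] = count * math.log(float(num_docs) / df)
    return tfidf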
def distance_analysis():
    by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
    speakernumwords = pickle.load(open("speakernumwords.pickle", "rb"))

    speaker_tfidf = {}
    for speaker in by_speaker:
        counter = by_speaker[speaker]
        counter = convert_keys_to_string(counter)
        # Tried requiring v >= 3, but some speakers do not say any bigram more
        # than three times
        freq = {k: v for k, v in counter.items() if v >= 2}
        tfidf = compute_tfidf(freq, num_speeches, doc_freq)
        speaker_tfidf[speaker] = tfidf

    # The key must use the same unaccented spelling in both places below; an
    # accented "François" here would never match the unaccented pickle key,
    # and Robespierre would end up compared against himself.
    robespierre_name = "Maximilien-Francois-Marie-Isidore-Joseph de Robespierre"
    robespierre = speaker_tfidf[robespierre_name]

    speaker_dist = {}
    for speaker in speaker_tfidf:
        if str(speaker) != robespierre_name:
            #print speaker_tfidf[speaker]
            dist = 1 - cosine_similarity(robespierre, speaker_tfidf[speaker])
            speaker_dist[speaker] = dist

    w = csv.writer(open("dist_to_robespierre_withlimit.csv", "w"))
    for key, val in speaker_dist.items():
        w.writerow([key, val])
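# cosine_similarity above operates on sparse {term: weight} dicts. A minimal
# sketch of the assumed implementation (suffixed _sketch so it does not shadow
# the real helper defined elsewhere in the repo):
import math

def cosine_similarity_sketch(vec_a, vec_b):
    # Dot product over the shared keys, divided by the product of the norms
    dot = sum(weight * vec_b[term] for term, weight in vec_a.items() if term in vec_b)
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)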
def counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq):
    # Computes the tfidf scores within each group
    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    store_to_pickle(gir_tfidf, "gir_tfidf.pickle")
    store_to_pickle(mont_tfidf, "mont_tfidf.pickle")

    # Stores the tf-idf vectors in Excel
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    write_to_excel(df_gir_tfidf, 'gir_tfidf.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    write_to_excel(df_mont_tfidf, 'mont_tfidf.xlsx')

    # Combines the tfidf vectors of both parties into one file
    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf.xlsx')

    # Limits based on v (the number of times a bigram appears) and, optionally,
    # on gir_docs/mont_docs (the number of speakers in each group who use the
    # bigram). Rename these dataframes to reflect whatever restrictions apply.
    Girondins_restricted = {k: v for k, v in Girondins.items() if v >= 10}  #and (len(gir_docs[k]) > 1)}
    Montagnards_restricted = {k: v for k, v in Montagnards.items() if v >= 10}  #and (len(mont_docs[k]) > 1)}

    store_to_pickle(Girondins_restricted, "Girondins_restricted.pickle")
    store_to_pickle(Montagnards_restricted, "Montagnards_restricted.pickle")

    # Recomputes tf-idf on the restricted counters
    gir_tfidf = compute_tfidf(Girondins_restricted, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards_restricted, num_speeches, bigram_doc_freq)

    # Stores the Girondins and Montagnards frequency vectors and tfidfs in the
    # same document, according to the restrictions
    df_combined = pd.DataFrame([Girondins_restricted, Montagnards_restricted])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_restricted.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_restricted.xlsx')
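# store_to_pickle and write_to_excel are thin I/O wrappers defined elsewhere
# in the repo. Hypothetical sketches consistent with how they are called in
# this file (pickle protocol 0 and single-sheet Excel output, matching the
# inline pd.ExcelWriter usage further down):

def store_to_pickle_sketch(obj, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=0)

def write_to_excel_sketch(dataframe, filename):
    writer = pd.ExcelWriter(filename)
    dataframe.to_excel(writer, 'Sheet1')
    writer.save()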
def create_tfidf_vectors(dataframe):
    speeches = dataframe['concat_speeches'].tolist()
    ngrams = []
    for unit in speeches:
        ngrams.append(compute_ngrams(unit, 2))
    ngrams_to_add = pd.Series(ngrams)
    dataframe['ngrams'] = ngrams_to_add.values

    tfidf = []
    for element in ngrams:
        tfidf.append(compute_tfidf(element, num_speeches, doc_freq))
    tfidf_to_add = pd.Series(tfidf)
    dataframe['tfidf'] = tfidf_to_add.values
    return dataframe
def create_tfidf_vectors(dataframe):
    # Variant that takes a {key: Counter} mapping and returns {key: tfidf dict}.
    tfidf = {}
    for element in dataframe:
        tfidf[element] = compute_tfidf(dataframe[element], num_speeches, doc_freq)
    return tfidf
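# compute_ngrams is the n-gram extractor used throughout; it is defined
# elsewhere in the repo. A minimal sketch of the assumed behavior: tokenize,
# lowercase, and count n-token tuples. The real tokenization (punctuation and
# stopword handling) may differ.
import re
from collections import Counter

def compute_ngrams_sketch(text, n):
    tokens = re.findall(r"\w+", text.lower(), re.UNICODE)
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))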
def compute_distances(dataframe, period, gir_dict, mont_dict, gir_mont_diff):
    period_vector = []
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = list(dataframe.keys())
        period_vector = pd.Series(period_vector)
        speaker_num_speeches = pickle.load(open("speaker_num_speeches.pickle", "rb"))
        """period_vector = pd.Series(period_vector)
        tfidf_scores = dataframe['tfidf'].tolist()"""
    else:
        periods = ["Before convention", "Convention", "After convention"]
        period_vector = pd.Series(periods)
        # This assumes that tfidf_scores for the periods is a list, not a
        # pandas dataframe

    gir_dist = []
    mont_dist = []
    gir_mont_diff_dist = []

    # This for loop is contingent on tfidf_scores being a list
    for element in dataframe:
        """print type(element)
        print type(dataframe[element])
        to_compare = dataframe[element]"""
        if period == 'speaker':
            # Consider dividing by the number of speeches, to normalize.
            # Maintain the number of speeches and characters per group.
            gir = pickle.load(open("Girondins.pickle", "rb"))
            mont = pickle.load(open("Montagnards.pickle", "rb"))

            f = open('gir_speeches_noplein.txt', 'r')
            gir_num_speeches = int(f.read())
            f = open('mont_speeches_noplein.txt', 'r')
            mont_num_speeches = int(f.read())

            speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]
            print element
            print party
            print type(dataframe[element])

            # Removes the speaker's own counts from their party's vector
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                mont = mont - dataframe[element]

            # Normalizing by number of speeches
            #gir_normalized = normalize_by_speeches(gir, gir_num_speeches)
            #mont_normalized = normalize_by_speeches(mont, mont_num_speeches)

            gir_dict = convert_keys_to_string(compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)

            # Resets the Gir and Mont vectors to their unnormalized version
            #gir_dict_unnormalized = convert_keys_to_string(compute_tfidf(gir, num_speeches, doc_freq))
            #mont_dict_unnormalized = convert_keys_to_string(compute_tfidf(mont, num_speeches, doc_freq))

            w = csv.writer(open("gir_mont_diff.csv", "w"))
            for key, val in gir_mont_diff.items():
                w.writerow([key, val])

            # Normalizing the speaker data as well
            #speaker_speeches = speaker_num_speeches[element]
            #speaker_dict = normalize_by_speeches(dataframe[element], speaker_speeches)
            speaker_dict = dataframe[element]
            tfidf_speaker = compute_tfidf(speaker_dict, num_speeches, doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)

        # Checks whether the tfidf_scores vector exists; if it doesn't, default
        # values are assigned for the distances. This mattered because one
        # speaker had a tfidf_scores vector of length 0.
        if len(to_compare) > 0:
            # Normalized
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            # Unnormalized
            #gir_dist.append(1 - cosine_similarity(gir_dict_unnormalized, to_compare))
            #mont_dist.append(1 - cosine_similarity(mont_dict_unnormalized, to_compare))
            gir_mont_diff_dist.append(cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Merges the distance lists and creates a comprehensive dataframe to return
    mont_dist = pd.Series(mont_dist)
    gir_dist = pd.Series(gir_dist)
    gir_mont_diff_dist = pd.Series(gir_mont_diff_dist)
    comp_df = pd.DataFrame([period_vector, gir_dist, mont_dist, gir_mont_diff_dist])
    comp_df = comp_df.transpose()
    comp_df.columns = [period, 'distance to gir', 'distance to mont', 'distance to diff']
    return comp_df
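# Two helpers used above are defined elsewhere in the repo. Hypothetical
# sketches of their assumed behavior: convert_keys_to_string flattens the
# bigram tuple keys to plain strings, and compute_difference_withplein builds
# the Girondins-minus-Montagnards vector (positive weights lean Girondin,
# negative lean Montagnard, per the notes in build_vectors below).

def convert_keys_to_string_sketch(dictionary):
    return {str(key): value for key, value in dictionary.items()}

def compute_difference_withplein_sketch(gir_dict, mont_dict):
    diff = {}
    for key in set(gir_dict) | set(mont_dict):
        diff[key] = gir_dict.get(key, 0) - mont_dict.get(key, 0)
    return diff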
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards, Plein):
    speaker_names = set()
    speaker_num_speeches = {}
    speaker_char_count = {}
    speakers_to_consider = []
    bigrams_to_speeches = {}
    bigrams_to_speakers = {}
    bigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print speaker_name
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                # Keeps track of the number of speeches and characters per
                # speaker, to potentially establish a cutoff for analysis
                augment(speaker_num_speeches, speaker_name)
                if speaker_name in speaker_char_count:
                    speaker_char_count[speaker_name] += len(raw_speeches[identity])
                else:
                    speaker_char_count[speaker_name] = len(raw_speeches[identity])

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                for bigram in indv_speech_bigram:
                    augment(bigram_doc_freq, bigram)
                    # Maintains the list of speeches and the set of speakers
                    # each bigram appears in
                    if bigram in bigrams_to_speeches:
                        bigrams_to_speeches[bigram].append(identity)
                    else:
                        bigrams_to_speeches[bigram] = []
                        bigrams_to_speeches[bigram].append(identity)
                    if bigram in bigrams_to_speakers:
                        bigrams_to_speakers[bigram].add(speaker_name)
                    else:
                        bigrams_to_speakers[bigram] = set()
                        bigrams_to_speakers[bigram].add(speaker_name)

                # Augments the relevant variables according to the party the speaker belongs to
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
                    try:
                        Girondins = Girondins + indv_speech_bigram
                    except NameError:
                        Girondins = indv_speech_bigram
                else:
                    mont_num_speeches += 1
                    mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
                    try:
                        Montagnards = Montagnards + indv_speech_bigram
                    except NameError:
                        Montagnards = indv_speech_bigram

                #speech = speech + indv_speech_bigram

    # Stores the bigram Counter object for each individual speaker
    """pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(speech, handle, protocol = 0)"""

    # Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient="index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')

    with open("bigrams_to_speakers.pickle", 'wb') as handle:
        pickle.dump(bigrams_to_speakers, handle, protocol=0)
    with open("bigrams_to_speeches.pickle", 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)
    with open("gir_docs.pickle", 'wb') as handle:
        pickle.dump(gir_docs, handle, protocol=0)
    with open("mont_docs.pickle", 'wb') as handle:
        pickle.dump(mont_docs, handle, protocol=0)

    # bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
    # bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))
    # gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
    # mont_docs = pickle.load(open("mont_docs.pickle", "rb"))
    # Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
    # Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

    # Summarizes, for every bigram said at least 10 times by either group, how
    # many speakers and speeches it appears in and its total frequency
    bigram_num_speakers = []
    bigram_num_speeches = []
    bigram_total_freq = []
    bg_speeches = {}
    bigrams = []
    speeches = []
    speakers = []
    for bigram in bigrams_to_speeches:
        if (Girondins[bigram] >= 10) or (Montagnards[bigram] >= 10):
            bigram_num_speakers.append(len(bigrams_to_speakers[bigram]))
            bigram_num_speeches.append(len(bigrams_to_speeches[bigram]))
            bigram_total_freq.append(Girondins[bigram] + Montagnards[bigram])
            bigrams.append(str(bigram))
            speeches.append(str(bigrams_to_speeches[bigram]))
            speakers.append(str(bigrams_to_speakers[bigram]))

    bg_num_speakers = pd.DataFrame(bigram_num_speakers, columns=['Num Speakers'])
    bg_num_speeches = pd.DataFrame(bigram_num_speeches, columns=['Num Speeches'])
    bg_total_freq = pd.DataFrame(bigram_total_freq, columns=['Total count'])
    bgs = pd.DataFrame(bigrams, columns=["Bigram"])
    speech = pd.DataFrame(speeches, columns=["Speechids"])
    speaker = pd.DataFrame(speakers, columns=["Speakers"])

    bigram_info = pd.concat([bgs, speech, speaker, bg_num_speeches, bg_num_speakers, bg_total_freq], axis=1)
    writer = pd.ExcelWriter("bigram_info.xlsx")
    bigram_info.to_excel(writer, 'Sheet1')
    writer.save()

    w = csv.writer(open("bigrams_to_speeches_noplein.csv", "w"))
    for key, val in bigrams_to_speeches.items():
        w.writerow([key, val])

    bigrams_to_speakers_noplein_sorted = sorted(bigrams_to_speakers.items(), key=lambda x: len(x[1]), reverse=True)
    w = csv.writer(open("bigrams_to_speakers_noplein_sorted.csv", "w"))
    for item in bigrams_to_speakers_noplein_sorted:
        w.writerow([item[0], item[1]])

    # Computes the tf-idf scores for each bigram and for both the Girondins and
    # Montagnards vectors. Mirrors the withplein variant below in deriving the
    # corpus size from the per-party counts.
    # num_speeches = 4479
    # bigram_doc_freq = pickle.load(open("bigram_doc_freq_noplein_withlimit.pickle", 'rb'))
    num_speeches = gir_num_speeches + mont_num_speeches

    with open('gir_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)
    print num_speeches

    with open('speaker_num_speeches_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)
    with open('speaker_char_count_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withlimit.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])
    w = csv.writer(open("speaker_char_count_withlimit.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and the doc frequency to disk for use in
    # further analysis
    with open('num_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')
    with open("bigram_doc_freq_noplein_withlimit.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    # # Girondins = {k:v for k,v in Girondins.items() if (v >= 10)} #and (len(gir_docs[k]) > 1)}
    # # Montagnards = {k:v for k,v in Montagnards.items() if (v >= 10)} #and (len(mont_docs[k]) > 1)}
    # with open("Girondins_withlimit.pickle", 'wb') as handle:
    #     pickle.dump(Girondins, handle, protocol = 0)
    # with open("Montagnards_withlimit.pickle", 'wb') as handle:
    #     pickle.dump(Montagnards, handle, protocol = 0)
    # gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    # mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)
    # """with open("gir_tfidf.pickle", 'wb') as handle:
    #     pickle.dump(gir_tfidf, handle, protocol = 0)
    # with open("mont_tfidf.pickle", 'wb') as handle:
    #     pickle.dump(mont_tfidf, handle, protocol = 0)"""
    # # Computes the distance between the tf_idf vectors
    # #compute_distance(gir_tfidf, mont_tfidf)
    # # Stores the tf_idf vectors
    # df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient = "index")
    # #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_gir_tfidf, 'gir_tfidf_withlimit.xlsx')
    # df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient = "index")
    # #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_mont_tfidf, 'mont_tfidf_withlimit.xlsx')
    # df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    # df_tfidf_combined = df_tfidf_combined.transpose()
    # df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    # write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')

    # Constrains the analysis to bigrams said at least 10 times and, optionally,
    # to those used by more than one speaker in a group
    # print gir_docs
    Girondins = {k: v for k, v in Girondins.items() if v >= 10}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withlimit.xlsx")
    Montagnards = {k: v for k, v in Montagnards.items() if v >= 10}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withlimit.xlsx")

    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # Normalizes the vectors and computes the distance between them
    #normalized = normalize_dicts(Girondins, Montagnards)
    #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_withlimit.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')
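# augment and check_num_speakers are small bookkeeping helpers defined
# elsewhere in the repo. Hypothetical sketches consistent with how aggregate()
# calls them: augment increments a per-key count, and check_num_speakers
# tracks the set of speakers in a group who used each bigram (which is what
# the len(gir_docs[k]) > 1 restrictions above rely on).

def augment_sketch(counts, key):
    if key in counts:
        counts[key] += 1
    else:
        counts[key] = 1

def check_num_speakers_sketch(speech_ngrams, speaker, docs):
    for ngram in speech_ngrams:
        if ngram in docs:
            docs[ngram].add(speaker)
        else:
            docs[ngram] = set([speaker])
    return docs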
def data_clean(iteration, train_columns, bigram_speeches, unigram_speeches,
               bigram_freq, unigram_freq, bigram_doc_freq, unigram_doc_freq,
               num_speeches):
    classification = []
    data_set = []
    speeches = []
    speakers = []
    speechid_to_speaker = pickle.load(open("speechid_to_speaker_store.pickle", "rb"))
    speakers_to_analyze = pickle.load(open("speakers_to_analyze_store.pickle", "rb"))

    ### Should this be done once for all the data and then split into test and
    ### train, so all the data is based on the same bigrams? Or is that bad
    ### because the training data would then be connected to the test data via
    ### the tfidf calculations?
    for speechid in bigram_speeches:
        speaker = speechid_to_speaker[speechid]

        # Keep a vector of speechids, in order, to reverse engineer and check
        # which speeches were/were not correctly classified
        speeches.append(speechid)
        speakers.append(speaker)

        if speakers_to_analyze.loc[speaker, "Party"] == "Girondins":
            classification.append(0)
        else:
            classification.append(1)

        # Feature selection takes place here; the analysis accounts for both
        # bigrams and unigrams
        if iteration == "train":
            # Restricting features according to how many times they appear
            bigram_input = {k: v for k, v in bigram_speeches[speechid].items()
                            if bigram_freq[k] >= 20}
            unigram_input = {k: v for k, v in unigram_speeches[speechid].items()
                             if unigram_freq[k] >= 62}
        else:
            # Restricting features to those seen during training
            bigram_input = {k: v for k, v in bigram_speeches[speechid].items()
                            if k in train_columns}
            unigram_input = {k: v for k, v in unigram_speeches[speechid].items()
                             if k in train_columns}
        bigram_scores = compute_tfidf(bigram_input, num_speeches, bigram_doc_freq)
        unigram_scores = compute_tfidf(unigram_input, num_speeches, unigram_doc_freq)

        merge_scores = bigram_scores.copy()
        merge_scores.update(unigram_scores)
        data_set.append(merge_scores)

    # Release references to free memory for other computations
    speechid_to_speaker = None
    speakers_to_analyze = None
    bigram_speeches = None
    unigram_speeches = None
    bigram_input = None
    unigram_input = None
    bigram_scores = None
    unigram_scores = None

    data = pd.DataFrame(data_set)
    data_set = None
    data = data.fillna(0)

    return [data, classification, speeches, speakers]
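# Hypothetical usage of data_clean, one answer to the train/test question
# raised in the comment above: build the training matrix first, then pass its
# columns as the fixed feature set for the test split so no test statistics
# leak into feature selection. The *_train/*_test variables are placeholders
# for however the speech Counters are split upstream.

train_data, train_labels, train_ids, train_speakers = data_clean(
    "train", None, bigram_train, unigram_train, bigram_freq, unigram_freq,
    bigram_doc_freq, unigram_doc_freq, num_speeches)
test_data, test_labels, test_ids, test_speakers = data_clean(
    "test", train_data.columns, bigram_test, unigram_test, bigram_freq,
    unigram_freq, bigram_doc_freq, unigram_doc_freq, num_speeches)
# Align the test matrix to the training columns before fitting a classifier
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)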
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards, Plein):
    speaker_names = set()
    speaker_num_speeches = {}
    speaker_char_count = {}
    speakers_to_consider = []
    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0
    mont_num_speeches = 0
    plein_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        if (date >= "1792-09-20") and (date <= "1793-06-02"):
            # Keeps track of the number of speeches and characters per speaker,
            # to potentially establish a cutoff for analysis purposes
            speaker_name = speechid_to_speaker[identity]
            if speaker_name in speakers_to_consider:
                party = speakers_to_analyze.loc[speaker_name, "Party"]
            else:
                party = "Plein"

            augment(speaker_num_speeches, speaker_name)
            if speaker_name in speaker_char_count:
                speaker_char_count[speaker_name] += len(raw_speeches[identity])
            else:
                speaker_char_count[speaker_name] = len(raw_speeches[identity])

            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            for bigram in indv_speech_bigram:
                augment(bigram_doc_freq, bigram)
                # Maintains a list of the speeches each bigram is spoken in
                if bigram in bigrams_to_speeches:
                    bigrams_to_speeches[bigram].append(identity)
                else:
                    bigrams_to_speeches[bigram] = []
                    bigrams_to_speeches[bigram].append(identity)

            # Augments the relevant variables according to the party the speaker belongs to
            if party == "Girondins":
                gir_num_speeches += 1
                gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
                try:
                    Girondins = Girondins + indv_speech_bigram
                except NameError:
                    Girondins = indv_speech_bigram
            elif party == "Montagnards":
                mont_num_speeches += 1
                mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
                try:
                    Montagnards = Montagnards + indv_speech_bigram
                except NameError:
                    Montagnards = indv_speech_bigram
            # The Plein category covers speakers who are neither Girondins nor
            # Montagnards, to better understand speakers that are not
            # distinctly one or the other
            else:
                plein_num_speeches += 1
                plein_docs = check_num_speakers(indv_speech_bigram, speaker_name, plein_docs)
                try:
                    Plein = Plein + indv_speech_bigram
                except NameError:
                    Plein = indv_speech_bigram

            #speech = speech + indv_speech_bigram

    # Stores the bigram Counter object for each individual speaker
    """pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(speech, handle, protocol = 0)"""

    """# Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')"""

    # Computes the tf-idf scores for each bigram and for the Girondins,
    # Montagnards, and Plein vectors
    num_speeches = gir_num_speeches + mont_num_speeches + plein_num_speeches
    print num_speeches

    with open('speaker_num_speeches_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)
    with open('speaker_char_count_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withplein.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])
    w = csv.writer(open("speaker_char_count_withplein.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and the doc frequency to disk for use in
    # further analysis
    with open('num_speeches_withplein.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')
    with open("bigram_doc_freq_withplein.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    with open("Girondins_withplein.pickle", 'wb') as handle:
        pickle.dump(Girondins, handle, protocol=0)
    with open("Montagnards_withplein.pickle", 'wb') as handle:
        pickle.dump(Montagnards, handle, protocol=0)
    with open("Plein.pickle", 'wb') as handle:
        pickle.dump(Plein, handle, protocol=0)

    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)
    plein_tfidf = compute_tfidf(Plein, num_speeches, bigram_doc_freq)

    """with open("gir_tfidf.pickle", 'wb') as handle:
        pickle.dump(gir_tfidf, handle, protocol = 0)
    with open("mont_tfidf.pickle", 'wb') as handle:
        pickle.dump(mont_tfidf, handle, protocol = 0)"""

    # Computes the distance between the tf-idf vectors
    #compute_distance(gir_tfidf, mont_tfidf)

    # Stores the tf-idf vectors
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_gir_tfidf, 'gir_tfidf_withplein.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_mont_tfidf, 'mont_tfidf_withplein.xlsx')
    df_plein_tfidf = pd.DataFrame.from_dict(plein_tfidf, orient="index")
    #df_plein_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_plein_tfidf, 'plein_tfidf.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withplein.xlsx')

    # Constrains the analysis to bigrams said at least three times and,
    # optionally, to those used by more than one speaker in a group
    Girondins = {k: v for k, v in Girondins.items() if v >= 3}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withplein.xlsx")
    Montagnards = {k: v for k, v in Montagnards.items() if v >= 3}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withplein.xlsx")

    # Normalizes the vectors and computes the distance between them
    #normalized = normalize_dicts(Girondins, Montagnards)
    #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency.xlsx')
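# remove_diacritic is defined elsewhere in the repo. A hypothetical Python 2
# sketch of the assumed behavior: strip accents so speaker names match the
# unaccented index keys (it returns bytes, hence the .decode('utf-8') calls
# above).
import unicodedata

def remove_diacritic_sketch(text):
    return unicodedata.normalize('NFKD', unicode(text)).encode('ascii', 'ignore')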
def build_vectors(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze,
                  gir_tfidf, mont_tfidf, num_speeches, doc_freq):
    speaker_ngrams = {}
    speakers_to_consider = []
    speaker_distances = collections.defaultdict()
    chronology = collections.defaultdict(dict)

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name in speakers_to_consider):
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            if speaker_name in speaker_ngrams:
                speaker_ngrams[speaker_name] = speaker_ngrams[speaker_name] + indv_speech_bigram
            else:
                speaker_ngrams[speaker_name] = indv_speech_bigram

            """if speaker_name in chronology:
                pairing = chronology[speaker_name]
                for bigram in indv_speech_bigram:
                    if bigram in pairing:
                        pairing[bigram].append([identity, indv_speech_bigram[bigram]])
                    else:
                        pairing[bigram] = [identity, indv_speech_bigram[bigram]]
            else:
                chronology[speaker_name] = {}
                pairing = chronology[speaker_name]
                # Stores the unique speechid alongside the number of times that
                # bigram is said in that speech, for each bigram
                for bigram in indv_speech_bigram:
                    pairing[bigram] = [identity, indv_speech_bigram[bigram]]"""

    ## Need tf-idf vectors for gir and mont
    ## Need the doc_freq from the previous calculations
    ## Compute tf-idf for individual speakers
    ## Compute cosine distance based on those vectors (dot product over length of vectors)
    ## Compute cosine similarity against the difference between the two group
    ## vectors (subtract one from the other): A - B, if positive more like A,
    ## if negative more like B
    ## Create a tf vector for each speech and store it so vectors can just be added
    ## Separately store a single idf vector

    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    doc_freq_dict = convert_keys_to_string(doc_freq)
    gir_mont_diff = compute_difference(gir_dict, mont_dict)
    #gir_dict = gir_tfidf
    #print gir_dict
    #mont_dict = mont_tfidf

    for speaker in speaker_ngrams:
        speaker_dict = convert_keys_to_string(speaker_ngrams[speaker])
        to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq_dict)
        gir_dist = cosine_similarity(gir_dict, to_compare)
        mont_dist = cosine_similarity(mont_dict, to_compare)
        # Need to actually compute the distance
        gir_mont_diff_dist = cosine_similarity(gir_mont_diff, to_compare)
        speaker_distances[speaker] = [gir_dist, mont_dist, gir_mont_diff_dist]

    """#speaker_dict = {(str(k),v) for k,v in speaker_ngrams['Francois Chabot']}
    speaker_dict = convert_keys_to_string(speaker_ngrams['Francois Chabot'])
    to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq)
    gir_dist = cosine_similarity(gir_dict, to_compare)
    df = pd.DataFrame([to_compare, gir_dict])
    df = df.transpose()
    write_to_excel(df, "Francois Chabot Test.xlsx")"""

    """for speaker in speaker_ngrams:
        #to_compare = {k:v for k,v in speaker_ngrams[speaker].items() if (v >= 3)}
        to_compare = speaker_ngrams[speaker]
        gir_dict = gir_tfidf
        mont_dict = mont_tfidf
        gir_normalized = normalize_dicts(to_compare, gir_dict)
        gir_dist = compute_distance(gir_normalized[0], gir_normalized[1])
        to_compare = speaker_ngrams[speaker]
        mont_normalized = normalize_dicts(to_compare, mont_dict)
        mont_dist = compute_distance(mont_normalized[0], mont_normalized[1])
        speaker_distances[speaker] = [gir_dist, mont_dist]"""

    with open("speaker_ngrams.pickle", 'wb') as handle:
        pickle.dump(speaker_ngrams, handle, protocol=0)

    df = pd.DataFrame.from_dict(speaker_distances)
    df = df.transpose()
    df.columns = ["dist to Girondins", "dist to Montagnards", "dist to difference"]
    writer = pd.ExcelWriter("freq_dist_map.xlsx")
    df.to_excel(writer, 'Sheet1')
    writer.save()

    with open("freq_dist.pickle", 'wb') as handle:
        pickle.dump(speaker_distances, handle, protocol=0)

    #df2 = pd.DataFrame.from_dict(chronology)
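# Hypothetical follow-up to build_vectors: load the score map written above
# and rank speakers by the gap between their Girondin and Montagnard scores
# (the first two entries of each speaker's list).
freq_dist = pickle.load(open("freq_dist.pickle", "rb"))
leaning = sorted(freq_dist.items(), key=lambda item: item[1][0] - item[1][1])
for speaker, scores in leaning[:10]:
    print speaker, scores[0] - scores[1]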
def compute_distances(dataframe, period, gir_dict, mont_dict, plein_dict, gir_mont_diff):
    period_vector = []
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = list(dataframe.keys())
        period_vector = pd.Series(period_vector)
        """period_vector = pd.Series(period_vector)
        tfidf_scores = dataframe['tfidf'].tolist()"""
    else:
        periods = ["Before convention", "Convention", "After convention"]
        period_vector = pd.Series(periods)
        # This assumes that tfidf_scores for the periods is a list, not a
        # pandas dataframe

    gir_dist = []
    mont_dist = []
    plein_dist = []
    gir_mont_diff_dist = []

    # This for loop is contingent on tfidf_scores being a list
    for element in dataframe:
        """print type(element)
        print type(dataframe[element])
        to_compare = dataframe[element]"""
        print element
        if period == 'speaker':
            gir = pickle.load(open("Girondins_withplein.pickle", "rb"))
            mont = pickle.load(open("Montagnards_withplein.pickle", "rb"))
            speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]

            # Removes the speaker's own counts from their party's vector
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                mont = mont - dataframe[element]

            gir_dict = convert_keys_to_string(compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)

            tfidf_speaker = compute_tfidf(dataframe[element], num_speeches, doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)

        # Checks whether the tfidf_scores vector exists; if it doesn't, default
        # values are assigned for the distances. This mattered because one
        # speaker had a tfidf_scores vector of length 0.
        if len(to_compare) > 0:
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            plein_dist.append(1 - cosine_similarity(plein_dict, to_compare))
            gir_mont_diff_dist.append(cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            plein_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Merges the distance lists and creates a comprehensive dataframe to return
    gir_dist = pd.Series(gir_dist)
    mont_dist = pd.Series(mont_dist)
    plein_dist = pd.Series(plein_dist)
    gir_mont_diff_dist = pd.Series(gir_mont_diff_dist)
    comp_df = pd.DataFrame([period_vector, gir_dist, mont_dist, gir_mont_diff_dist, plein_dist])
    comp_df = comp_df.transpose()
    comp_df.columns = [period, 'distance to gir', 'distance to mont', 'distance to diff', 'distance to plein']
    return comp_df
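# Hypothetical driver for the period-level analysis, assuming by_date is a
# {date_string: Counter} mapping and that the group vectors (gir_dict,
# mont_dict, plein_dict, gir_mont_diff) have been precomputed with string keys
# as in the functions above.
period_tfidf = aggregate_by_period(by_date)
comp_df = compute_distances(period_tfidf, 'period', gir_dict, mont_dict,
                            plein_dict, gir_mont_diff)
write_to_excel(comp_df, 'period_distances.xlsx')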