def findSpeeches(raw_speeches, multiple_speakers, daily_soup, date, volno):
    """Normalize the speaker of every ``<sp>`` element and harvest its footnotes.

    For each ``<sp>`` tag in ``daily_soup`` the ``<speaker>`` text is passed
    through ``remove_diacritic``, stripped of honorific fragments and
    punctuation, and lower-cased; when the ``<speaker>`` lookup raises
    ``AttributeError`` the name is the empty string.  Every ``<note>`` inside
    the speech is extracted from the tree and appended to the module-level
    ``footnotes`` list as ``[text, speaker, speech_id, volno]``, where
    ``speech_id`` is ``date`` with ``/`` replaced by ``_`` followed by the
    1-based position of the speech.

    ``raw_speeches`` and ``multiple_speakers`` are not used by this code.
    """
    id_base = date.replace("/", "_")
    number_of_speeches = 0
    # Not referenced anywhere below.
    presidents = [">le President", "Le President", "Mle President", "President", "le' President", "Le Preesident", "Le Preseident", "Le Presidant", "Le Presideait", "le Presiden", "le President", "Le president", "le president", "Le President,", "Le Presideut", "Le Presidtent", "le Presient", "le Presldent", "le'President"]
    # Fragments deleted from the speaker name, applied strictly in this order.
    removed_fragments = ("M.", "MM ", "MM. ", "M ", "de ", "M. ", "M, ",
                         "M- ", "M; ", "M* ", ".", ":")

    for talk in daily_soup.find_all('sp'):
        # Extract the speaker name and simplify it for pairing with the
        # Excel file.
        try:
            speaker = talk.find('speaker').get_text()
            speaker = remove_diacritic(speaker).decode('utf-8')
            for fragment in removed_fragments:
                speaker = speaker.replace(fragment, "")
            speaker = speaker.replace("-", " ")
            if speaker.endswith(","):
                speaker = speaker[:-1]
            if speaker.endswith(", "):
                speaker = speaker[:-1]
            if speaker.startswith(' M. '):
                speaker = speaker[3:]
            if speaker.startswith(' '):
                speaker = speaker[1:]
            if speaker.endswith(' '):
                speaker = speaker[:-1]
        except AttributeError:
            speaker = ""
        speaker = speaker.lower()

        # Pull the footnotes out of the speech.
        speech_id = "" + id_base + "_" + str(number_of_speeches + 1)
        while talk.find("note"):
            note_text = remove_diacritic(talk.note.extract().get_text()).decode('utf-8')
            for whitespace in ("\n", "\r", "\t"):
                note_text = note_text.replace(whitespace, "")
            note_text = note_text.replace(" ", " ")
            footnotes.append([note_text, speaker, speech_id, volno])

        number_of_speeches += 1
def _wrong_date_entry(coded_date, date, volno):
    """Format one line of the wrong-dates report for a ``<date>`` tag."""
    return coded_date + "; " + str(date.contents) + "; " + str(volno) + "\n"


def parseFiles():
    """Report ``<date>`` tags whose text disagrees with their ``value`` attribute.

    Every ``.xml`` file in ``Docs/`` is parsed.  For each ``<date>`` tag that
    has attributes, the month in the ``value`` attribute (parsed with
    ``date_regex``) is compared with the month named in the tag text (parsed
    with ``text_regex`` and mapped through ``month_to_num``).  A tag is
    reported when the text cannot be parsed, the month name is unknown, or
    the two months differ.  The sorted report lines are written to
    ``wrong_dates.txt``.

    A tag whose text cannot be parsed is reported once and then skipped, so
    it is never compared against a missing or left-over month value.
    """
    wrong_dates = set()
    for name in os.listdir("Docs/"):
        if not name.endswith(".xml"):
            continue
        print(name)
        with open('Docs/' + name, "r") as handle:
            # vol_regex is applied to the string form of the file object.
            volno = re.findall(vol_regex, str(handle))[0]
            contents = handle.read()
        soup = BeautifulSoup(contents, 'lxml')

        # Look at date tags that carry attributes.
        for date in soup.find_all('date'):
            if not date.attrs:
                continue
            coded_date = date['value']
            _, month, _ = re.findall(date_regex, coded_date)[0]

            # Drop the first child tag before reading the textual date.
            child = date.findChildren()
            if child:
                child[0].extract()
            text_date = date.get_text()
            text_date = re.sub(r'([ ]{2,})', ' ', text_date)
            text_date = remove_diacritic(text_date).decode('utf-8')
            text_date = text_date.lower().replace('\n', '')

            # The textual date must split into day, month and year.
            try:
                _, text_month, _ = re.findall(text_regex, text_date)[0]
            except (IndexError, ValueError, TypeError):
                wrong_dates.add(_wrong_date_entry(coded_date, date, volno))
                continue
            text_month = text_month.replace(' (sic)', '').replace(
                '\n', '').replace('\r', '').replace(' ', '')
            text_month = re.sub(r'([ ]{2,})', ' ', text_month)

            # The month name must be known and match the encoded month.
            try:
                month_num = month_to_num[text_month]
            except KeyError:
                wrong_dates.add(_wrong_date_entry(coded_date, date, volno))
                continue
            if month_num != str(month):
                wrong_dates.add(_wrong_date_entry(coded_date, date, volno))

    # Write the wrong dates to a file.
    with open('wrong_dates.txt', 'w') as report:
        report.writelines(sorted(wrong_dates))
def read_names(name_file):
    """Load the speaker spreadsheet with normalized names.

    Reads ``name_file`` with ``pandas.read_excel``, indexes it by the
    ``Last Name`` column, and replaces both the index and the ``Full Name``
    column with their ``remove_diacritic``-processed, lower-cased forms.

    Returns the resulting DataFrame.
    """
    pd_list = pd.read_excel(name_file).set_index('Last Name')
    # Build each normalized list in one pass instead of looking every name up
    # again with list.index() while the list is being rewritten.
    pd_list.index = [
        remove_diacritic(speaker).decode('utf-8').lower()
        for speaker in pd_list.index.tolist()
    ]
    pd_list["Full Name"] = [
        remove_diacritic(full_name).decode('utf-8').lower()
        for full_name in pd_list["Full Name"]
    ]
    return pd_list
def parseEnc():
    """Collect the set of words found in the ``.tei`` files of ``Encyclopedie/``.

    For every ``<p>`` element the first ``<i>``, ``<sc>`` and ``<note>`` child
    (when present) is removed, punctuation is stripped, the text is passed
    through ``remove_diacritic`` and lower-cased, and the result is split on
    single spaces.

    Returns the set of all resulting tokens.
    """
    words = set()
    for name in os.listdir("Encyclopedie/"):
        if not name.endswith(".tei"):
            continue
        print(name)
        with open('Encyclopedie/' + name, "r") as handle:
            contents = handle.read()
        soup = BeautifulSoup(contents, 'lxml')
        for para in soup.find_all('p'):
            for tag_name in ("i", "sc", "note"):
                node = para.find(tag_name)
                if node is not None:
                    node.extract()
            text = para.get_text()
            text = text.replace("\n", " ").replace("& ", "").replace(
                "; ", "").replace(".", "").replace(",", "").replace(
                    "?", "").replace("!", "").replace(" ", " ")
            # Lower-case the text that is actually split into words; the
            # lower-casing used to be applied to a value that was then
            # discarded, leaving capitalized words in the result.
            text = remove_diacritic(text).decode('utf-8').lower()
            words.update(text.split(" "))
    return words
def clean(just_speech):
    """Strip punctuation, stop words and two fixed terms from a speech.

    Stop words are the first comma-separated field of each line of
    ``FrenchStopwords.txt``, with line endings removed and passed through
    ``remove_diacritic``.  Punctuation in ``just_speech`` is replaced by
    spaces, the lower-cased text is filtered with ``remove_stopwords``, and
    the substrings ``marat`` and ``accusation`` are blanked out.

    Returns the cleaned text.
    """
    # The file is closed as soon as the stop words have been read.
    with open('FrenchStopwords.txt', 'r') as stopwords_from_file:
        french_stopwords = [
            # Drop line endings so the parser catches matches, and drop
            # accents because the whole analysis is done without them.
            remove_diacritic(
                unicode(line.split(',')[0].replace("\n", "").replace("\r", ""),
                        'utf-8'))
            for line in stopwords_from_file
        ]

    # Each token becomes one space.  "``" must come before "`" so that a
    # doubled backtick collapses to a single space.  The repeated "-" and the
    # backslash-backtick pair of the earlier chain could never match once
    # "-" and "\\" had been replaced, so they are not listed.
    for token in ("%", "\\", "^", "=", "]", "\"", "``", "-", "[", "{", "$",
                  "~", "}", "&", ">", "#", "/", "'", "*", "`", ";", "?", ",",
                  ":", ".", "(", ")"):
        just_speech = just_speech.replace(token, " ")

    clean_text = remove_stopwords(just_speech.lower(), french_stopwords)
    clean_text = clean_text.replace("marat", " ").replace("accusation", " ")
    return clean_text
def parseFile():
    """Extract each speaker's text from ``marat.xml`` and export it.

    Paragraphs that start with a DEPARTEMENT heading are removed from the raw
    XML.  For every ``<sp>`` element the text of its ``<p>`` children is
    concatenated, passed through ``remove_diacritic`` and whitespace-
    normalized.  Texts longer than 30 characters are handed to
    ``runTopicModel``; the speaker-to-text mapping is written to
    ``Marat_Justifications.xlsx``.
    """
    votes = {}
    justifications = []
    votes_model2 = {}
    # Read the file up front so the handle is closed before any processing.
    with open('marat.xml', "r") as source:
        contents = source.read()
    contents = re.sub(
        r'(<p>(?:DÉPARTEMENT|DEPARTEMENT|DÉPARTEMENE)[\s\S]{1,35}<\/p>)', '',
        contents)
    soup = BeautifulSoup(contents, 'lxml')

    # Look at all speaker tags in the XML.
    for talk in soup.find_all('sp'):
        speaker = talk.find('speaker').get_text()
        speaker = remove_diacritic(speaker).decode('utf-8')
        speaker = speaker.replace(".", "")

        # Find all the text by looking at paragraph tags.
        sections = talk.find_all('p')
        full_speech = ""
        if sections:
            text = "".join(section.get_text() for section in sections)
            full_speech = remove_diacritic(text).decode('utf-8')
            full_speech = full_speech.replace('\n', '').replace(
                '\t', '').replace('\r', '')
            full_speech = re.sub(r'([ ]{2,})', ' ', full_speech)

        # NOTE(review): the flattened source does not show whether this
        # assignment sat under the length check below; it is recorded for
        # every speaker here - confirm.
        votes[speaker] = full_speech
        # Both collections feed topic modeling (two different approaches) and
        # are restricted to texts longer than 30 characters.
        if len(full_speech) > 30:
            justifications.append(full_speech)
            votes_model2[speaker] = full_speech

    # Two topic model functions.
    runTopicModel(justifications)
    #topicModel(votes_model2)

    df = pd.DataFrame.from_dict(votes, orient='index')
    writer = pd.ExcelWriter('Marat_Justifications.xlsx')
    df.to_excel(writer)
    writer.save()
def read_names_file(name_file):
    """Load the speaker spreadsheet indexed by normalized full name.

    Reads ``name_file`` with ``pandas.read_excel``, indexes it by the
    ``Full Name`` column, and replaces the index with its
    ``remove_diacritic``-processed, lower-cased form.

    Returns the resulting DataFrame.
    """
    pd_list = pd.read_excel(name_file).set_index('Full Name')
    # One pass over the index instead of a list.index() lookup per name on a
    # list that is being rewritten during iteration.
    pd_list.index = [
        remove_diacritic(speaker).decode('utf-8').lower()
        for speaker in pd_list.index.tolist()
    ]
    return pd_list
def _write_error_counts(basename, counts):
    """Save ``counts`` as ``<basename>.pickle`` and ``<basename>.csv``."""
    with open(basename + ".pickle", 'wb') as handle:
        pickle.dump(counts, handle, protocol=0)
    with open(basename + ".csv", "w") as handle:
        w = csv.writer(handle)
        for key, val in counts.items():
            w.writerow([key, val])


def checkErrors(enc_words, french_stopwords):
    """Count words absent from ``enc_words`` per page and per volume.

    Every ``.xml`` file in ``AP_ARTFL_vols/`` is cut into page fragments with
    a ``<pb ...>`` regular expression.  Within each fragment, paragraph text
    is lower-cased, passed through ``remove_diacritic`` and
    ``remove_stopwords``, stripped of punctuation and digits, and split on
    spaces; each token not found in ``enc_words`` is printed and counted.

    Counts are written to ``errors_per_vol`` and ``errors_per_page`` pickle
    and CSV files.  Page keys are ``<volno>_pg<n>``.

    The paragraphs examined for a page are those of that page's fragment;
    previously every page re-scanned the paragraphs of the whole volume, so
    all pages of a volume received the same count.
    """
    errors_per_vol = {}
    errors_per_page = {}
    for name in os.listdir("AP_ARTFL_vols/"):
        if not name.endswith(".xml"):
            continue
        with open('AP_ARTFL_vols/' + name, "r") as handle:
            # vol_regex is applied to the string form of the file object.
            volno = re.findall(vol_regex, str(handle))[0]
            contents = handle.read()
        num_errors = 0
        pages = re.findall(
            r'<pb n="[\s0-9]+" facs="[\s\S]{0,300}" \/> [\s\S]{0,10000} <pb',
            contents)
        for page in pages:
            page_soup = BeautifulSoup(page, 'lxml')
            pageno = volno + "_pg" + page_soup.find_all('pb')[0].get("n")
            error_per_page = 0
            for para in page_soup.find_all('p'):
                if para.find("note"):
                    para.note.extract()
                text = para.get_text().lower()
                text = remove_diacritic(text).decode('utf-8')
                text = text.replace("'", " ")
                paragraph = remove_stopwords(text, french_stopwords)
                paragraph = paragraph.replace("\n", " ")
                for token in (")", "*", ":", "-", "_", "(", "& ", "; ", ".",
                              ",", "?", "!"):
                    paragraph = paragraph.replace(token, "")
                paragraph = re.sub(r'([0-9]{1,4})', ' ', paragraph)
                for word in paragraph.split(" "):
                    if word not in enc_words:
                        print(word)
                        error_per_page += 1
                        num_errors += 1
            errors_per_page[pageno] = error_per_page
        errors_per_vol[volno] = num_errors

    _write_error_counts("errors_per_vol", errors_per_vol)
    _write_error_counts("errors_per_page", errors_per_page)
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list,
                         speakers_to_analyze, Girondins, Montagnards):
    """Tabulate bigram counts per speech and per date for selected speakers.

    Only speeches dated 1792-09-20 through 1793-06-02 whose speaker appears
    in the index of ``speakers_to_analyze`` (after ``remove_diacritic``) are
    used.  For each bigram of such a speech one row is produced in two
    tables: ``[bigram, speaker, speech id, count, party]`` and
    ``[bigram, speaker, date, count, party]``.

    The date table is passed to ``make_visualizations``; both tables are
    saved with ``write_to_excel`` and ``store_to_pickle``.

    ``speaker_list``, ``Girondins`` and ``Montagnards`` are not used.
    """
    # A set gives constant-time membership tests inside the speech loop.
    speakers_to_consider = set(
        remove_diacritic(speaker).decode('utf-8')
        for speaker in speakers_to_analyze.index.values)

    row_entry_speechid = []
    row_entry_date = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        if not ("1792-09-20" <= date <= "1793-06-02"
                and speaker_name in speakers_to_consider):
            continue
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
        # Store the relevant information for each bigram.
        for bigram in indv_speech_bigram:
            count = indv_speech_bigram[bigram]
            row_entry_speechid.append(
                [str(bigram), speaker_name, identity, count, party])
            row_entry_date.append(
                [str(bigram), speaker_name, date, count, party])

    chronology_speechid = pd.DataFrame(
        row_entry_speechid,
        columns=["Bigram", "Speaker Name", "Speechid", "Num occurrences",
                 "Party"])
    chronology_date = pd.DataFrame(
        row_entry_date,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences",
                 "Party"])

    make_visualizations(chronology_date)
    write_to_excel(chronology_speechid, "chronology_speechid.xlsx")
    write_to_excel(chronology_date, "chronology_date.xlsx")
    store_to_pickle(chronology_speechid, "chronology_speechid.pickle")
    store_to_pickle(chronology_date, "chronology_date.pickle")
def firststep(): byspeaker = {} speakerdict = {} byspeaker_allspeakers = {} speakerdict_allspeakers = {} ngrams = {} speakers_to_consider = [] raw_speeches = pickle.load(open("raw_speeches.pickle", "rb")) # dataframe = pd.DataFrame.from_dict(raw_speeches, orient = "index") # dataframe.columns = ['Speeches'] speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb")) # file = open('num_speeches.txt', 'r') # num_speeches = int(file.read()) # doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb")) speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx") for speaker in speakers_to_analyze.index.values: speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8')) for speechid in raw_speeches: fulldate = speechid[0:10] if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"): speech_bigrams = compute_ngrams(raw_speeches[speechid], 2) speaker = speechid_to_speaker[speechid] print speaker if speaker in byspeaker_allspeakers: byspeaker_allspeakers[ speaker] = byspeaker_allspeakers[speaker] + speech_bigrams else: byspeaker_allspeakers[speaker] = speech_bigrams speech_bigrams = None with open("byspeaker_allspeakers.pickle", "wb") as handle: pickle.dump(byspeaker_allspeakers, handle, protocol=0) w = csv.writer(open("byspeaker_allspeakers.csv", "w")) for key, val in byspeaker.items(): w.writerow([key, val]) """byspeaker_allspeakers = pd.DataFrame.from_dict(byspeaker_allspeakers, orient = "index")
def compute_ngrams(speech, order):
    """Count the n-grams of length ``order`` in a cleaned-up ``speech``.

    Stop words are the first comma-separated field of each line of
    ``FrenchStopwords.txt``, with line endings removed and passed through
    ``remove_diacritic``.  Punctuation in ``speech`` is replaced by spaces,
    the lower-cased text is filtered with ``remove_stopwords``, a fixed list
    of boilerplate phrases is blanked out, and the remainder is handed to
    ``make_ngrams``.

    Returns a ``Counter`` of the resulting n-grams.
    """
    # The file is closed as soon as the stop words have been read.
    with open('FrenchStopwords.txt', 'r') as stopwords_from_file:
        french_stopwords = [
            # Drop line endings so the parser catches matches, and drop
            # accents because the whole analysis is done without them.
            remove_diacritic(
                unicode(line.split(',')[0].replace("\n", "").replace("\r", ""),
                        'utf-8'))
            for line in stopwords_from_file
        ]

    # Each token becomes one space.  "``" must come before "`" so that a
    # doubled backtick collapses to a single space.  The repeated "-" and the
    # backslash-backtick pair of the earlier chain could never match once
    # "-" and "\\" had been replaced, so they are not listed.
    for token in ("%", "\\", "^", "=", "]", "\"", "``", "-", "[", "{", "$",
                  "~", "}", "&", ">", "#", "/", "'", "*", "`", ";", "?", ",",
                  ":", ".", "(", ")"):
        speech = speech.replace(token, " ")

    clean_text = remove_stopwords(speech.lower(), french_stopwords)
    for phrase in ("mm secretaire", "assemble nationale",
                   "monsieur president", "convention nationale",
                   "archives parliamentaire", "republique francaise",
                   "ordre jour", "corps legislatif"):
        clean_text = clean_text.replace(phrase, " ")

    return Counter(make_ngrams(clean_text, order))
def aggregate_by_speaker():
    """Aggregate bigram counts and word counts per speaker.

    Loads ``raw_speeches.pickle`` and ``speechid_to_speaker.pickle`` and the
    speaker list from ``Girondins and Montagnards New Mod Limit.xlsx``.  For
    speeches whose id starts with a date from 1792-09-20 through 1793-06-02,
    the whitespace-separated word count is accumulated for every speaker,
    and bigram counts are accumulated for speakers on the list.

    Both mappings are passed to ``write_to_csv`` and ``store_to_pickle``.
    """
    # Close the input files as soon as they have been unpickled.
    with open("raw_speeches.pickle", "rb") as handle:
        raw_speeches = pickle.load(handle)
    with open("speechid_to_speaker.pickle", "rb") as handle:
        speechid_to_speaker = pickle.load(handle)
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    # A set gives constant-time membership tests inside the speech loop.
    speakers_to_consider = set(
        remove_diacritic(speaker).decode('utf-8')
        for speaker in speakers_to_analyze.index.values)

    byspeaker = {}
    speaker_num_words = {}
    for speechid in raw_speeches:
        # The first ten characters of a speech id are compared as a date.
        fulldate = speechid[0:10]
        if not ("1792-09-20" <= fulldate <= "1793-06-02"):
            continue
        speech = raw_speeches[speechid]
        speaker = speechid_to_speaker[speechid]
        speaker_num_words[speaker] = (
            speaker_num_words.get(speaker, 0) + len(speech.split()))
        if speaker not in speakers_to_consider:
            continue
        # Bigrams are only computed for speakers whose counts are kept.
        speech_bigrams = compute_ngrams(speech, 2)
        if speaker in byspeaker:
            byspeaker[speaker] = byspeaker[speaker] + speech_bigrams
        else:
            byspeaker[speaker] = speech_bigrams

    write_to_csv(byspeaker)
    store_to_pickle(byspeaker)
    write_to_csv(speaker_num_words)
    store_to_pickle(speaker_num_words)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    """Write per-speaker bigram frequencies, broken down by speech.

    For every name in the index of ``speakers_to_analyze`` (after
    ``remove_diacritic``), the speeches dated 1792-09-20 through 1793-06-02
    attributed to that name are scanned and a mapping
    ``bigram -> {speech id: count}`` is built.  The mapping is written to
    ``<name>bigram_frequencies.pickle`` and ``<name>bigram_frequencies.csv``.
    """
    speakers_to_consider = [
        remove_diacritic(speaker).decode('utf-8')
        for speaker in speakers_to_analyze.index.values
    ]

    for speaker_name in speakers_to_consider:
        print(speaker_name)
        speaker_bigram_frequencies = {}
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if ("1792-09-20" <= date <= "1793-06-02"
                    and speaker_name == speechid_to_speaker[identity]):
                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                for bigram in indv_speech_bigram:
                    per_speech = speaker_bigram_frequencies.setdefault(
                        bigram, {})
                    per_speech[identity] = indv_speech_bigram[bigram]

        filename_pickle = "" + speaker_name + "bigram_frequencies.pickle"
        with open(filename_pickle, 'wb') as handle:
            pickle.dump(speaker_bigram_frequencies, handle, protocol=0)

        filename_csv = "" + speaker_name + "bigram_frequencies.csv"
        # Close the CSV file explicitly so every row is flushed to disk.
        with open(filename_csv, "w") as handle:
            w = csv.writer(handle)
            for key, val in speaker_bigram_frequencies.items():
                w.writerow([key, val])
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list,
                         speakers_to_analyze, Girondins, Montagnards):
    """Tabulate bigram counts per speech and per date for selected speakers.

    Only speeches dated 1792-09-20 through 1793-06-02 whose speaker appears
    in the index of ``speakers_to_analyze`` (after ``remove_diacritic``) are
    used.  For each bigram of such a speech one row is produced in two
    tables: ``[bigram, speaker, speech id, count, party]`` and
    ``[bigram, speaker, date, count, party]``.

    The date table is passed to ``make_visualizations``; the tables are
    pickled to ``chronology_speechid.pickle`` and ``chronology_date.pickle``.

    ``speaker_list``, ``Girondins`` and ``Montagnards`` are not used.
    """
    # A set gives constant-time membership tests inside the speech loop.
    speakers_to_consider = set(
        remove_diacritic(speaker).decode('utf-8')
        for speaker in speakers_to_analyze.index.values)

    row_entry_speechid = []
    row_entry_date = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        if not ("1792-09-20" <= date <= "1793-06-02"
                and speaker_name in speakers_to_consider):
            continue
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
        for bigram in indv_speech_bigram:
            count = indv_speech_bigram[bigram]
            row_entry_speechid.append(
                [str(bigram), speaker_name, identity, count, party])
            row_entry_date.append(
                [str(bigram), speaker_name, date, count, party])

    chronology_speechid = pd.DataFrame(
        row_entry_speechid,
        columns=["Bigram", "Speaker Name", "Speechid", "Num occurrences",
                 "Party"])
    chronology_date = pd.DataFrame(
        row_entry_date,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences",
                 "Party"])

    make_visualizations(chronology_date)

    pickle_filename_2 = "chronology_speechid.pickle"
    with open(pickle_filename_2, 'wb') as handle:
        pickle.dump(chronology_speechid, handle, protocol=0)
    pickle_filename = "chronology_date.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(chronology_date, handle, protocol=0)
def _tally(counter, key):
    """Add one to ``counter[key]``, counting an unseen key as zero."""
    counter[key] = counter.get(key, 0) + 1


def _dump_count(path, value):
    """Write the integer ``value`` to the text file ``path``."""
    with open(path, 'w') as f:
        f.write('%d' % value)


def _dump_rows(path, mapping):
    """Write ``mapping`` to ``path`` as two-column CSV rows."""
    with open(path, "w") as f:
        w = csv.writer(f)
        for key, val in mapping.items():
            w.writerow([key, val])


def track_murmures_applaudissements(raw_speeches, speechid_to_speaker):
    """Count speeches containing "murmure" or "applaudissement".

    Overall totals cover every speech.  For speakers in the index of
    ``Girondins and Montagnards New Mod Limit.xlsx`` (after
    ``remove_diacritic``) the speeches are additionally counted per party
    (``"Girondins"`` versus everything else), per date and per speaker.

    Totals and party counts go to ``*.txt`` files, per-date counts to pickle
    and CSV files, per-speaker counts to CSV files; the sum of the two party
    counts is printed for each word.

    The date key is the first ten characters of the speech id, and the first
    matching speech for a date or speaker counts as one.
    """
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    speakers_to_consider = set(
        remove_diacritic(speaker).decode('utf-8')
        for speaker in speakers_to_analyze.index.values)

    reactions = ("murmure", "applaudissement")
    total = dict.fromkeys(reactions, 0)
    girondins = dict.fromkeys(reactions, 0)
    montagnards = dict.fromkeys(reactions, 0)
    by_date = {"murmure": {}, "applaudissement": {}}
    by_speaker = {"murmure": {}, "applaudissement": {}}

    for speechid, speech in raw_speeches.items():
        speaker_name = speechid_to_speaker[speechid]
        tracked = speaker_name in speakers_to_consider
        if tracked:
            # Ten characters, the same date slice the date-window checks on
            # speech ids use; nine characters cut off the last day digit.
            date = speechid[0:10]
            party = speakers_to_analyze.loc[speaker_name, "Party"]
        for reaction in reactions:
            if reaction not in speech:
                continue
            total[reaction] += 1
            if not tracked:
                continue
            if party == "Girondins":
                girondins[reaction] += 1
            else:
                montagnards[reaction] += 1
            _tally(by_date[reaction], date)
            _tally(by_speaker[reaction], speaker_name)

    _dump_count('gir_murmures.txt', girondins["murmure"])
    _dump_count('mont_murmures.txt', montagnards["murmure"])
    print(montagnards["murmure"] + girondins["murmure"])
    _dump_count('total_murmures.txt', total["murmure"])
    _dump_count('total_applaudissements.txt', total["applaudissement"])
    _dump_count('gir_applaudissements.txt', girondins["applaudissement"])
    _dump_count('mont_applaudissements.txt', montagnards["applaudissement"])
    print(montagnards["applaudissement"] + girondins["applaudissement"])

    with open('murmures_by_date.pickle', 'wb') as handle:
        pickle.dump(by_date["murmure"], handle, protocol=0)
    with open('applaudissements_by_date.pickle', 'wb') as handle:
        pickle.dump(by_date["applaudissement"], handle, protocol=0)

    _dump_rows("murmures_by_date.csv", by_date["murmure"])
    _dump_rows("applaudissements_by_date.csv", by_date["applaudissement"])
    _dump_rows("murmures_speakers.csv", by_speaker["murmure"])
    _dump_rows("applaudissements_speakers.csv", by_speaker["applaudissement"])
for key, val in errors_per_page.items(): if isinstance(key, str): key = unicode(key, "ascii", errors="ignore") w.writerow([key.encode("utf-8", errors="ignore"), val[0], val[1]]) # Save and output frequency of errors per word per volume store_to_pickle(word_freq_wrong, "word_freq_errors.pickle") w = csv.writer(open("word_freq_errors.csv", "w")) for key, val in word_freq_wrong.items(): w.writerow([key, val]) if __name__ == '__main__': import sys # words = parseEnc() # pickle_filename = "enc_words.pickle" # with open(pickle_filename, 'wb') as handle: # pickle.dump(words, handle, protocol = 0) enc_words = pickle.load(open("enc_words.pickle", "rb")) stopwords_from_file = open('FrenchStopwords.txt', 'r') lines = stopwords_from_file.readlines() french_stopwords = [] for line in lines: word = line.split(',') #remove returns and new lines at the end of stop words so the parser catches matches #also remove accents so the entire analysis is done without accents word_to_append = remove_diacritic( unicode(word[0].replace("\n", "").replace("\r", ""), 'utf-8')) french_stopwords.append(word_to_append) checkErrors(enc_words, french_stopwords)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    """Split each speaker's speeches into training and test n-gram data.

    For every name in the index of ``speakers_to_analyze`` (after
    ``remove_diacritic``), the speeches dated 1792-09-20 through 1793-06-02,
    attributed to that name and at least 100 characters long are processed
    in iteration order: the speaker's 1st, 5th, 9th, ... such speech goes to
    the test set and the rest to the training set.  Unigram and bigram
    counts per speech, total frequencies and (training only) document
    frequencies are accumulated through ``augment`` and saved with
    ``store_to_pickle``.
    """
    # Accent-free names, so they match raw_speeches and speechid_to_speaker.
    speakers_to_consider = [
        remove_diacritic(name).decode('utf-8')
        for name in speakers_to_analyze.index.values
    ]

    # Containers exported to the classification script.
    train_total_freq_unigram, test_total_freq_unigram = {}, {}
    train_total_freq_bigram, test_total_freq_bigram = {}, {}
    train_number_speeches = 0
    test_number_speeches = 0  # counted but not stored below

    # Per-speech n-gram counts, keyed by speech id.
    train_speeches_bigram = collections.defaultdict(dict)
    test_speeches_bigram = collections.defaultdict(dict)
    train_speeches_unigram = collections.defaultdict(dict)
    test_speeches_unigram = collections.defaultdict(dict)

    bigram_doc_freq = collections.defaultdict()
    unigram_doc_freq = collections.defaultdict()

    for speaker_name in speakers_to_consider:
        print(speaker_name)
        # The value is unused; the lookup still raises for a name that is
        # missing from the index.
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        # Number of this speaker's speeches assigned so far.
        speech_num = 0
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            # Only speeches inside the date frame from the current speaker.
            if not ((date >= "1792-09-20") and (date <= "1793-06-02")
                    and (speaker_name == speechid_to_speaker[identity])):
                continue
            text = raw_speeches[identity]
            # Only speeches with substance: at least 100 characters.
            if len(text) < 100:
                continue
            bigram_counts = compute_ngrams(text, 2)
            unigram_counts = compute_ngrams(text, 1)

            # One in four of each speaker's speeches goes to the test set.
            if speech_num % 4 == 0:
                test_number_speeches += 1
                for bigram in bigram_counts:
                    augment(test_total_freq_bigram, bigram)
                for unigram in unigram_counts:
                    augment(test_total_freq_unigram, unigram)
                test_speeches_bigram[identity] = bigram_counts
                test_speeches_unigram[identity] = unigram_counts
            else:
                train_number_speeches += 1
                for bigram in bigram_counts:
                    augment(bigram_doc_freq, bigram)
                    augment(train_total_freq_bigram, bigram)
                for unigram in unigram_counts:
                    augment(unigram_doc_freq, unigram)
                    augment(train_total_freq_unigram, unigram)
                train_speeches_bigram[identity] = bigram_counts
                train_speeches_unigram[identity] = unigram_counts
            speech_num += 1

    # Persist everything the classification step reads.
    store_to_pickle(speakers_to_analyze, "speakers_to_analyze.pickle")

    # Drop the local references to the large inputs to conserve memory.
    speakers_to_analyze = None
    speechid_to_speaker = None
    raw_speeches = None

    for payload, target in (
            (train_speeches_bigram, "train_speeches_bigram.pickle"),
            (train_speeches_unigram, "train_speeches_unigram.pickle"),
            (train_total_freq_bigram, "train_total_freq_bigram.pickle"),
            (train_total_freq_unigram, "train_total_freq_unigram.pickle"),
            (bigram_doc_freq, "bigram_doc_freq.pickle"),
            (unigram_doc_freq, "unigram_doc_freq.pickle"),
            (train_number_speeches, "train_number_speeches.pickle"),
            (test_speeches_bigram, "test_speeches_bigram.pickle"),
            (test_speeches_unigram, "test_speeches_unigram.pickle"),
            (test_total_freq_bigram, "test_total_freq_bigram.pickle"),
            (test_total_freq_unigram, "test_total_freq_unigram.pickle")):
        store_to_pickle(payload, target)
def _served_in_window(start, end, skip_if_missing):
    """Tell whether a term touches or spans the years 1792 or 1793.

    True when ``start`` or ``end`` equals 1792 or 1793, or when
    ``start <= year <= end`` for either year.  With ``skip_if_missing`` the
    span test is only made when both ``start`` and ``end`` are truthy.
    """
    years = (1792.0, 1793.0)
    if start in years or end in years:
        return True
    if skip_if_missing and not (start and end):
        return False
    return any(start <= year <= end for year in years)


def read_names(name_file):
    """Load the speaker spreadsheet and keep deputies active in 1792-1793.

    Reads ``name_file`` with ``pandas.read_excel``, indexes it by
    ``Last Name`` and replaces the index and the ``Full Name`` column with
    their ``remove_diacritic``-processed, lower-cased forms.  A row is kept
    when any of its three (start, end) term column pairs touches or spans
    1792 or 1793.  The filtered frame is pickled to ``dated_names.pickle``.

    Returns the filtered DataFrame.
    """
    pd_list = pd.read_excel(name_file).set_index('Last Name')
    # One pass per column instead of a list.index() lookup per name.
    pd_list.index = [
        remove_diacritic(speaker).decode('utf-8').lower()
        for speaker in pd_list.index.tolist()
    ]
    pd_list["Full Name"] = [
        remove_diacritic(full_name).decode('utf-8').lower()
        for full_name in pd_list["Full Name"]
    ]

    # (start values, end values, skip span test when a value is falsy).
    # Only the second and third terms carry that guard.
    terms = [
        (pd_list["Depute de"].values, pd_list["Depute a"].values, False),
        (pd_list["Depute puis de 2"].values, pd_list["Depute a 2"].values,
         True),
        (pd_list["Depute puis de 3"].values, pd_list["Depute a 3"].values,
         True),
    ]
    # Positions of speakers whose dates fall in the Girondins/Montagnards
    # time frame.
    speakers_to_keep = [
        j for j in range(len(pd_list.index))
        if any(_served_in_window(starts[j], ends[j], guarded)
               for starts, ends, guarded in terms)
    ]
    pd_list = pd_list.iloc[speakers_to_keep]

    pickle_filename = "dated_names.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(pd_list, handle, protocol=0)
    return pd_list
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards, Plein):
    """Aggregate bigram counts per party and write the "withlimit" outputs.

    For every speaker in ``speakers_to_analyze`` and every speech dated
    between 1792-09-20 and 1793-06-02 attributed to that speaker, the speech's
    bigrams are computed and added to the speaker's party total. Any party
    other than ``"Girondins"`` is counted as Montagnards. Bookkeeping
    (speeches and speakers per bigram, document frequencies, per-speaker
    speech and character counts) is written to pickle, CSV, text and Excel
    files in the current directory.

    Args:
        speakers_to_analyze: Frame indexed by speaker name with a ``"Party"``
            column (read through ``.index.values`` and ``.loc``).
        raw_speeches: Mapping of speech id to speech text.
        speechid_to_speaker: Mapping of speech id to speaker name.
        Girondins: Running bigram counts for the Girondins; combined with
            ``+`` and indexed by bigram. Presumably a ``collections.Counter``;
            verify against the caller.
        Montagnards: Same as ``Girondins`` for the Montagnards.
        Plein: Unused here; kept so the signature matches existing callers.

    Returns:
        None. All results are written to disk.
    """

    def _dump(obj, path):
        # Pickle one object with the text protocol used throughout the file.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle, protocol=0)

    def _write_rows(path, rows):
        # Write rows to a CSV file and close the handle afterwards.
        with open(path, "w") as out:
            writer = csv.writer(out)
            for row in rows:
                writer.writerow(row)

    speaker_num_speeches = {}
    speaker_char_count = {}
    bigrams_to_speeches = {}
    bigrams_to_speakers = {}
    bigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}

    speakers_to_consider = [
        remove_diacritic(speaker).decode('utf-8')
        for speaker in speakers_to_analyze.index.values
    ]

    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if not ((date >= "1792-09-20") and (date <= "1793-06-02")
                    and (speaker_name == speechid_to_speaker[identity])):
                continue

            # Keeps track of the number of speeches per speaker as well as
            # the number of characters spoken by each speaker, to
            # potentially establish a cutoff for analysis purposes.
            augment(speaker_num_speeches, speaker_name)
            speaker_char_count[speaker_name] = (
                speaker_char_count.get(speaker_name, 0)
                + len(raw_speeches[identity]))

            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

            for bigram in indv_speech_bigram:
                augment(bigram_doc_freq, bigram)
                # Speeches and speakers in which each bigram appears.
                bigrams_to_speeches.setdefault(bigram, []).append(identity)
                bigrams_to_speakers.setdefault(bigram, set()).add(speaker_name)

            # Augments the relevant variables according to the party the
            # speaker belongs to. The totals are parameters and therefore
            # always bound, so the old ``except NameError`` fallback could
            # never fire and has been removed.
            if party == "Girondins":
                gir_num_speeches += 1
                gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
                Girondins = Girondins + indv_speech_bigram
            else:
                mont_num_speeches += 1
                mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
                Montagnards = Montagnards + indv_speech_bigram

    # Every speech processed above incremented exactly one of the two party
    # counters, so their sum is the number of speeches analysed. The name
    # was previously never assigned and every use raised NameError.
    num_speeches = gir_num_speeches + mont_num_speeches

    # Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient="index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')

    _dump(bigrams_to_speakers, "bigrams_to_speakers.pickle")
    _dump(bigrams_to_speeches, "bigrams_to_speeches.pickle")
    _dump(gir_docs, "gir_docs.pickle")
    _dump(mont_docs, "mont_docs.pickle")

    # Per-bigram summary, restricted to bigrams said at least 10 times by
    # one of the two parties.
    rows = []
    for bigram in bigrams_to_speeches:
        if (Girondins[bigram] >= 10) or (Montagnards[bigram] >= 10):
            rows.append((
                str(bigram),
                str(bigrams_to_speeches[bigram]),
                str(bigrams_to_speakers[bigram]),
                len(bigrams_to_speeches[bigram]),
                len(bigrams_to_speakers[bigram]),
                Girondins[bigram] + Montagnards[bigram],
            ))
    bigram_info = pd.DataFrame(rows, columns=[
        "Bigram", "Speechids", "Speakers",
        "Num Speeches", "Num Speakers", "Total count",
    ])
    writer = pd.ExcelWriter("bigram_info.xlsx")
    bigram_info.to_excel(writer, 'Sheet1')
    writer.save()

    _write_rows("bigrams_to_speeches_noplein.csv",
                ([key, val] for key, val in bigrams_to_speeches.items()))

    bigrams_to_speakers_noplein_sorted = sorted(
        bigrams_to_speakers.items(), key=lambda x: len(x[1]), reverse=True)
    _write_rows("bigrams_to_speakers_noplein_sorted.csv",
                ([item[0], item[1]] for item in bigrams_to_speakers_noplein_sorted))

    with open('gir_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)
    print(num_speeches)

    _dump(speaker_num_speeches, 'speaker_num_speeches_withlimit.pickle')
    # Previously this file received speaker_num_speeches a second time.
    _dump(speaker_char_count, 'speaker_char_count_withlimit.pickle')

    _write_rows("speaker_num_speeches_withlimit.csv",
                ([key, val] for key, val in speaker_num_speeches.items()))
    _write_rows("speaker_char_count_withlimit.csv",
                ([key, val] for key, val in speaker_char_count.items()))

    # Write the number of speeches and doc_frequency to memory for use in
    # further analysis
    with open('num_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')
    _dump(bigram_doc_freq, "bigram_doc_freq_noplein_withlimit.pickle")

    # Constrains the analysis of Girondins and Montagnards frequencies to
    # bigrams with a count of at least 10.
    Girondins = {k: v for k, v in Girondins.items() if (v >= 10)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withlimit.xlsx")

    Montagnards = {k: v for k, v in Montagnards.items() if (v >= 10)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withlimit.xlsx")

    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # Stores the Girondins and Montagnards frequency vectors in the same
    # document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_withlimit.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    """Split each speaker's speeches into train/test n-gram sets and pickle them.

    Speeches dated between 1792-09-20 and 1793-06-02, attributed to a speaker
    in ``speakers_to_analyze`` and at least 100 characters long are turned
    into unigram and bigram collections. For each speaker, speeches number
    0, 4, 8, ... (counting only qualifying speeches) go to the test set and
    the rest to the training set. Document frequencies are accumulated from
    training speeches only.

    Args:
        speakers_to_analyze: Frame indexed by speaker name with a ``"Party"``
            column.
        raw_speeches: Mapping of speech id to speech text.
        speechid_to_speaker: Mapping of speech id to speaker name.

    Returns:
        None. The results are pickled into the current directory.
    """
    # Totals and per-speech n-grams exported for the classification script.
    train_total_freq_unigram = {}
    test_total_freq_unigram = {}
    train_total_freq_bigram = {}
    test_total_freq_bigram = {}
    train_number_speeches = 0
    test_number_speeches = 0

    train_speeches_bigram = collections.defaultdict(dict)
    test_speeches_bigram = collections.defaultdict(dict)
    train_speeches_unigram = collections.defaultdict(dict)
    test_speeches_unigram = collections.defaultdict(dict)

    bigram_doc_freq = collections.defaultdict()
    unigram_doc_freq = collections.defaultdict()

    normalized_names = [
        remove_diacritic(raw_name).decode('utf-8')
        for raw_name in speakers_to_analyze.index.values
    ]

    for speaker_name in normalized_names:
        print(speaker_name)
        # The value is not used below; the lookup is kept so an unknown
        # speaker still fails here exactly as before.
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech_num = 0
        for speech_id in raw_speeches:
            date = re.findall(date_regex, str(speech_id))[0]
            in_window = (date >= "1792-09-20") and (date <= "1793-06-02")
            if not (in_window and (speaker_name == speechid_to_speaker[speech_id])):
                continue
            # Only looking at speeches with substance, so at least 100
            # characters
            if not (len(raw_speeches[speech_id]) >= 100):
                continue

            bigrams = compute_ngrams(raw_speeches[speech_id], 2)
            unigrams = compute_ngrams(raw_speeches[speech_id], 1)

            # Every fourth qualifying speech of a speaker is held out.
            if speech_num % 4 != 0:
                train_number_speeches += 1
                for bigram in bigrams:
                    augment(bigram_doc_freq, bigram)
                    augment(train_total_freq_bigram, bigram)
                for unigram in unigrams:
                    augment(unigram_doc_freq, unigram)
                    augment(train_total_freq_unigram, unigram)
                train_speeches_bigram[speech_id] = bigrams
                train_speeches_unigram[speech_id] = unigrams
            else:
                test_number_speeches += 1
                for bigram in bigrams:
                    augment(test_total_freq_bigram, bigram)
                for unigram in unigrams:
                    augment(test_total_freq_unigram, unigram)
                test_speeches_bigram[speech_id] = bigrams
                test_speeches_unigram[speech_id] = unigrams

            speech_num += 1

    # Write all relevant data objects and values to memory to use when
    # running classification. The inputs are dropped as soon as they have
    # been written.
    with open("speechid_to_speaker_store.pickle", 'wb') as handle:
        pickle.dump(speechid_to_speaker, handle, protocol=0)
    speechid_to_speaker = None
    with open("speakers_to_analyze_store.pickle", 'wb') as handle:
        pickle.dump(speakers_to_analyze, handle, protocol=0)
    speakers_to_analyze = None
    raw_speeches = None

    outputs = (
        ("train_speeches_bigram.pickle", train_speeches_bigram),
        ("train_speeches_unigram.pickle", train_speeches_unigram),
        ("train_total_freq_bigram.pickle", train_total_freq_bigram),
        ("train_total_freq_unigram.pickle", train_total_freq_unigram),
        ("bigram_doc_freq.pickle", bigram_doc_freq),
        ("unigram_doc_freq.pickle", unigram_doc_freq),
        ("train_number_speeches.pickle", train_number_speeches),
        ("test_speeches_bigram.pickle", test_speeches_bigram),
        ("test_speeches_unigram.pickle", test_speeches_unigram),
        ("test_total_freq_bigram.pickle", test_total_freq_bigram),
        ("test_total_freq_unigram.pickle", test_total_freq_unigram),
    )
    for path, payload in outputs:
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle, protocol=0)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards, Plein):
    """Aggregate bigram counts for Girondins, Montagnards and everyone else.

    Every speech dated between 1792-09-20 and 1793-06-02 is assigned to the
    party of its speaker when the speaker is in ``speakers_to_analyze``, and
    to the catch-all ``"Plein"`` group otherwise. Bigram counts, document
    frequencies, per-speaker totals and tf-idf vectors are written to
    pickle, CSV, text and Excel files in the current directory.

    Args:
        speakers_to_analyze: Frame indexed by speaker name with a ``"Party"``
            column.
        raw_speeches: Mapping of speech id to speech text.
        speechid_to_speaker: Mapping of speech id to speaker name.
        Girondins: Running bigram counts for the Girondins; combined with
            ``+`` and later iterated with ``.items()``. Presumably a
            ``collections.Counter``; verify against the caller.
        Montagnards: Same as ``Girondins`` for the Montagnards.
        Plein: Same as ``Girondins`` for speakers in neither group.

    Returns:
        None. All results are written to disk.
    """

    def _dump(obj, path):
        # Pickle one object with the text protocol used throughout the file.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle, protocol=0)

    def _write_rows(path, rows):
        # Write rows to a CSV file and close the handle afterwards.
        with open(path, "w") as out:
            writer = csv.writer(out)
            for row in rows:
                writer.writerow(row)

    speaker_num_speeches = {}
    speaker_char_count = {}
    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0
    mont_num_speeches = 0
    plein_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    # A set gives constant-time membership tests; the list was scanned once
    # per speech.
    speakers_to_consider = set(
        remove_diacritic(speaker).decode('utf-8')
        for speaker in speakers_to_analyze.index.values
    )

    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        if not ((date >= "1792-09-20") and (date <= "1793-06-02")):
            continue

        speaker_name = speechid_to_speaker[identity]
        if speaker_name in speakers_to_consider:
            party = speakers_to_analyze.loc[speaker_name, "Party"]
        else:
            party = "Plein"

        # Keeps track of the number of speeches per speaker as well as the
        # number of characters spoken by each speaker, to potentially
        # establish a cutoff for analysis purposes.
        augment(speaker_num_speeches, speaker_name)
        speaker_char_count[speaker_name] = (
            speaker_char_count.get(speaker_name, 0)
            + len(raw_speeches[identity]))

        indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

        for bigram in indv_speech_bigram:
            augment(bigram_doc_freq, bigram)
            # Maintains a list of speeches in which given bigrams are
            # spoken in
            if bigram in bigrams_to_speeches:
                bigrams_to_speeches[bigram].append(identity)
            else:
                bigrams_to_speeches[bigram] = [identity]

        # Augments the relevant variables according to the party the speaker
        # belongs to. The totals are parameters and therefore always bound,
        # so the old ``except NameError`` fallbacks could never fire.
        if party == "Girondins":
            gir_num_speeches += 1
            gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
            Girondins = Girondins + indv_speech_bigram
        elif party == "Montagnards":
            mont_num_speeches += 1
            mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
            Montagnards = Montagnards + indv_speech_bigram
        else:
            # Plein is neither Girondins nor Montagnards, to better
            # understand speakers that are not distinctly one or the other.
            plein_num_speeches += 1
            plein_docs = check_num_speakers(indv_speech_bigram, speaker_name, plein_docs)
            Plein = Plein + indv_speech_bigram

    num_speeches = gir_num_speeches + mont_num_speeches + plein_num_speeches
    print(num_speeches)

    _dump(speaker_num_speeches, 'speaker_num_speeches_withplein.pickle')
    # Previously this file received speaker_num_speeches a second time.
    _dump(speaker_char_count, 'speaker_char_count_withplein.pickle')

    _write_rows("speaker_num_speeches_withplein.csv",
                ([key, val] for key, val in speaker_num_speeches.items()))
    _write_rows("speaker_char_count_withplein.csv",
                ([key, val] for key, val in speaker_char_count.items()))

    # Write the number of speeches and doc_frequency to memory for use in
    # further analysis
    with open('num_speeches_withplein.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    _dump(bigram_doc_freq, "bigram_doc_freq_withplein.pickle")
    _dump(Girondins, "Girondins_withplein.pickle")
    _dump(Montagnards, "Montagnards_withplein.pickle")
    _dump(Plein, "Plein.pickle")

    # Computes the tf_idf scores for each bigram for the three groups.
    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)
    plein_tfidf = compute_tfidf(Plein, num_speeches, bigram_doc_freq)

    # Stores the tf_idf vectors
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    write_to_excel(df_gir_tfidf, 'gir_tfidf_withplein.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    write_to_excel(df_mont_tfidf, 'mont_tfidf_withplein.xlsx')
    df_plein_tfidf = pd.DataFrame.from_dict(plein_tfidf, orient="index")
    write_to_excel(df_plein_tfidf, 'plein_tfidf.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withplein.xlsx')

    # Constrains the analysis of Girondins and Montagnards frequencies to
    # bigrams with a count of at least 3.
    Girondins = {k: v for k, v in Girondins.items() if (v >= 3)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withplein.xlsx")

    Montagnards = {k: v for k, v in Montagnards.items() if (v >= 3)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withplein.xlsx")

    # Stores the Girondins and Montagnards frequency vectors in the same
    # document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency.xlsx')
def build_vectors(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, gir_tfidf, mont_tfidf, num_speeches, doc_freq):
    """Score every analysed speaker against the two party tf-idf vectors.

    Bigrams of all speeches dated between 1792-09-20 and 1793-06-02 are
    summed per speaker (for speakers in ``speakers_to_analyze`` only). Each
    speaker's tf-idf vector is then compared with ``cosine_similarity``
    against the Girondins vector, the Montagnards vector, and the vector
    returned by ``compute_difference`` for the two.

    Args:
        raw_speeches: Mapping of speech id to speech text.
        speechid_to_speaker: Mapping of speech id to speaker name.
        speaker_list: Unused; kept so existing callers keep working.
        speakers_to_analyze: Frame whose index holds the speaker names to
            keep.
        gir_tfidf: tf-idf vector of the Girondins, keyed by bigram.
        mont_tfidf: tf-idf vector of the Montagnards, keyed by bigram.
        num_speeches: Speech count passed through to ``compute_tfidf``.
        doc_freq: Document frequency per bigram.

    Returns:
        None. Writes ``speaker_ngrams.pickle``, ``freq_dist_map.xlsx`` and
        ``freq_dist.pickle`` to the current directory.
    """
    speaker_ngrams = {}
    speaker_distances = collections.defaultdict()

    # A set gives constant-time membership tests; the list was scanned once
    # per speech.
    speakers_to_consider = set(
        remove_diacritic(speaker).decode('utf-8')
        for speaker in speakers_to_analyze.index.values
    )

    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name in speakers_to_consider):
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            if speaker_name in speaker_ngrams:
                speaker_ngrams[speaker_name] = speaker_ngrams[speaker_name] + indv_speech_bigram
            else:
                speaker_ngrams[speaker_name] = indv_speech_bigram

    # The helpers below work on string keys, so all vectors are converted
    # before being compared.
    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    doc_freq_dict = convert_keys_to_string(doc_freq)
    gir_mont_diff = compute_difference(gir_dict, mont_dict)

    for speaker in speaker_ngrams:
        speaker_dict = convert_keys_to_string(speaker_ngrams[speaker])
        to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq_dict)
        gir_dist = cosine_similarity(gir_dict, to_compare)
        mont_dist = cosine_similarity(mont_dict, to_compare)
        gir_mont_diff_dist = cosine_similarity(gir_mont_diff, to_compare)
        speaker_distances[speaker] = [gir_dist, mont_dist, gir_mont_diff_dist]

    pickle_filename_3 = "speaker_ngrams.pickle"
    with open(pickle_filename_3, 'wb') as handle:
        pickle.dump(speaker_ngrams, handle, protocol=0)

    df = pd.DataFrame.from_dict(speaker_distances)
    df = df.transpose()
    df.columns = ["dist to Girondins", "dist to Montagnards", "dist to difference"]
    filename = "freq_dist_map.xlsx"
    writer = pd.ExcelWriter(filename)
    df.to_excel(writer, 'Sheet1')
    writer.save()

    pickle_filename = "freq_dist.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(speaker_distances, handle, protocol=0)

    # NOTE(review): the function used to end in an unterminated
    # triple-quoted string that began an export of a per-speaker
    # "chronology" table. That table was never filled, so the fragment was
    # removed rather than completed.
def checkErrors(enc_words, french_stopwords):
    """Count words per page and per volume that are missing from ``enc_words``.

    Every ``.xml`` file in ``AP_ARTFL_vols/`` is cut at each ``<pb n=``
    marker. The text between two consecutive markers is cleaned (stopwords,
    punctuation and digits removed) and split on spaces; each word that is
    not in ``enc_words`` counts as an error.

    Args:
        enc_words: Container of accepted words, tested with ``in``.
        french_stopwords: Stopwords forwarded to ``remove_stopwords``.

    Returns:
        None. Writes ``errors_per_vol``, ``errors_per_page`` and
        ``word_freq_errors`` as both ``.pickle`` and ``.csv`` files in the
        current directory.
    """
    errors_per_vol = {}
    errors_per_page = {}
    word_freq_wrong = {}

    for filename in os.listdir("AP_ARTFL_vols/"):
        if not filename.endswith(".xml"):
            continue

        with open('AP_ARTFL_vols/' + filename, "r") as volume:
            # The volume number is still extracted from the file object's
            # string form, so vol_regex matches the same text as before.
            volno = re.findall(vol_regex, str(volume))[0]
            print(volno)
            contents = volume.read()

        num_errors = 0
        num_words_vol = 0
        word_freq = {}

        # Iterate through contents and find all page tags
        pb_tags = []
        last_index = 0
        while True:
            loc = contents.find("<pb n=", last_index)
            if loc == -1:
                break
            pb_tags.append(loc)
            last_index = loc + 1

        # Iterates through all page tags and looks through the contents on
        # each page, checking each word against the accepted words.
        # NOTE(review): the text after the last page tag is never examined;
        # confirm that skipping the final page is intended.
        for page_start, page_end in zip(pb_tags, pb_tags[1:]):
            contents_substr = contents[page_start:page_end]
            page_num = BeautifulSoup(contents_substr, 'lxml').find_all('pb')
            pageno = volno + "_pg" + page_num[0].get("n")
            error_per_page = 0

            text = unicode(contents_substr, "ascii", errors="ignore")
            text = remove_diacritic(text).decode('utf-8')
            paragraph = remove_stopwords(text, french_stopwords)
            paragraph = paragraph.replace("\n", " ").replace(
                ")", "").replace("*", "").replace(":", "").replace(
                "-", "").replace("_", "").replace("(", "").replace(
                "& ", "").replace("; ", "").replace(".", "").replace(
                ",", "").replace("?", "").replace("!", "")
            paragraph = re.sub(r'([0-9]{1,4})', ' ', paragraph)
            words = paragraph.split(" ")
            num_words_pg = len(words)
            num_words_vol += num_words_pg

            for word in words:
                if word not in enc_words:
                    word_freq[word] = word_freq.get(word, 0) + 1
                    error_per_page += 1
                    num_errors += 1

            errors_per_page[pageno] = [error_per_page, num_words_pg]

        word_freq_wrong[volno] = sorted(word_freq.items(), key=lambda kv: kv[1])
        errors_per_vol[volno] = [num_errors, num_words_vol]

    # Save and output errors per volume
    store_to_pickle(errors_per_vol, "errors_per_vol.pickle")
    with open("errors_per_vol.csv", "w") as out:
        w = csv.writer(out)
        for key, val in errors_per_vol.items():
            if isinstance(key, str):
                key = unicode(key, "ascii", errors="ignore")
            w.writerow([key, val[0], val[1]])

    # Save and output errors per page. The dict is pickled; previously the
    # integer counter of the last page processed was written instead.
    store_to_pickle(errors_per_page, "errors_per_page.pickle")
    with open("errors_per_page.csv", "w") as out:
        w = csv.writer(out)
        for key, val in errors_per_page.items():
            if isinstance(key, str):
                key = unicode(key, "ascii", errors="ignore")
            w.writerow([key.encode("utf-8", errors="ignore"), val[0], val[1]])

    # Save and output frequency of errors per word per volume
    store_to_pickle(word_freq_wrong, "word_freq_errors.pickle")
    with open("word_freq_errors.csv", "w") as out:
        w = csv.writer(out)
        for key, val in word_freq_wrong.items():
            w.writerow([key, val])
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards):
    """Aggregate bigram counts per party and store them for later analysis.

    For every speaker in ``speakers_to_analyze`` and every speech dated
    between 1792-09-20 and 1793-06-02 attributed to that speaker, the
    speech's bigrams are added to the party total (any party other than
    ``"Girondins"`` counts as Montagnards). Per-bigram and per-speaker
    bookkeeping is collected along the way and written out with the file's
    ``store_to_pickle``, ``write_to_excel`` and ``write_to_csv`` helpers.

    Args:
        speakers_to_analyze: Frame indexed by speaker name with a ``"Party"``
            column.
        raw_speeches: Mapping of speech id to speech text.
        speechid_to_speaker: Mapping of speech id to speaker name.
        Girondins: Running bigram counts for the Girondins, combined with
            ``+``.
        Montagnards: Running bigram counts for the Montagnards, combined
            with ``+``.

    Returns:
        None. All results are written to disk.
    """
    speaker_num_speeches = {}
    speaker_char_count = {}

    # Accent-free speaker names, so they match the names used in
    # raw_speeches and speechid_to_speaker.
    speakers_to_consider = [
        remove_diacritic(raw_name).decode('utf-8')
        for raw_name in speakers_to_analyze.index.values
    ]

    # Speeches and speakers in which each bigram occurs.
    bigrams_to_speeches = {}
    bigrams_to_speakers = {}

    # Number of documents a given bigram is spoken in, for use with tf-idf.
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}

    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        for speech_id in raw_speeches:
            date = re.findall(date_regex, str(speech_id))[0]
            wanted = ((date >= "1792-09-20") and (date <= "1793-06-02")
                      and (speaker_name == speechid_to_speaker[speech_id]))
            if not wanted:
                continue

            # Speech and character totals per speaker, to potentially
            # establish a cutoff for analysis purposes.
            augment(speaker_num_speeches, speaker_name)
            spoken = len(raw_speeches[speech_id])
            if speaker_name in speaker_char_count:
                speaker_char_count[speaker_name] += spoken
            else:
                speaker_char_count[speaker_name] = spoken

            speech_bigrams = compute_ngrams(raw_speeches[speech_id], 2)

            for bigram in speech_bigrams:
                augment(bigram_doc_freq, bigram)
                bigrams_to_speeches.setdefault(bigram, []).append(speech_id)
                bigrams_to_speakers.setdefault(bigram, set()).add(speaker_name)

            # Add the speech to the totals of the speaker's party.
            if party == "Girondins":
                gir_num_speeches += 1
                gir_docs = check_num_speakers(speech_bigrams, speaker_name, gir_docs)
                Girondins = Girondins + speech_bigrams
            else:
                mont_num_speeches += 1
                mont_docs = check_num_speakers(speech_bigrams, speaker_name, mont_docs)
                Montagnards = Montagnards + speech_bigrams

    # Store raw counts
    store_to_pickle(Girondins, "Girondins.pickle")
    store_to_pickle(Montagnards, "Montagnards.pickle")

    # Store in memory aggregate information about each bigram
    bigram_aggregate_info(Girondins, Montagnards, bigrams_to_speakers, bigrams_to_speeches)

    # Fixed speech count handed to the tf-idf computation and written to
    # num_speeches.txt below.
    num_speeches = 4479

    # Computes counts and tfidf scores for each party and outputs them for
    # further analysis in R
    counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq)

    """ EVERYTHING BELOW IS STORING DATA TO MEMORY """

    # Excel exports of the per-bigram tables.
    for table, workbook in (
            (bigrams_to_speeches, 'bigrams_to_speeches.xlsx'),
            (bigrams_to_speakers, 'bigrams_to_speakers.xlsx'),
            (bigram_doc_freq, 'doc_freq.xlsx')):
        write_to_excel(pd.DataFrame.from_dict(table, orient="index"), workbook)

    # Pickled copies of every intermediate structure.
    for payload, target in (
            (bigrams_to_speakers, "bigrams_to_speakers.pickle"),
            (bigrams_to_speeches, "bigrams_to_speeches.pickle"),
            (gir_docs, "gir_docs.pickle"),
            (mont_docs, "mont_docs.pickle"),
            (speaker_num_speeches, "speaker_num_speeches.pickle"),
            (speaker_char_count, "speaker_char_count.pickle"),
            (bigram_doc_freq, "bigram_doc_freq.pickle")):
        store_to_pickle(payload, target)

    with open('gir_speeches.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)

    write_to_csv(speaker_num_speeches, "speaker_num_speeches.csv")
    write_to_csv(speaker_char_count, "speaker_char_count.csv")

    with open('num_speeches.txt', 'w') as f:
        f.write('%d' % num_speeches)
def findSpeeches(raw_speeches, daily_soup, date, volno):
    """Collect the attributable speeches of one parsed daily transcript.

    For every ``<sp>`` element in ``daily_soup`` the ``<speaker>`` text is
    normalized, the ``<p>`` paragraphs are joined into one cleaned string and
    the speaker is looked up in ``speaker_list``. Matched speeches are stored
    in ``raw_speeches``; unmatched speaker strings go to ``names_not_caught``.

    Besides ``raw_speeches`` this updates several names that are not defined
    in this function and must exist in the enclosing module:
    ``speakers_using_find``, ``speaker_num_total_speeches``,
    ``speaker_num_total_chars``, ``speakers_per_session``, ``speakers``,
    ``speechid_to_speaker``, ``names_not_caught`` and ``speeches_per_day``.

    Args:
        raw_speeches: mapping filled in place, speech id -> cleaned speech.
        daily_soup: parsed document; only ``find_all('sp')`` is called on it.
        date: date string; "/" is replaced by "_" to build the id prefix.
        volno: volume identifier, used only in the diagnostic strings.

    Returns:
        None. All results are written to the mappings and sets named above.
    """
    id_base = date.replace("/", "_")
    number_of_speeches = 0
    # Removed in this exact order: "." and ":" first, then the honorifics.
    strip_tokens = (".", ":", "MM ", "MM. ", "M ", "de ", "M. ", "M, ", "M- ",
                    "M; ", "M* ")

    for talk in daily_soup.find_all('sp'):
        # Extract the speaker name and edit it for easier pairing with the
        # Excel file.
        try:
            speaker = talk.find('speaker').get_text()
            speaker = remove_diacritic(speaker).decode('utf-8')
            for token in strip_tokens:
                speaker = speaker.replace(token, "")
            if speaker.endswith(","):
                speaker = speaker[:-1]
            if speaker.endswith(", "):
                speaker = speaker[:-1]
            if speaker.startswith(' M. '):
                speaker = speaker[3:]
            if speaker.startswith(' '):
                speaker = speaker[1:]
            if speaker.endswith(' '):
                speaker = speaker[:-1]
        except AttributeError:
            # Raised e.g. when the lookup of 'speaker' gives nothing to call
            # get_text() on; such speeches get an empty speaker string.
            speaker = ""

        # Piece together the full speech if it spans several paragraph tags.
        text = "".join(section.get_text() for section in talk.find_all('p'))
        full_speech = remove_diacritic(text).decode('utf-8')
        full_speech = full_speech.replace("\n", " ").replace("--", " ").replace("!", " ")
        full_speech = re.sub(r'([ ]{2,})', ' ', full_speech)
        full_speech = re.sub(r'([0-9]{1,4})', ' ', full_speech)

        # Full speaker name as found in the Excel file; "" means "no match".
        speaker_name = ""

        # Only look at speeches not from the president.
        if speaker != "Le President":
            if speaker in speaker_list.index.values:
                for j, name in enumerate(speaker_list.index.values):
                    if speaker == name:
                        speaker_name = speaker_list["FullName"].iloc[j]
            elif "," not in speaker and " et " not in speaker:
                # Not a list of speakers: check whether a known last name is
                # embedded in the speaker string. The test above does not
                # depend on the row, so it is evaluated once, not per row.
                for i, name in enumerate(speaker_list['LastName']):
                    if name in speaker:
                        speaker_name = speaker_list["FullName"].iloc[i]
                        # Recorded for a manual check that no name was
                        # mischaracterized by the substring match.
                        speakers_using_find.add(
                            speaker + " : "
                            + remove_diacritic(speaker_name).decode('utf-8')
                            + "; " + str(volno) + "; " + str(date) + "\n")

        # Compare by value: `is not ""` tests object identity and treats an
        # empty name held in a different string object as a match.
        if speaker_name != "":
            speaker_name = remove_diacritic(speaker_name).decode('utf-8')
            number_of_speeches += 1
            speaker_num_total_speeches[speaker_name] = (
                speaker_num_total_speeches.get(speaker_name, 0) + 1)
            speaker_num_total_chars[speaker_name] = (
                speaker_num_total_chars.get(speaker_name, 0) + len(full_speech))
            speakers_per_session.setdefault(id_base, set()).add(speaker_name)
            speakers.add(speaker_name)

            # Creates the unique speech id.
            speech_id = id_base + "_" + str(number_of_speeches)
            speechid_to_speaker[speech_id] = speaker_name
            raw_speeches[speech_id] = full_speech
        else:
            names_not_caught.add(speaker + "; " + str(volno) + "; " + str(date) + "\n")

    speeches_per_day[id_base] = number_of_speeches
def findSpeeches(raw_speeches, multiple_speakers, daily_soup, date, volno):
    """Collect the attributable speeches of one parsed daily transcript.

    For every ``<sp>`` element in ``daily_soup`` the ``<speaker>`` text is
    normalized, ``<note>`` elements are removed, the ``<p>`` paragraphs are
    joined into one cleaned string and the speaker is classified as the
    president, a multi-speaker label, a name matched in ``speaker_list`` or
    unmatched. Only matched names produce an entry in ``raw_speeches``.

    Besides its arguments this updates several names that are not defined in
    this function and must exist in the enclosing module:
    ``speakers_using_find``, ``speaker_num_total_speeches``,
    ``speaker_num_total_chars``, ``speakers_per_session``, ``speakers``,
    ``speechid_to_speaker``, ``names_not_caught`` and ``speeches_per_day``.

    Args:
        raw_speeches: mapping filled in place, speech id -> cleaned speech.
        multiple_speakers: mapping filled in place, multi-speaker label ->
            ``[speech, str(volno), str(date)]`` for speeches of at least 100
            characters.
        daily_soup: parsed document; only ``find_all('sp')`` is called on it.
        date: date string; "/" is replaced by "_" to build the id prefix.
        volno: volume identifier, stored with multi-speaker speeches and used
            in the diagnostic strings.

    Returns:
        None. All results are written to the mappings and sets named above.
    """
    id_base = date.replace("/", "_")
    number_of_speeches = 0
    presidents = (
        ">le President", "Le President", "Mle President", "President",
        "le' President", "Le Preesident", "Le Preseident", "Le Presidant",
        "Le Presideait", "le Presiden", "le President", "Le president",
        "le president", "Le President,", "Le Presideut", "Le Presidtent",
        "le Presient", "le Presldent", "le'President",
    )
    # Removed in this exact order: "." and ":" first, then the honorifics.
    strip_tokens = (".", ":", "MM ", "MM. ", "M ", "de ", "M. ", "M, ", "M- ",
                    "M; ", "M* ")

    for talk in daily_soup.find_all('sp'):
        # Extract the speaker name and edit it for easier pairing with the
        # Excel file.
        try:
            speaker = talk.find('speaker').get_text()
            speaker = remove_diacritic(speaker).decode('utf-8')
            for token in strip_tokens:
                speaker = speaker.replace(token, "")
            if speaker.endswith(","):
                speaker = speaker[:-1]
            if speaker.endswith(", "):
                speaker = speaker[:-1]
            if speaker.startswith(' M. '):
                speaker = speaker[3:]
            if speaker.startswith(' '):
                speaker = speaker[1:]
            if speaker.endswith(' '):
                speaker = speaker[:-1]
        except AttributeError:
            # Raised e.g. when the lookup of 'speaker' gives nothing to call
            # get_text() on; such speeches get an empty speaker string.
            speaker = ""

        # Drop every footnote so its text does not end up in the speech.
        while talk.find("note"):
            talk.note.extract()

        # Piece together the full speech if it spans several paragraph tags.
        text = "".join(" " + section.get_text() for section in talk.find_all('p'))
        full_speech = remove_diacritic(text).decode('utf-8')
        full_speech = re.sub(r'\([0-9]{1,3}\)[\w\W]{1,100}', ' ', full_speech)
        full_speech = full_speech.replace("\n", " ").replace("--", " ").replace("!", " ")
        full_speech = re.sub(r'([ ]{2,})', ' ', full_speech)
        full_speech = re.sub(r'([0-9]{1,4})', ' ', full_speech)

        # Full speaker name from the Excel file, or one of the markers
        # "" (no match), "multi" and "president".
        speaker_name = ""

        # Initial attempt at speaker disambiguation.
        if speaker in presidents:
            speaker_name = "president"
        elif speaker in speaker_list.index.values:
            for j, name in enumerate(speaker_list.index.values):
                if speaker == name:
                    speaker_name = speaker_list["FullName"].iloc[j]
        elif "," not in speaker and " et " in speaker:
            # NOTE(review): only " et " labels WITHOUT a comma are treated as
            # multi-speaker here; a label such as "a, b et c" falls through to
            # the substring match below. Behavior kept as found - confirm.
            speaker_name = "multi"
            # Only store multiple speakers when the speech has >= 100 chars.
            if len(full_speech) >= 100:
                multiple_speakers[speaker] = [full_speech, str(volno), str(date)]
        else:
            # Look for a known last name embedded in the speaker string.
            for i, name in enumerate(speaker_list['LastName']):
                if name in speaker:
                    speaker_name = speaker_list["FullName"].iloc[i]
                    # Recorded for a manual check that no name was
                    # mischaracterized by the substring match.
                    speakers_using_find.add(
                        speaker + " : "
                        + remove_diacritic(speaker_name).decode('utf-8')
                        + "; " + str(volno) + "; " + str(date) + "\n")

        # Compare by value: `is not "<literal>"` tests object identity, which
        # only works when the interpreter happens to share the string object.
        if speaker_name not in ("", "multi", "president"):
            speaker_name = remove_diacritic(speaker_name).decode('utf-8')
            number_of_speeches += 1
            speaker_num_total_speeches[speaker_name] = (
                speaker_num_total_speeches.get(speaker_name, 0) + 1)
            speaker_num_total_chars[speaker_name] = (
                speaker_num_total_chars.get(speaker_name, 0) + len(full_speech))
            speakers_per_session.setdefault(id_base, set()).add(speaker_name)
            speakers.add(speaker_name)

            # Creates the unique speech id.
            speech_id = id_base + "_" + str(number_of_speeches)
            speechid_to_speaker[speech_id] = speaker_name
            raw_speeches[speech_id] = full_speech
        elif speaker_name == "":
            # President and multi-speaker entries are expected misses; only
            # genuinely unmatched names are reported.
            names_not_caught.add(speaker + "; " + str(volno) + "; " + str(date) + "\n")

    speeches_per_day[id_base] = number_of_speeches
def findSpeeches(raw_speeches, multiple_speakers, daily_soup, date, volno):
    """Gather name-disambiguation candidates for one parsed daily transcript.

    For every ``<sp>`` element in ``daily_soup`` the ``<speaker>`` text is
    normalized and lowercased, ``<note>`` elements are removed and the ``<p>``
    paragraphs are joined into one cleaned string. Labels containing " et "
    are stored in ``multiple_speakers``; every other label not yet in
    ``speakers_seen`` is passed to ``compute_speaker_Levenshtein_distance``
    and the result is appended to ``speaker_dists`` and, one row per
    candidate, to ``speaker_dists_split``.

    ``read_names``, ``compute_speaker_Levenshtein_distance``,
    ``remove_diacritic``, ``speakers_seen``, ``speaker_dists`` and
    ``speaker_dists_split`` are not defined in this function and must exist
    in the enclosing module.

    Args:
        raw_speeches: accepted for signature compatibility; not read or
            written by this version.
        multiple_speakers: mapping filled in place, multi-speaker label ->
            ``[speech, str(volno), str(date)]`` for speeches of at least 100
            characters.
        daily_soup: parsed document; only ``find_all('sp')`` is called on it.
        date: date value stored alongside each candidate row.
        volno: volume identifier stored alongside each candidate row.

    Returns:
        None. All results are written to the collections named above.
    """
    # Read the reference names once per call instead of once per speech.
    # The earlier pickle.load(open("dated_names.pickle", "rb")) was dropped:
    # its result was always overwritten before use and its file handle was
    # never closed.
    # NOTE(review): assumes read_names() returns the same table on each call
    # and that the distance helper does not mutate it - confirm.
    full_speaker_names = read_names("APnames.xlsx")

    # Removed in this exact order; "-" is turned into a space afterwards.
    strip_tokens = ("M.", "MM ", "MM. ", "M ", "de ", "M. ", "M, ", "M- ",
                    "M; ", "M* ", ".", ":")

    for talk in daily_soup.find_all('sp'):
        # Extract the speaker name and edit it for easier pairing with the
        # Excel file.
        try:
            speaker = talk.find('speaker').get_text()
            speaker = remove_diacritic(speaker).decode('utf-8')
            for token in strip_tokens:
                speaker = speaker.replace(token, "")
            speaker = speaker.replace("-", " ")
            if speaker.endswith(","):
                speaker = speaker[:-1]
            if speaker.endswith(", "):
                speaker = speaker[:-1]
            if speaker.startswith(' M. '):
                speaker = speaker[3:]
            if speaker.startswith(' '):
                speaker = speaker[1:]
            if speaker.endswith(' '):
                speaker = speaker[:-1]
        except AttributeError:
            # Raised e.g. when the lookup of 'speaker' gives nothing to call
            # get_text() on; such speeches get an empty speaker string.
            speaker = ""
        speaker = speaker.lower()

        # Removes the footnotes.
        while talk.find("note"):
            talk.note.extract()

        paragraphs = talk.find_all('p')

        # A parenthesized remark at the very start of the first paragraph is
        # kept as a note on the speaker.
        speaker_note = ""
        if paragraphs:
            first_para = paragraphs[0].get_text()
            if len(first_para) > 1 and "(" in first_para[:2]:
                found = re.findall(r'\([\s\S]{0,300}\)', first_para)
                if found:
                    speaker_note = found[0]

        # Piece together the full speech if it spans several paragraph tags.
        text = "".join(" " + section.get_text() for section in paragraphs)
        full_speech = remove_diacritic(text).decode('utf-8')
        full_speech = re.sub(r'\([0-9]{1,3}\)[\w\W]{1,100}', ' ', full_speech)
        full_speech = full_speech.replace("\n", " ").replace("--", " ").replace("!", " ")
        full_speech = re.sub(r'([ ]{2,})', ' ', full_speech)
        full_speech = re.sub(r'([0-9]{1,4})', ' ', full_speech)

        # Conduct name disambiguation.
        if " et " in speaker:
            # Covers both former branches ("," together with " et ", and
            # " et " alone), which did exactly the same thing. Only store
            # multiple speakers when the speech has >= 100 chars.
            if len(full_speech) >= 100:
                multiple_speakers[speaker] = [full_speech, str(volno), str(date)]
        elif speaker not in speakers_seen:
            # Each distinct speaker string is disambiguated only once.
            matches = compute_speaker_Levenshtein_distance(
                speaker, full_speaker_names)
            speaker_dists.append([speaker, matches, volno, date, speaker_note])
            for full_speaker in matches:
                speaker_dists_split.append([
                    speaker, full_speaker[0], full_speaker[1], volno, date,
                    speaker_note
                ])
            speakers_seen.add(speaker)
def findSpeeches(raw_speeches, multiple_speakers, daily_soup, date, volno):
    """Collect speeches, footnotes and name candidates of one daily transcript.

    For every ``<sp>`` element in ``daily_soup``:

    * the ``<speaker>`` text is normalized and lowercased;
    * every ``<note>`` is removed from the speech and appended to
      ``footnotes`` together with the speaker, the id the speech would get
      and ``volno``;
    * the ``<p>`` paragraphs are joined into one cleaned string;
    * labels containing " et " are stored in ``multiple_speakers``, other
      labels not yet in ``speakers_seen`` are passed to
      ``compute_speaker_Levenshtein_distance`` and logged;
    * the speaker is finally looked up in ``speaker_list``; only matched
      names produce an entry in ``raw_speeches``.

    ``remove_diacritic``, ``compute_speaker_Levenshtein_distance``,
    ``speaker_list``, ``footnotes``, ``speakers_seen``, ``speaker_dists``,
    ``speaker_dists_split``, ``speakers_using_find``,
    ``speaker_num_total_speeches``, ``speaker_num_total_chars``,
    ``speakers_per_session``, ``speakers``, ``speechid_to_speaker``,
    ``names_not_caught`` and ``speeches_per_day`` are not defined in this
    function and must exist in the enclosing module.

    Args:
        raw_speeches: mapping filled in place, speech id -> cleaned speech.
        multiple_speakers: mapping filled in place, multi-speaker label ->
            ``[speech, str(volno), str(date)]`` for speeches of at least 100
            characters.
        daily_soup: parsed document; only ``find_all('sp')`` is called on it.
        date: date string; "/" is replaced by "_" to build the id prefix.
        volno: volume identifier stored with footnotes, candidates and
            diagnostic strings.

    Returns:
        None. All results are written to the collections named above.
    """
    id_base = date.replace("/", "_")
    number_of_speeches = 0
    # The speaker string is lowercased before the comparison, so the known
    # spellings are lowercased too; otherwise the mixed-case entries could
    # never match.
    presidents = frozenset(spelling.lower() for spelling in (
        ">le President", "Le President", "Mle President", "President",
        "le' President", "Le Preesident", "Le Preseident", "Le Presidant",
        "Le Presideait", "le Presiden", "le President", "Le president",
        "le president", "Le President,", "Le Presideut", "Le Presidtent",
        "le Presient", "le Presldent", "le'President",
    ))
    # Removed in this exact order, after ".", ":" and "-" have been handled.
    honorifics = ("MM ", "MM. ", "M ", "de ", "M. ", "M, ", "M- ", "M; ", "M* ")

    for talk in daily_soup.find_all('sp'):
        # Extract the speaker name and edit it for easier pairing with the
        # Excel file.
        try:
            speaker = talk.find('speaker').get_text()
            speaker = remove_diacritic(speaker).decode('utf-8')
            speaker = speaker.replace(".", "").replace(":", "").replace("-", " ")
            for token in honorifics:
                speaker = speaker.replace(token, "")
            if speaker.endswith(","):
                speaker = speaker[:-1]
            if speaker.endswith(", "):
                speaker = speaker[:-1]
            if speaker.startswith(' M. '):
                speaker = speaker[3:]
            if speaker.startswith(' '):
                speaker = speaker[1:]
            if speaker.endswith(' '):
                speaker = speaker[:-1]
        except AttributeError:
            # Raised e.g. when the lookup of 'speaker' gives nothing to call
            # get_text() on; such speeches get an empty speaker string.
            speaker = ""
        speaker = speaker.lower()

        # Remove ALL footnotes (a single `if` left every note after the first
        # inside the speech text) and keep their text for later inspection.
        # The id is the one this speech receives if its speaker is matched.
        speech_id = id_base + "_" + str(number_of_speeches + 1)
        while talk.find("note"):
            ftnotes = talk.note.extract()
            ftnotes = remove_diacritic(ftnotes.get_text()).decode('utf-8')
            ftnotes = ftnotes.replace("\n", "").replace("\r", "").replace("\t", "")
            # Collapse runs of spaces; the former replace(" ", " ") swapped a
            # space for a space and therefore did nothing.
            ftnotes = re.sub(r' {2,}', ' ', ftnotes)
            footnotes.append([ftnotes, speaker, speech_id, volno])

        # Piece together the full speech if it spans several paragraph tags.
        # This must happen before the multi-speaker check below, which stores
        # the speech text: it used to run afterwards, so the check read an
        # undefined name or the previous speech.
        text = "".join(" " + section.get_text() for section in talk.find_all('p'))
        full_speech = remove_diacritic(text).decode('utf-8')
        full_speech = re.sub(r'\([0-9]{1,3}\)[\w\W]{1,100}', ' ', full_speech)
        full_speech = full_speech.replace("\n", " ").replace("--", " ").replace("!", " ")
        full_speech = re.sub(r'([ ]{2,})', ' ', full_speech)
        full_speech = re.sub(r'([0-9]{1,4})', ' ', full_speech)

        if " et " in speaker:
            # Covers both former branches ("," together with " et ", and
            # " et " alone). Only store multiple speakers when the speech has
            # >= 100 chars.
            if len(full_speech) >= 100:
                multiple_speakers[speaker] = [full_speech, str(volno), str(date)]
        elif speaker not in speakers_seen:
            # Each distinct speaker string is disambiguated only once.
            matches = compute_speaker_Levenshtein_distance(speaker)
            speaker_dists.append([speaker, matches, volno, date])
            for full_speaker in matches:
                speaker_dists_split.append(
                    [speaker, full_speaker[0], full_speaker[1], volno, date])
            speakers_seen.add(speaker)

        # Full speaker name from the Excel file, or one of the markers
        # "" (no match), "multi" and "president".
        speaker_name = ""

        # Only look at speeches not from the president.
        if speaker in presidents:
            speaker_name = "president"
        elif speaker in speaker_list.index.values:
            for j, name in enumerate(speaker_list.index.values):
                if speaker == name:
                    speaker_name = speaker_list["FullName"].iloc[j]
        elif "," in speaker and " et " in speaker:
            # A list of speakers; its speech was already stored above.
            # NOTE(review): a label with " et " but no comma is NOT marked
            # "multi" here and falls through to the substring match, as in
            # the code this replaces - confirm that is intended.
            speaker_name = "multi"
        else:
            # Look for a known last name embedded in the speaker string.
            for i, name in enumerate(speaker_list['LastName']):
                if name in speaker:
                    speaker_name = speaker_list["FullName"].iloc[i]
                    # Recorded for a manual check that no name was
                    # mischaracterized by the substring match.
                    speakers_using_find.add(
                        speaker + " : "
                        + remove_diacritic(speaker_name).decode('utf-8')
                        + "; " + str(volno) + "; " + str(date) + "\n")

        # Compare by value: `is not "<literal>"` tests object identity, which
        # only works when the interpreter happens to share the string object.
        if speaker_name not in ("", "multi", "president"):
            speaker_name = remove_diacritic(speaker_name).decode('utf-8')
            number_of_speeches += 1
            speaker_num_total_speeches[speaker_name] = (
                speaker_num_total_speeches.get(speaker_name, 0) + 1)
            speaker_num_total_chars[speaker_name] = (
                speaker_num_total_chars.get(speaker_name, 0) + len(full_speech))
            speakers_per_session.setdefault(id_base, set()).add(speaker_name)
            speakers.add(speaker_name)

            # Creates the unique speech id.
            speech_id = id_base + "_" + str(number_of_speeches)
            speechid_to_speaker[speech_id] = speaker_name
            raw_speeches[speech_id] = full_speech
        elif speaker_name == "":
            # President and multi-speaker entries are expected misses; only
            # genuinely unmatched names are reported.
            names_not_caught.add(speaker + "; " + str(volno) + "; " + str(date) + "\n")

    speeches_per_day[id_base] = number_of_speeches