def subject_vs_object_pronoun_freqs(corp):
    '''
    Takes in a Corpus of novels
    Returns a tuple of two dictionaries, one male and one female
    Each dictionary maps each Novel in the corpus to the proportion of the pronouns
    of the specified gender in that novel that are subject pronouns

    'he'/'him' are used as the male subject/object pair and 'she'/'her' as the female
    pair; each pair is normalised with get_comparative_word_freq and the 'subject'
    share is kept.

    Unless corp.load_test_corpus is set, previously pickled results are returned when
    both pickles can be loaded, and freshly computed results are pickled before
    returning.

    Side effect: after a novel is processed its ``text`` is set to '' and its
    ``_word_counts_counter`` to None (presumably to release memory - confirm before
    reusing the novels afterwards).

    :param corp: Corpus
    :return: tuple of two dictionaries (male, female)

    >>> subject_vs_object_pronoun_freqs(Corpus('test_corpus'))
    ({<Novel (aanrud_longfrock)>: 0.793233082706767, <Novel (abbott_flatlandromance)>: 0.6741573033707865, <Novel (abbott_indiscreetletter)>: 0.7906976744186047, <Novel (adams_fighting)>: 0.7184527584020292, <Novel (alcott_josboys)>: 0.6330049261083744, <Novel (alcott_littlemen)>: 0.6451612903225807, <Novel (alcott_littlewomen)>: 0.6577563540753725, <Novel (alden_chautauqua)>: 0.7577030812324931, <Novel (austen_emma)>: 0.7086120401337792, <Novel (austen_persuasion)>: 0.6739130434782609}, {<Novel (aanrud_longfrock)>: 0.5376532399299474, <Novel (abbott_flatlandromance)>: 0.17543859649122806, <Novel (abbott_indiscreetletter)>: 0.4424242424242424, <Novel (adams_fighting)>: 0.43485915492957744, <Novel (alcott_josboys)>: 0.3862487360970678, <Novel (alcott_littlemen)>: 0.4343501326259947, <Novel (alcott_littlewomen)>: 0.4124569980083288, <Novel (alden_chautauqua)>: 0.5461432506887053, <Novel (austen_emma)>: 0.4836730221345606, <Novel (austen_persuasion)>: 0.4872013651877133})
    '''
    # Reuse cached results when both pickles are available (never for the test corpus).
    if not corp.load_test_corpus:
        try:
            cached_male = common.load_pickle(
                f'{corp.corpus_name}_sub_v_ob_pronoun_freq_male')
            cached_female = common.load_pickle(
                f'{corp.corpus_name}_sub_v_ob_pronoun_freq_female')
        except IOError:
            # At least one pickle is missing/unreadable: fall through and recompute.
            pass
        else:
            return (cached_male, cached_female)

    relative_freq_male_subject = {}
    relative_freq_female_subject = {}

    for book in corp.novels:
        he = book.get_word_freq('he')
        him = book.get_word_freq('him')
        she = book.get_word_freq('she')
        her = book.get_word_freq('her')

        male_shares = get_comparative_word_freq({'subject': he, 'object': him})
        female_shares = get_comparative_word_freq({'subject': she, 'object': her})

        # Only the subject share is reported; the object share was never used.
        relative_freq_male_subject[book] = male_shares['subject']
        relative_freq_female_subject[book] = female_shares['subject']

        # Drop the per-novel text and word-count cache once the novel is done.
        book.text = ''
        book._word_counts_counter = None

    if not corp.load_test_corpus:
        common.store_pickle(relative_freq_male_subject,
                            f'{corp.corpus_name}_sub_v_ob_pronoun_freq_male')
        common.store_pickle(relative_freq_female_subject,
                            f'{corp.corpus_name}_sub_v_ob_pronoun_freq_female')

    return (relative_freq_male_subject, relative_freq_female_subject)
def subject_pronouns_gender_comparison(corp, subject_gender):
    '''
    Takes in a Corpus of novels and a gender.
    The gender determines whether the male frequency or female frequency will be returned
    Returns a dictionary of each novel in the Corpus mapped to the portion of the subject
    pronouns in the book that are of the specified gender

    Only 'he' and 'she' are counted as subject pronouns. A novel in which both
    frequencies are zero is mapped to 0 for either gender instead of raising
    ZeroDivisionError.

    Unless corp.load_test_corpus is set, previously pickled results are returned when
    both pickles can be loaded, and freshly computed results are pickled before
    returning.

    Side effect: after a novel is processed its ``text`` is set to '' and its
    ``_word_counts_counter`` to None (presumably to release memory - confirm before
    reusing the novels afterwards).

    :param corp: Corpus
    :param subject_gender: string 'male' or string 'female'
    :return: dictionary
    :raises ValueError: if subject_gender is neither 'male' nor 'female'

    >>> subject_pronouns_gender_comparison(Corpus('test_corpus'), 'male')
    {<Novel (aanrud_longfrock)>: 0.2557575757575758, <Novel (abbott_flatlandromance)>: 0.923076923076923, <Novel (abbott_indiscreetletter)>: 0.582857142857143, <Novel (adams_fighting)>: 0.8210144927536231, <Novel (alcott_josboys)>: 0.5736607142857142, <Novel (alcott_littlemen)>: 0.6812652068126521, <Novel (alcott_littlewomen)>: 0.39719502513892563, <Novel (alden_chautauqua)>: 0.2543488481429243, <Novel (austen_emma)>: 0.4343926191696566, <Novel (austen_persuasion)>: 0.45696623870660963}
    >>> subject_pronouns_gender_comparison(Corpus('test_corpus'), 'female')
    {<Novel (aanrud_longfrock)>: 0.7442424242424243, <Novel (abbott_flatlandromance)>: 0.07692307692307691, <Novel (abbott_indiscreetletter)>: 0.4171428571428572, <Novel (adams_fighting)>: 0.17898550724637682, <Novel (alcott_josboys)>: 0.4263392857142857, <Novel (alcott_littlemen)>: 0.31873479318734793, <Novel (alcott_littlewomen)>: 0.6028049748610743, <Novel (alden_chautauqua)>: 0.7456511518570758, <Novel (austen_emma)>: 0.5656073808303435, <Novel (austen_persuasion)>: 0.5430337612933904}
    '''
    # Validate once, up front, so no work is done for an unusable argument.
    if subject_gender not in ('male', 'female'):
        raise ValueError('subject_gender must be \'male\' or \'female\'')

    # Reuse cached results when both pickles are available (never for the test corpus).
    if not corp.load_test_corpus:
        try:
            cached_male = common.load_pickle(
                f'{corp.corpus_name}_subject_pronoun_freq_male')
            cached_female = common.load_pickle(
                f'{corp.corpus_name}_subject_pronoun_freq_female')
        except IOError:
            # At least one pickle is missing/unreadable: fall through and recompute.
            pass
        else:
            return cached_male if subject_gender == 'male' else cached_female

    relative_freq_female_sub = {}
    relative_freq_male_sub = {}

    for book in corp.novels:
        he = book.get_word_freq('he')
        she = book.get_word_freq('she')
        total = he + she

        if total:
            relative_freq_female_sub[book] = she / total
            relative_freq_male_sub[book] = he / total
        else:
            # Neither pronoun occurs: report 0 rather than dividing by zero.
            relative_freq_female_sub[book] = 0
            relative_freq_male_sub[book] = 0

        # Drop the per-novel text and word-count cache once the novel is done.
        book.text = ''
        book._word_counts_counter = None

    if not corp.load_test_corpus:
        common.store_pickle(relative_freq_female_sub,
                            f'{corp.corpus_name}_subject_pronoun_freq_female')
        common.store_pickle(relative_freq_male_sub,
                            f'{corp.corpus_name}_subject_pronoun_freq_male')

    if subject_gender == 'male':
        return relative_freq_male_sub
    return relative_freq_female_sub
def store_raw_results(results, corpus_name):
    """
    Pickle raw pronoun/adjective results for a corpus, asking before overwriting.

    If a pickle for this corpus can already be loaded, the user is prompted on stdin
    and the stored results are replaced only when the answer is exactly 'y'.
    If loading raises IOError, the results are stored without prompting.

    :param results: raw analysis results to store
    :param corpus_name: corpus name, appended to the pickle name
    :return: None
    """
    pickle_name = "pronoun_adj_raw_analysis_" + corpus_name
    try:
        # Loaded only to find out whether an earlier analysis exists.
        common.load_pickle(pickle_name)
        answer = input("results already stored. overwrite previous analysis? (y/n)")
        if answer == 'y':
            common.store_pickle(results, pickle_name)
    except IOError:
        common.store_pickle(results, pickle_name)
def books_pronoun_freq(corp):
    '''
    Counts male and female pronouns for every book and finds their relative frequencies
    per book.
    Outputs dictionary mapping novel object to the relative frequency of female pronouns
    in that book.

    Male pronouns counted: 'he', 'him', 'his'; female pronouns counted: 'she', 'her',
    'hers'. The two totals are normalised with get_comparative_word_freq.

    Unless corp.load_test_corpus is set, a previously pickled result is returned when
    both the male and female pickles can be loaded, and freshly computed results for
    both genders are pickled before returning.

    Side effect: after a novel is processed its ``text`` is set to '' and its
    ``_word_counts_counter`` to None.

    :param corp: Corpus object
    :return: dictionary mapping each novel to its female pronoun share

    >>> books_pronoun_freq(Corpus('test_corpus'))
    {<Novel (aanrud_longfrock)>: 0.7623169107856191, <Novel (abbott_flatlandromance)>: 0.14321608040201003, <Novel (abbott_indiscreetletter)>: 0.4166666666666667, <Novel (adams_fighting)>: 0.1898395721925134, <Novel (alcott_josboys)>: 0.42152086422368146, <Novel (alcott_littlemen)>: 0.3111248200699157, <Novel (alcott_littlewomen)>: 0.6196978175713487, <Novel (alden_chautauqua)>: 0.7518623169791935, <Novel (austen_emma)>: 0.5662100456621004, <Novel (austen_persuasion)>: 0.5305111461382571}
    '''
    if not corp.load_test_corpus:
        try:
            # The male pickle is not returned, but loading it first means a missing
            # male pickle also triggers a full recomputation.
            common.load_pickle(f'{corp.corpus_name}_pronoun_freq_male')
            return common.load_pickle(f'{corp.corpus_name}_pronoun_freq_female')
        except IOError:
            pass

    male_pronouns = ('he', 'him', 'his')
    female_pronouns = ('she', 'her', 'hers')
    male_share_by_book = {}
    female_share_by_book = {}

    for book in corp.novels:
        male_total = sum(book.get_word_freq(word) for word in male_pronouns)
        female_total = sum(book.get_word_freq(word) for word in female_pronouns)

        shares = get_comparative_word_freq({'male': male_total, 'female': female_total})
        male_share_by_book[book] = shares['male']
        female_share_by_book[book] = shares['female']

        # Drop the per-novel text and word-count cache once the novel is done.
        book.text = ''
        book._word_counts_counter = None

    if not corp.load_test_corpus:
        common.store_pickle(male_share_by_book, f'{corp.corpus_name}_pronoun_freq_male')
        common.store_pickle(female_share_by_book, f'{corp.corpus_name}_pronoun_freq_female')

    return female_share_by_book
def run_analysis(corpus_name):
    """
    Run the full pronoun/adjective analysis for a corpus and pickle the results.

    Steps: load the corpus, run run_adj_analysis on its novels, store the raw results
    via store_raw_results, reload them, merge them, reduce them with
    get_overlapping_adjectives_raw_results, pickle the final result and pretty-print it.

    :param corpus_name: name of the corpus to load; also used as pickle-name suffix
    :return: None
    """
    print("loading corpus", corpus_name)
    corpus = Corpus(corpus_name)
    novels = corpus.novels

    print("running analysis")
    results = run_adj_analysis(novels)

    print("storing results")
    store_raw_results(results, corpus_name)

    # The pickle name must match the one written by store_raw_results
    # ("pronoun_adj_raw_analysis_" + corpus_name, including the trailing underscore).
    r = common.load_pickle("pronoun_adj_raw_analysis_" + corpus_name)
    m = merge_raw_results(r)
    final = get_overlapping_adjectives_raw_results(m)

    # NOTE(review): no separator between prefix and corpus name here; left unchanged
    # because other code may already read this pickle name - confirm.
    common.store_pickle(final, "pronoun_adj_final_results" + corpus_name)

    # Comment out pprint for large databases where it's not practical to print out results
    pprint(final)
def pickle(novel, parser):
    """
    Return the dependency triples for a novel, loading them from a pickle if possible.

    If no pickle named ``dep_tree_<novel>`` can be loaded, the novel text is lowercased,
    split into sentences, and only sentences containing the token "he", "she", "him"
    or "her" are parsed. The result is a list with one entry per parsed sentence, each
    entry being the list of dependency triples of that sentence's first parse. The
    result is pickled before it is returned.

    :param novel: Novel we are interested in
    :param parser: Stanford parser object
    :return: list of lists of dependency triples

    >>> tree = load_pickle(f'dep_tree_aanrud_longfrock')
    >>> tree == None
    False
    """
    pickle_name = f'dep_tree_{str(novel)}'

    try:
        return load_pickle(pickle_name)
    except IOError:
        # FileNotFoundError is a subclass of IOError (OSError), so this covers both.
        pass

    pronouns = {"he", "she", "him", "her"}
    all_sentences = sent_tokenize(novel.text.lower().replace("\n", " "))
    pronoun_sentences = [
        sentence
        for sentence in all_sentences
        if any(token in pronouns for token in word_tokenize(sentence))
    ]

    # dependency triples of the form ((head word, head tag), rel, (dep word, dep tag))
    # link defining dependencies: https://nlp.stanford.edu/software/dependencies_manual.pdf
    parses = list(parser.raw_parse_sents(pronoun_sentences))
    tree = [list(next(parse).triples()) for parse in parses]

    store_pickle(tree, pickle_name)
    return tree
def run_analysis(corpus_name):
    """
    Group previously stored raw pronoun/adjective results and pickle each grouping.

    Loads the pickle "pronoun_adj_raw_analysis_<corpus_name>", then computes and stores
    the results by location, by author gender and by date, printing progress messages.
    Note that the three output pickle names do not include the corpus name.

    :param corpus_name: corpus whose raw results should be loaded
    :return: None
    """
    # Earlier pipeline stage, currently disabled (kept for reference):
    #
    # print("loading corpus", corpus_name)
    # corpus = Corpus(corpus_name)
    # novels = corpus.novels
    #
    # print("running analysis")
    # results = run_adj_analysis(novels)
    #
    # print("storing results")
    # store_raw_results(results, corpus_name)
    #
    # print("loading results")
    # r = common.load_pickle("pronoun_adj_raw_analysis_"+corpus_name)
    # print("merging and getting final results")
    # m = merge_raw_results(r)
    # print("getting final results")
    # final = get_overlapping_adjectives_raw_results(m)
    # print("storing final results")
    # common.store_pickle(final, "pronoun_adj_final_results_"+corpus_name)
    #
    # Comment out pprint for large databases where it's not practical to print out results
    # pprint(final)

    raw = common.load_pickle("pronoun_adj_raw_analysis_" + corpus_name)

    # (progress message, grouping function, store message, output pickle name)
    stages = (
        ("getting results by location", results_by_location,
         "storing 1", "pronoun_adj_by_location"),
        ("getting results by author gender", results_by_author_gender,
         "storing 2", "pronoun_adj_by_author_gender"),
        ("getting results by date", results_by_date,
         "storing 3", "pronoun_adj_by_date"),
    )

    for start_message, group_results, store_message, pickle_name in stages:
        print(start_message)
        grouped = group_results(raw)
        print(store_message)
        common.store_pickle(grouped, pickle_name)

    print("DONE")
def run_analysis(corpus_name):
    """
    Run instance distance analyses on a particular corpus and saves results as pickle
    files. Comment out sections of code or analyses that have already been run or are
    unnecessary.

    Steps: load the corpus, run run_distance_analysis on its novels, store and reload
    the raw results, then pickle mean and median groupings by location, author gender
    and date, followed by p-values and the top-twenty distance lists.

    :param corpus_name: name of the corpus to load; also used as pickle-name suffix
    :return: None
    """
    print('loading corpus')
    corpus = Corpus(corpus_name)
    novels = corpus.novels

    print('running analysis')
    results = run_distance_analysis(novels)

    print('storing results')
    store_raw_results(results, corpus_name)

    r = common.load_pickle("instance_distance_raw_analysis_" + corpus_name)

    groupings = (
        ("location", results_by_location),
        ("author_gender", results_by_author_gender),
        ("date", results_by_date),
    )

    # Deriving both the statistic and the pickle name from the same loop variable keeps
    # them in sync; the "mean ... by_date" pickle used to be computed with "median".
    grouped_results = [
        (f"{statistic}_instance_distances_by_{label}_" + corpus_name,
         group_results(r, statistic))
        for statistic in ("mean", "median")
        for label, group_results in groupings
    ]
    for pickle_name, grouped in grouped_results:
        common.store_pickle(grouped, pickle_name)

    # NOTE(review): the corpus is hard-coded to "gutenberg" below rather than using
    # corpus_name, and the output pickle names carry no corpus suffix - confirm intent.
    pvals = get_p_vals("gutenberg")
    common.store_pickle(pvals, "instance_distance_comparison_pvals")

    male_top_twenty, female_top_twenty, diff_top_twenty = get_highest_distances(
        "gutenberg", 20)
    top_twenties = {
        'male_pronoun_top_twenty': male_top_twenty,
        'female_pronoun_top_twenty': female_top_twenty,
        "difference_top_twenty": diff_top_twenty,
    }
    common.store_pickle(top_twenties, "instance_distance_top_twenties")
def dunning_total(counter1, counter2, filename_to_pickle=None):
    '''
    Runs dunn_individual_word on every word shared by both counter objects.

    (-) end of spectrum is words for counter_2
    (+) end of spectrum is words for counter_1
    the larger the magnitude of the number, the more distinctive that word is in its
    respective counter object

    Words that occur in only one counter, or whose combined count is below 10, are
    left out of the result.

    use filename_to_pickle to store the result so it only has to be calculated once and
    can be used for multiple analyses.

    >>> from collections import Counter
    >>> female_counter = Counter({'he': 1, 'she': 10, 'and': 10})
    >>> male_counter = Counter({'he': 10, 'she': 1, 'and': 10})
    >>> results = dunning_total(female_counter, male_counter)

    Results is a dict that maps from terms to results.
    Each result dict contains the dunning score...

    >>> results['he']['dunning']
    -8.547243830635558

    ... counts for corpora 1 and 2 as well as total count

    >>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
    (11, 1, 10)

    ... and the same for frequencies

    >>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
    (0.2619047619047619, 0.047619047619047616, 0.47619047619047616)

    :param counter1: word counts of the first corpus
    :param counter2: word counts of the second corpus
    :param filename_to_pickle: optional pickle name under which the result is stored
    :return: dict
    '''
    # Total number of word occurrences in each counter.
    size_corp1 = sum(counter1[term] for term in counter1)
    size_corp2 = sum(counter2[term] for term in counter2)
    size_total = size_corp1 + size_corp2

    scores = {}
    for term in counter1:
        if term not in counter2:
            continue

        count_corp1 = counter1[term]
        count_corp2 = counter2[term]
        count_total = count_corp1 + count_corp2
        # Skip words too rare to give a meaningful score.
        if count_total < 10:
            continue

        scores[term] = {
            'dunning': dunn_individual_word(size_corp1, size_corp2,
                                            count_corp1, count_corp2),
            'count_total': count_total,
            'count_corp1': count_corp1,
            'count_corp2': count_corp2,
            'freq_total': count_total / size_total,
            'freq_corp1': count_corp1 / size_corp1,
            'freq_corp2': count_corp2 / size_corp2,
        }

    if filename_to_pickle:
        store_pickle(scores, filename_to_pickle)

    return scores