def subject_vs_object_pronoun_freqs(corp):
    '''
    Takes in a Corpus of novels
    Returns a tuple of two dictionaries, one male and female
    Each dictionary maps each Novel in the corpus to the proportion of the pronouns
        of the specified gender in that novel that are subject pronouns

    #TODO: add doctests

    :param corp: Corpus
    :return: tuple of two dictionaries (male, female)

    >>> subject_vs_object_pronoun_freqs(Corpus('test_corpus'))
    ({<Novel (aanrud_longfrock)>: 0.793233082706767, <Novel (abbott_flatlandromance)>: 0.6741573033707865, <Novel (abbott_indiscreetletter)>: 0.7906976744186047, <Novel (adams_fighting)>: 0.7184527584020292, <Novel (alcott_josboys)>: 0.6330049261083744, <Novel (alcott_littlemen)>: 0.6451612903225807, <Novel (alcott_littlewomen)>: 0.6577563540753725, <Novel (alden_chautauqua)>: 0.7577030812324931, <Novel (austen_emma)>: 0.7086120401337792, <Novel (austen_persuasion)>: 0.6739130434782609}, {<Novel (aanrud_longfrock)>: 0.5376532399299474, <Novel (abbott_flatlandromance)>: 0.17543859649122806, <Novel (abbott_indiscreetletter)>: 0.4424242424242424, <Novel (adams_fighting)>: 0.43485915492957744, <Novel (alcott_josboys)>: 0.3862487360970678, <Novel (alcott_littlemen)>: 0.4343501326259947, <Novel (alcott_littlewomen)>: 0.4124569980083288, <Novel (alden_chautauqua)>: 0.5461432506887053, <Novel (austen_emma)>: 0.4836730221345606, <Novel (austen_persuasion)>: 0.4872013651877133})
    '''

    try:
        if (not corp.load_test_corpus):
            relative_freq_male_sub_v_ob = common.load_pickle(
                f'{corp.corpus_name}_sub_v_ob_pronoun_freq_male')
            relative_freq_female_sub_v_ob = common.load_pickle(
                f'{corp.corpus_name}_sub_v_ob_pronoun_freq_female')
            return (relative_freq_male_sub_v_ob, relative_freq_female_sub_v_ob)
    except IOError:
        pass

    relative_freq_male_subject = {}
    relative_freq_female_subject = {}
    relative_freq_male_object = {}
    relative_freq_female_object = {}

    for book in corp.novels:
        he = book.get_word_freq('he')
        him = book.get_word_freq('him')

        she = book.get_word_freq('she')
        her = book.get_word_freq('her')

        temp_dict_male = {'subject': he, 'object': him}
        temp_dict_female = {'subject': she, 'object': her}
        temp_dict_male = get_comparative_word_freq(temp_dict_male)
        temp_dict_female = get_comparative_word_freq(temp_dict_female)

        relative_freq_male_subject[book] = temp_dict_male['subject']
        relative_freq_female_subject[book] = temp_dict_female['subject']
        relative_freq_male_object[book] = temp_dict_male['object']
        relative_freq_female_object[book] = temp_dict_female['object']

    book.text = ''
    book._word_counts_counter = None

    if (not corp.load_test_corpus):
        common.store_pickle(relative_freq_male_subject,
                            f'{corp.corpus_name}_sub_v_ob_pronoun_freq_male')
        common.store_pickle(relative_freq_female_subject,
                            f'{corp.corpus_name}_sub_v_ob_pronoun_freq_female')

    result_tuple = (relative_freq_male_subject, relative_freq_female_subject)

    return result_tuple
def subject_pronouns_gender_comparison(corp, subject_gender):
    '''
    Takes in a Corpus of novels and a gender.
    The gender determines whether the male frequency or female frequency will
        be returned
    Returns a dictionary of each novel in the Corpus mapped to the portion of
        the subject pronouns in the book that are of the specified gender
    :param corp: Corpus
    :param subject_gender: string 'male' or string 'female'
    :return: dictionary

    >>> subject_pronouns_gender_comparison(Corpus('test_corpus'), 'male')
    {<Novel (aanrud_longfrock)>: 0.2557575757575758, <Novel (abbott_flatlandromance)>: 0.923076923076923, <Novel (abbott_indiscreetletter)>: 0.582857142857143, <Novel (adams_fighting)>: 0.8210144927536231, <Novel (alcott_josboys)>: 0.5736607142857142, <Novel (alcott_littlemen)>: 0.6812652068126521, <Novel (alcott_littlewomen)>: 0.39719502513892563, <Novel (alden_chautauqua)>: 0.2543488481429243, <Novel (austen_emma)>: 0.4343926191696566, <Novel (austen_persuasion)>: 0.45696623870660963}
    >>> subject_pronouns_gender_comparison(Corpus('test_corpus'), 'female')
    {<Novel (aanrud_longfrock)>: 0.7442424242424243, <Novel (abbott_flatlandromance)>: 0.07692307692307691, <Novel (abbott_indiscreetletter)>: 0.4171428571428572, <Novel (adams_fighting)>: 0.17898550724637682, <Novel (alcott_josboys)>: 0.4263392857142857, <Novel (alcott_littlemen)>: 0.31873479318734793, <Novel (alcott_littlewomen)>: 0.6028049748610743, <Novel (alden_chautauqua)>: 0.7456511518570758, <Novel (austen_emma)>: 0.5656073808303435, <Novel (austen_persuasion)>: 0.5430337612933904}
    '''

    if not(subject_gender == 'male' or subject_gender == 'female'):
        raise ValueError('subject_gender must be \'male\' or \'female\'')

    try:
        if (not corp.load_test_corpus):
            relative_freq_male_subject = common.load_pickle(
                f'{corp.corpus_name}_subject_pronoun_freq_male')
            relative_freq_female_subject = common.load_pickle(
                f'{corp.corpus_name}_subject_pronoun_freq_female')
            if subject_gender == 'male':
                return relative_freq_male_subject
            else:
                return relative_freq_female_subject
    except IOError:
        pass

    relative_freq_female_sub = {}
    relative_freq_male_sub = {}

    for book in corp.novels:
        he = book.get_word_freq('he')
        she = book.get_word_freq('she')

        relative_freq_female_sub[book] = (she)/(he+she)
        relative_freq_male_sub[book] = (he)/(he+she)

    book.text = ''
    book._word_counts_counter = None

    if (not corp.load_test_corpus):
        common.store_pickle(relative_freq_female_sub,
                            f'{corp.corpus_name}_subject_pronoun_freq_female')
        common.store_pickle(relative_freq_male_sub, f'{corp.corpus_name}_subject_pronoun_freq_male')

    if subject_gender == 'male':
        return relative_freq_male_sub
    elif subject_gender == 'female':
        return relative_freq_female_sub
    else:
        raise ValueError('subject_gender must be \'male\' or \'female\'')
def store_raw_results(results, corpus_name):
    try:
        common.load_pickle("pronoun_adj_raw_analysis_" + corpus_name)
        x = input("results already stored. overwrite previous analysis? (y/n)")
        if x == 'y':
            common.store_pickle(results, "pronoun_adj_raw_analysis_" + corpus_name)
        else:
            pass
    except IOError:
        common.store_pickle(results, "pronoun_adj_raw_analysis_" + corpus_name)
def books_pronoun_freq(corp):
    '''
    Counts male and female pronouns for every book and finds their relative frequencies per book
    Outputs dictionary mapping novel object to the relative frequency
        of female pronouns in that book

    :param: Corpus object
    :return: dictionary with data organized by groups

    >>> books_pronoun_freq(Corpus('test_corpus'))
    {<Novel (aanrud_longfrock)>: 0.7623169107856191, <Novel (abbott_flatlandromance)>: 0.14321608040201003, <Novel (abbott_indiscreetletter)>: 0.4166666666666667, <Novel (adams_fighting)>: 0.1898395721925134, <Novel (alcott_josboys)>: 0.42152086422368146, <Novel (alcott_littlemen)>: 0.3111248200699157, <Novel (alcott_littlewomen)>: 0.6196978175713487, <Novel (alden_chautauqua)>: 0.7518623169791935, <Novel (austen_emma)>: 0.5662100456621004, <Novel (austen_persuasion)>: 0.5305111461382571}
    '''

    try:
        if (not corp.load_test_corpus):
            relative_freq_male = common.load_pickle(f'{corp.corpus_name}_pronoun_freq_male')
            relative_freq_female = common.load_pickle(f'{corp.corpus_name}_pronoun_freq_female')
            return relative_freq_female
    except IOError:
        pass

    relative_freq_male = {}
    relative_freq_female = {}

    for book in corp.novels:
        he = book.get_word_freq('he')
        him = book.get_word_freq('him')
        his = book.get_word_freq('his')
        male = he + him + his

        she = book.get_word_freq('she')
        her = book.get_word_freq('her')
        hers = book.get_word_freq('hers')
        female = she + her + hers

        temp_dict = {'male': male, 'female': female}
        temp_dict = get_comparative_word_freq(temp_dict)

        relative_freq_male[book] = temp_dict['male']
        relative_freq_female[book] = temp_dict['female']

    book.text = ''
    book._word_counts_counter = None

    if (not corp.load_test_corpus):
        common.store_pickle(relative_freq_male, f'{corp.corpus_name}_pronoun_freq_male')
        common.store_pickle(relative_freq_female, f'{corp.corpus_name}_pronoun_freq_female')

    return (relative_freq_female)
def run_analysis(corpus_name):
    print("loading corpus", corpus_name)
    corpus = Corpus(corpus_name)
    novels = corpus.novels

    print("running analysis")
    results = run_adj_analysis(novels)

    print("storing results")
    store_raw_results(results, corpus_name)

    r = common.load_pickle("pronoun_adj_raw_analysis"+corpus_name)
    m = merge_raw_results(r)
    final = get_overlapping_adjectives_raw_results(m)
    common.store_pickle(final, "pronoun_adj_final_results"+corpus_name)

    #Comment out pprint for large databases where it's not practical to print out results
    pprint(final)
def pickle(novel, parser):
    """
    This function returns a pickled tree
    :param novel: Novel we are interested in
    :param parser: Stanford parser object
    :return: tree in pickle format

    >>> tree = load_pickle(f'dep_tree_aanrud_longfrock')
    >>> tree == None
    False
    """

    try:
        tree = load_pickle(f'dep_tree_{str(novel)}')
    except (IOError, FileNotFoundError):
        sentences = sent_tokenize(novel.text.lower().replace("\n", " "))
        he_she_sentences = []
        for sentence in sentences:
            add_sentence = False
            words = [word for word in word_tokenize(sentence)]
            for word in words:
                if word == "he" or word == "she" or word == "him" or word == "her":
                    add_sentence = True
            if add_sentence:
                he_she_sentences.append(sentence)
        sentences = he_she_sentences
        result = parser.raw_parse_sents(sentences)
        # dependency triples of the form ((head word, head tag), rel, (dep word, dep tag))
        # link defining dependencies: https://nlp.stanford.edu/software/dependencies_manual.pdf
        tree = list(result)
        tree_list = []
        i = 0
        for sentence in tree:
            tree_list.append([])
            triples = list(next(sentence).triples())
            for triple in triples:
                tree_list[i].append(triple)
            i += 1
        tree = tree_list
        store_pickle(tree, f'dep_tree_{str(novel)}')
    return tree
예제 #7
0
def run_analysis(corpus_name):
    """
    print("loading corpus", corpus_name)
    corpus = Corpus(corpus_name)
    novels = corpus.novels

    print("running analysis")
    results = run_adj_analysis(novels)

    print("storing results")
    store_raw_results(results, corpus_name)

    print("loading results")
    r = common.load_pickle("pronoun_adj_raw_analysis_"+corpus_name)
    print("merging and getting final results")
    m = merge_raw_results(r)
    print("getting final results")
    final = get_overlapping_adjectives_raw_results(m)
    print("storing final results")
    common.store_pickle(final, "pronoun_adj_final_results_"+corpus_name)

    #Comment out pprint for large databases where it's not practical to print out results
    #pprint(final)
    """
    r = common.load_pickle("pronoun_adj_raw_analysis_" + corpus_name)
    print("getting results by location")
    r2 = results_by_location(r)
    print("storing 1")
    common.store_pickle(r2, "pronoun_adj_by_location")
    print("getting results by author gender")
    r3 = results_by_author_gender(r)
    print("storing 2")
    common.store_pickle(r3, "pronoun_adj_by_author_gender")
    print("getting results by date")
    r4 = results_by_date(r)
    print("storing 3")
    common.store_pickle(r4, "pronoun_adj_by_date")
    print("DONE")
def run_analysis(corpus_name):
    """
    Run instance distance analyses on a particular corpus and saves results as pickle files.
    Comment out sections of code or analyses that have already been run or are unnecessary.
    :param corpus_name:
    :return:
    """
    print('loading corpus')
    corpus = Corpus(corpus_name)
    novels = corpus.novels

    print('running analysis')
    results = run_distance_analysis(novels)

    print('storing results')
    store_raw_results(results, corpus_name)

    r = common.load_pickle("instance_distance_raw_analysis_" + corpus_name)
    r2 = results_by_location(r, "mean")
    r3 = results_by_author_gender(r, "mean")
    r4 = results_by_date(r, "median")
    r5 = results_by_location(r, "median")
    r6 = results_by_author_gender(r, "median")
    r7 = results_by_date(r, "median")

    common.store_pickle(r2,
                        "mean_instance_distances_by_location_" + corpus_name)
    common.store_pickle(
        r3, "mean_instance_distances_by_author_gender_" + corpus_name)
    common.store_pickle(r4, "mean_instance_distances_by_date_" + corpus_name)

    common.store_pickle(r5,
                        "median_instance_distances_by_location_" + corpus_name)
    common.store_pickle(
        r6, "median_instance_distances_by_author_gender_" + corpus_name)
    common.store_pickle(r7, "median_instance_distances_by_date_" + corpus_name)

    pvals = get_p_vals("gutenberg")
    common.store_pickle(pvals, "instance_distance_comparison_pvals")

    male_top_twenty, female_top_twenty, diff_top_twenty = get_highest_distances(
        "gutenberg", 20)
    top_twenties = {
        'male_pronoun_top_twenty': male_top_twenty,
        'female_pronoun_top_twenty': female_top_twenty,
        "difference_top_twenty": diff_top_twenty
    }
    common.store_pickle(top_twenties, "instance_distance_top_twenties")
예제 #9
0
def dunning_total(counter1, counter2, filename_to_pickle=None):
    '''
    runs dunning_individual on words shared by both counter objects
    (-) end of spectrum is words for counter_2
    (+) end of spectrum is words for counter_1
    the larger the magnitude of the number, the more distinctive that word is in its
    respective counter object

    use filename_to_pickle to store the result so it only has to be calculated once and can be
    used for multiple analyses.

    >>> from collections import Counter
    >>> female_counter = Counter({'he': 1,  'she': 10, 'and': 10})
    >>> male_counter =   Counter({'he': 10, 'she': 1,  'and': 10})
    >>> results = dunning_total(female_counter, male_counter)

    # Results is a dict that maps from terms to results
    # Each result dict contains the dunning score...
    >>> results['he']['dunning']
    -8.547243830635558

    # ... counts for corpora 1 and 2 as well as total count
    >>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
    (11, 1, 10)

    # ... and the same for frequencies
    >>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
    (0.2619047619047619, 0.047619047619047616, 0.47619047619047616)

    :return: dict

    '''

    total_words_counter1 = 0
    total_words_counter2 = 0

    #get word total in respective counters
    for word1 in counter1:
        total_words_counter1 += counter1[word1]
    for word2 in  counter2:
        total_words_counter2 += counter2[word2]

    #dictionary where results will be returned
    dunning_result = {}
    for word in counter1:
        counter1_wordcount = counter1[word]
        if word in counter2:
            counter2_wordcount = counter2[word]


            if counter1_wordcount + counter2_wordcount < 10:
                continue

            dunning_word = dunn_individual_word( total_words_counter1,  total_words_counter2,
                                                 counter1_wordcount,counter2_wordcount)

            dunning_result[word] = {
                'dunning': dunning_word,
                'count_total': counter1_wordcount + counter2_wordcount,
                'count_corp1': counter1_wordcount,
                'count_corp2': counter2_wordcount,
                'freq_total': (counter1_wordcount + counter2_wordcount) / (total_words_counter1 +
                                                                           total_words_counter2),
                'freq_corp1': counter1_wordcount / total_words_counter1,
                'freq_corp2': counter2_wordcount / total_words_counter2
            }

    if filename_to_pickle:
        store_pickle(dunning_result, filename_to_pickle)

    return dunning_result