示例#1
0
def subject_pronouns_gender_comparison(corp, subject_gender, pickle_filepath_male=None, pickle_filepath_female=None):
    """
    Takes in a Corpus of novels and a gender.
    The gender determines whether the male frequency or female frequency will be returned.

    Returns a dictionary of each novel in the Corpus mapped to the portion of the subject
    pronouns in the book that are of the specified gender.

    Only the frequencies of 'he' and 'she' are compared. A document in which neither
    pronoun occurs is mapped to 0.0 for both genders rather than raising ZeroDivisionError.

    Previously pickled results are only looked up, and new results are only stored, when
    both pickle filepaths are given. An IOError while loading is treated as "no cached
    results" and the values are computed from the corpus.

    :param corp: Corpus object
    :param subject_gender: string 'male' or string 'female'
    :param pickle_filepath_male: Location to store results for male results; will not write a file if None
    :param pickle_filepath_female: Location to store results for female results; will not write a file if None
    :return: dictionary
    :raises ValueError: if subject_gender is neither 'male' nor 'female'

    >>> from gender_analysis.corpus import Corpus
    >>> from gender_analysis.common import TEST_DATA_PATH
    >>> filepath = TEST_DATA_PATH / 'test_corpus'
    >>> csvpath = TEST_DATA_PATH / 'test_corpus' / 'test_corpus.csv'
    >>> subject_pronouns_gender_comparison(Corpus(filepath, csv_path=csvpath), 'male')
    {<Document (aanrud_longfrock)>: 0.25724637681159424, <Document (abbott_flatlandromance)>: 0.9051094890510949, <Document (abbott_indiscreetletter)>: 0.5842696629213483, <Document (adams_fighting)>: 0.8206796818510484, <Document (alcott_josboys)>: 0.5742904841402336, <Document (alcott_littlemen)>: 0.6829615567157096, <Document (alcott_littlewomen)>: 0.3974087784241142, <Document (alden_chautauqua)>: 0.2549295774647887, <Document (austen_emma)>: 0.43709109209864117, <Document (austen_persuasion)>: 0.45726495726495725}
    >>> subject_pronouns_gender_comparison(Corpus(filepath, csv_path=csvpath), 'female')
    {<Document (aanrud_longfrock)>: 0.7427536231884058, <Document (abbott_flatlandromance)>: 0.0948905109489051, <Document (abbott_indiscreetletter)>: 0.4157303370786517, <Document (adams_fighting)>: 0.17932031814895155, <Document (alcott_josboys)>: 0.42570951585976624, <Document (alcott_littlemen)>: 0.3170384432842905, <Document (alcott_littlewomen)>: 0.6025912215758857, <Document (alden_chautauqua)>: 0.7450704225352113, <Document (austen_emma)>: 0.5629089079013588, <Document (austen_persuasion)>: 0.5427350427350427}

    """

    if subject_gender not in ('male', 'female'):
        raise ValueError('subject_gender must be \'male\' or \'female\'')

    # Loading and storing share one condition so a cache is never read from
    # (or written to) a missing filepath.
    use_pickles = bool(pickle_filepath_male and pickle_filepath_female)

    if use_pickles:
        try:
            cached_male = common.load_pickle(pickle_filepath_male)
            cached_female = common.load_pickle(pickle_filepath_female)
        except IOError:
            # No readable cache yet: fall through and compute from the corpus.
            pass
        else:
            return cached_male if subject_gender == 'male' else cached_female

    relative_freq_female_sub = {}
    relative_freq_male_sub = {}

    for book in corp.documents:
        he = book.get_word_freq('he')
        she = book.get_word_freq('she')
        total = he + she

        if total == 0:
            # Neither pronoun occurs, so there is no meaningful ratio.
            relative_freq_female_sub[book] = 0.0
            relative_freq_male_sub[book] = 0.0
            continue

        relative_freq_female_sub[book] = she / total
        relative_freq_male_sub[book] = he / total

    if use_pickles:
        common.store_pickle(relative_freq_female_sub,
                            pickle_filepath_female)
        common.store_pickle(relative_freq_male_sub, pickle_filepath_male)

    # subject_gender was validated above, so only these two cases remain.
    if subject_gender == 'male':
        return relative_freq_male_sub
    return relative_freq_female_sub
示例#2
0
def subject_vs_object_pronoun_freqs(corp,
                                    pickle_filepath_male=None,
                                    pickle_filepath_female=None):
    """
    Takes in a Corpus of novels
    Returns a tuple of two dictionaries, one male and female
    Each dictionary maps each Document in the corpus to the proportion of the pronouns
    of the specified gender in that novel that are subject pronouns

    For every document the frequencies of 'he'/'him' (male) and 'she'/'her' (female) are
    passed to get_comparative_word_freq as 'subject'/'object' entries, and the resulting
    'subject' value is recorded.

    Results are only pickled when both filepaths are given.

    :param corp: Corpus object
    :param pickle_filepath_male: Location to store results for male results; will not write a file if None
    :param pickle_filepath_female: Location to store results for female results; will not write a file if None
    :return: tuple of two dictionaries (male, female)

    >>> from gender_analysis.corpus import Corpus
    >>> from gender_analysis.common import TEST_DATA_PATH
    >>> filepath = TEST_DATA_PATH / 'test_corpus'
    >>> csvpath = TEST_DATA_PATH / 'test_corpus' / 'test_corpus.csv'
    >>> subject_vs_object_pronoun_freqs(Corpus(filepath, csv_path=csvpath))
    ({<Document (aanrud_longfrock)>: 0.7947761194029851, <Document (abbott_flatlandromance)>: 0.6777777777777777, <Document (abbott_indiscreetletter)>: 0.7938931297709924, <Document (adams_fighting)>: 0.7188093730208993, <Document (alcott_josboys)>: 0.6334563345633456, <Document (alcott_littlemen)>: 0.6454880294659301, <Document (alcott_littlewomen)>: 0.6580560420315237, <Document (alden_chautauqua)>: 0.7583798882681564, <Document (austen_emma)>: 0.7088554720133667, <Document (austen_persuasion)>: 0.6743697478991596}, {<Document (aanrud_longfrock)>: 0.5380577427821522, <Document (abbott_flatlandromance)>: 0.18965517241379312, <Document (abbott_indiscreetletter)>: 0.4457831325301205, <Document (adams_fighting)>: 0.4358523725834798, <Document (alcott_josboys)>: 0.38655886811520973, <Document (alcott_littlemen)>: 0.43472498343273697, <Document (alcott_littlewomen)>: 0.41256335988414194, <Document (alden_chautauqua)>: 0.5462994836488813, <Document (austen_emma)>: 0.48378615249780893, <Document (austen_persuasion)>: 0.48742004264392325})

    """

    relative_freq_male_subject = {}
    relative_freq_female_subject = {}

    for book in corp.documents:
        # pronouns are hard-coded because these are the only ones guaranteed as subjects and objects
        # NOTE(review): 'her' can also be a possessive, so the female object
        # count may include non-object uses - confirm this is intended.
        he = book.get_word_freq('he')
        him = book.get_word_freq('him')

        she = book.get_word_freq('she')
        her = book.get_word_freq('her')

        male_freqs = get_comparative_word_freq({'subject': he, 'object': him})
        female_freqs = get_comparative_word_freq({'subject': she, 'object': her})

        # Only the subject share is part of the result; the object share is
        # not returned or stored, so it is no longer collected.
        relative_freq_male_subject[book] = male_freqs['subject']
        relative_freq_female_subject[book] = female_freqs['subject']

    if pickle_filepath_male and pickle_filepath_female:
        common.store_pickle(relative_freq_male_subject, pickle_filepath_male)
        common.store_pickle(relative_freq_female_subject,
                            pickle_filepath_female)

    return relative_freq_male_subject, relative_freq_female_subject
示例#3
0
    def __init__(self, path_to_files, name=None, csv_path=None,
                       pickle_on_load=None, guess_author_genders=False):
        """
        Load the documents found at path_to_files and set up the instance.

        :param path_to_files: str or Path; a str is converted to a Path
        :param name: value stored as self.name
        :param csv_path: passed on to _load_documents_and_metadata together with the path
        :param pickle_on_load: if not None, the instance is pickled to this location once loaded
        :param guess_author_genders: if truthy, self.guess_author_genders() is called after loading
        :raises ValueError: if path_to_files is neither a str nor a Path
        """

        # Normalise the location to a Path; anything else is rejected.
        if isinstance(path_to_files, str):
            path_to_files = Path(path_to_files)
        elif not isinstance(path_to_files, Path):
            raise ValueError(f'path_to_files must be a str or Path object, not type {type(path_to_files)}')

        self.name = name

        loaded = self._load_documents_and_metadata(path_to_files, csv_path)
        self.documents, self.metadata_fields = loaded

        if guess_author_genders:
            self.guess_author_genders()

        # Pickling happens last so the stored object is fully populated.
        if pickle_on_load is not None:
            common.store_pickle(self, pickle_on_load)
示例#4
0
def document_pronoun_freq(corp, pickle_filepath=None):
    """
    Counts male and female pronouns for every document in a given corpus,
    and finds their relative frequencies

    Returns a dictionary mapping each Document in the Corpus to the frequency
    of female pronouns in that Document

    For each document the word frequencies of common.MASC_WORDS and of
    common.FEM_WORDS are added up separately, the two totals are passed to
    get_comparative_word_freq, and the resulting 'female' value is recorded.

    :param corp: Corpus object
    :param pickle_filepath: Location to store the results; will not write a file if None
    :return: dictionary with data organized by Document

    >>> from gender_analysis.corpus import Corpus
    >>> from gender_analysis.analysis.gender_frequency import document_pronoun_freq
    >>> from gender_analysis.common import TEST_DATA_PATH
    >>> filepath = TEST_DATA_PATH / 'test_corpus'
    >>> csvpath = TEST_DATA_PATH / 'test_corpus' / 'test_corpus.csv'
    >>> c = Corpus(filepath, csv_path=csvpath)
    >>> pronoun_freq_dict = document_pronoun_freq(c)
    >>> flatland = c.get_document('title', 'Flatland')
    >>> result = pronoun_freq_dict[flatland]
    >>> format(result, '.5f')
    '0.15068'

    """

    relative_freq_female = {}

    for doc in corp.documents:
        male = 0
        for word in common.MASC_WORDS:
            male += doc.get_word_freq(word)

        female = 0
        for word in common.FEM_WORDS:
            female += doc.get_word_freq(word)

        comparative = get_comparative_word_freq({'male': male, 'female': female})

        # Only the female share is returned and stored; the male share is
        # therefore not collected.
        relative_freq_female[doc] = comparative['female']

    if pickle_filepath:
        common.store_pickle(relative_freq_female, pickle_filepath)

    return relative_freq_female
def store_raw_results(results, pickle_filepath='pronoun_adj_raw_analysis.pgz'):
    """
    Saves the results from run_adj_analysis to a pickle file.

    If a pickle can already be loaded from pickle_filepath, the user is asked on
    standard input whether to overwrite it, and only the answer 'y' overwrites.
    If loading raises IOError, the results are stored without asking.

    Only the load is guarded by the IOError handler, so an IOError raised while
    prompting or while writing propagates to the caller instead of triggering
    a second write attempt.

    :param results: dictionary of results from run_adj_analysis
    :param pickle_filepath: filepath to save the output
    :return: None, saves results as pickled file (default name 'pronoun_adj_raw_analysis.pgz')

    """
    try:
        common.load_pickle(pickle_filepath)
    except IOError:
        already_stored = False
    else:
        already_stored = True

    if already_stored:
        answer = input("results already stored. overwrite previous analysis? (y/n)")
        if answer != 'y':
            # Keep the previous analysis untouched.
            return

    common.store_pickle(results, pickle_filepath)
def store_raw_results(results,
                      pickle_filepath='instance_distance_raw_analysis.pgz'):
    """
    Stores results from an analysis as a pickle file.

    If a pickle can already be loaded from pickle_filepath, the user is asked on
    standard input whether to overwrite it, and only the answer 'y' overwrites.
    If loading raises IOError, the results are stored without asking.

    Only the load is guarded by the IOError handler, so an IOError raised while
    prompting or while writing propagates to the caller instead of triggering
    a second write attempt.

    :param results: A Python object that can be pickled
    :param pickle_filepath: Destination for pickle file
    :return: None
    """
    try:
        common.load_pickle(pickle_filepath)
    except IOError:
        already_stored = False
    else:
        already_stored = True

    if already_stored:
        answer = input("results already stored. overwrite previous analysis? (y/n)")
        if answer != 'y':
            # Keep the previous analysis untouched.
            return

    common.store_pickle(results, pickle_filepath)
def generate_dependency_tree(document, pickle_filepath=None):
    """
    Build the dependency tree for the pronoun-bearing sentences of a document.

    The document text is lower-cased, newlines are replaced by spaces, and the
    text is split into sentences. Only sentences containing one of the tokens
    "he", "she", "him" or "her" are handed to the parser. For every parsed
    sentence the triples of its first parse are collected.

    :param document: Document we are interested in
    :param pickle_filepath: filepath to store pickled dependency tree, will not write a file if None
    :return: dependency tree, represented as a nested list (one list of triples per sentence)

    """

    parser = _get_parser_download_if_not_present()

    target_pronouns = ("he", "she", "him", "her")
    all_sentences = sent_tokenize(document.text.lower().replace("\n", " "))
    pronoun_sentences = [
        sentence for sentence in all_sentences
        if any(token in target_pronouns for token in word_tokenize(sentence))
    ]

    # Materialise the parser output before walking it, one entry per sentence.
    parsed_sentences = list(parser.raw_parse_sents(pronoun_sentences))

    # dependency triples of the form ((head word, head tag), rel, (dep word, dep tag))
    # link defining dependencies: https://nlp.stanford.edu/software/dependencies_manual.pdf
    tree = [list(next(parses).triples()) for parses in parsed_sentences]

    if pickle_filepath is not None:
        common.store_pickle(tree, pickle_filepath)

    return tree
示例#8
0
def dunning_total(counter1, counter2, pickle_filepath=None):
    """
    Scores every word that occurs in both counters with dunn_individual_word.

    (-) end of spectrum is words for counter_2
    (+) end of spectrum is words for counter_1
    the larger the magnitude of the number, the more distinctive that word is in its
    respective counter object

    Words whose combined count across both counters is below 10 are left out
    of the result, as are words that appear in only one counter.

    use pickle_filepath to store the result so it only has to be calculated once and can be
    used for multiple analyses.

    :param counter1: Python Counter object
    :param counter2: Python Counter object
    :param pickle_filepath: Filepath to store pickled results; will not save output if None
    :return: Dictionary

    >>> from collections import Counter
    >>> from gender_analysis.analysis.dunning import dunning_total
    >>> female_counter = Counter({'he': 1,  'she': 10, 'and': 10})
    >>> male_counter =   Counter({'he': 10, 'she': 1,  'and': 10})
    >>> results = dunning_total(female_counter, male_counter)

    Results is a dict that maps from terms to results
    Each result dict contains the dunning score...
    >>> results['he']['dunning']
    -8.547243830635558

    ... counts for corpora 1 and 2 as well as total count
    >>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
    (11, 1, 10)

    ... and the same for frequencies
    >>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
    (0.2619047619047619, 0.047619047619047616, 0.47619047619047616)

    """

    # Size of each counter, i.e. the total of all its word counts.
    size_1 = 0
    for count in counter1.values():
        size_1 += count
    size_2 = 0
    for count in counter2.values():
        size_2 += count

    scores = {}
    for term in counter1:
        if term not in counter2:
            continue

        count_1 = counter1[term]
        count_2 = counter2[term]
        combined = count_1 + count_2

        # Skip words that are too rare overall.
        if combined < 10:
            continue

        score = dunn_individual_word(size_1, size_2, count_1, count_2)

        scores[term] = {
            'dunning': score,
            'count_total': combined,
            'count_corp1': count_1,
            'count_corp2': count_2,
            'freq_total': combined / (size_1 + size_2),
            'freq_corp1': count_1 / size_1,
            'freq_corp2': count_2 / size_2,
        }

    if pickle_filepath:
        store_pickle(scores, pickle_filepath)

    return scores