Exemplo n.º 1
0
def corpus_subject_object_freq(corp, genders, pickle_filepath=None):
    """
    Takes in a Corpus of novels and genders to look for.
    Returns a dictionary of dictionaries, one for each gender
    Each dictionary maps each Document in the corpus to the proportion of the pronouns
    of the specified gender in that novel that are subject pronouns

    :param corp: Corpus object
    :param genders: a list of Gender objects to compare
    :param pickle_filepath: Location to store results; will not write a file if None
    :return: dictionary of results, by document and then by gender.

    >>> from corpus_analysis.corpus import Corpus
    >>> from gender_analysis.analysis.gender_frequency import corpus_subject_object_freq
    >>> from gender_analysis.common import MALE, FEMALE, BINARY_GROUP
    >>> from corpus_analysis.testing.common import TEST_CORPUS_PATH, SMALL_TEST_CORPUS_CSV
    >>> corpus = Corpus(TEST_CORPUS_PATH, csv_path=SMALL_TEST_CORPUS_CSV, ignore_warnings = True)
    >>> pronoun_freqs = corpus_subject_object_freq(corpus, BINARY_GROUP)
    >>> result = pronoun_freqs.popitem()
    >>> result[1][FEMALE]
    {'subj': 0.4872013651877133, 'obj': 0.5127986348122866}
    """
    relative_freq = {
        document: document_subject_object_freq(document, genders)
        for document in corp.documents
    }

    if pickle_filepath:
        for gender in genders:
            # BUG FIX: pathlib's Path has no ``join`` classmethod — the original
            # ``Path.join(...)`` raised AttributeError. Build the per-gender
            # path with the ``/`` operator instead.
            gender_path = Path(pickle_filepath) / gender.label
            store_pickle(relative_freq, gender_path)

    return relative_freq
Exemplo n.º 2
0
def generate_dependency_tree(document, genders=None, pickle_filepath=None):
    """
    This function returns the dependency tree for a given document. This can optionally be reduced
    such that it will only analyze sentences that involve specified genders' subject/object
    pronouns.

    :param document: Document we are interested in
    :param genders: a collection of genders that will be used to filter out sentences that do not \
        involve the provided genders. If set to `None`, all sentences are parsed (default).
    :param pickle_filepath: filepath to store pickled dependency tree, will not write a file if None
    :return: dependency tree, represented as a nested list

    """

    parser = _get_parser_download_if_not_present()
    sentences = sent_tokenize(document.text.lower().replace("\n", " "))

    # filter out sentences that are not relevant
    if genders is not None:
        # Union of every subject/object pronoun across the requested genders.
        pronoun_filter = set()
        for gender in genders:
            pronoun_filter |= gender.obj | gender.subj

        # BUG FIX: the original initialized ``add_sentence = True`` before
        # scanning the words, so every sentence was kept and the gender filter
        # was a no-op. Keep a sentence only if it contains a filtered pronoun.
        sentences = [
            sentence for sentence in sentences
            if any(word in pronoun_filter for word in word_tokenize(sentence))
        ]

    result = parser.raw_parse_sents(sentences)

    # dependency triples of the form ((head word, head tag), rel, (dep word, dep tag))
    # link defining dependencies: https://nlp.stanford.edu/software/dependencies_manual.pdf
    # Each parsed sentence yields an iterator; take its first parse's triples.
    tree = [list(next(parsed_sentence).triples()) for parsed_sentence in result]

    if pickle_filepath is not None:
        common.store_pickle(tree, pickle_filepath)

    return tree
Exemplo n.º 3
0
    def store(self, pickle_filepath: str = 'gender_proximity_analysis.pgz') -> None:
        """
        Pickle this analysis object to disk.

        If a pickle already exists at ``pickle_filepath``, the user is asked
        interactively before it is overwritten.

        :param pickle_filepath: filepath to save the output.
        :return: None, saves results as pickled file with name 'gender_tokens_analysis'
        """

        try:
            load_pickle(pickle_filepath)
        except IOError:
            # Nothing stored there yet — safe to write without asking.
            store_pickle(self, pickle_filepath)
            return

        # A previous result exists; only overwrite on explicit confirmation.
        answer = input("results already stored. overwrite previous analysis? (y/n)")
        if answer == 'y':
            store_pickle(self, pickle_filepath)
Exemplo n.º 4
0
def store_raw_results(results, pickle_filepath='pronoun_adj_raw_analysis.pgz'):
    """
    Saves the results from run_adj_analysis to a pickle file.

    If a pickle already exists at ``pickle_filepath``, the user is asked
    interactively before it is overwritten.

    :param results: dictionary of results from run_adj_analysis
    :param pickle_filepath: filepath to save the output
    :return: None, saves results as pickled file with name 'pronoun_adj_raw_analysis'

    """
    try:
        load_pickle(pickle_filepath)
    except IOError:
        # No existing pickle — write directly, no prompt needed.
        store_pickle(results, pickle_filepath)
        return

    # A previous analysis is on disk; overwrite only with confirmation.
    answer = input(
        "results already stored. overwrite previous analysis? (y/n)")
    if answer == 'y':
        store_pickle(results, pickle_filepath)
Exemplo n.º 5
0
    def __init__(self,
                 path_to_files,
                 name=None,
                 csv_path=None,
                 pickle_on_load=None,
                 ignore_warnings=False):
        """
        Build the corpus from the documents under ``path_to_files``.

        :param path_to_files: str or Path pointing at the document directory
        :param name: optional display name for the corpus
        :param csv_path: optional path to a metadata CSV
        :param pickle_on_load: if given, pickle this corpus to that path after loading
        :param ignore_warnings: suppress warnings while loading documents/metadata
        :raises ValueError: if ``path_to_files`` is neither a str nor a Path
        """
        # Normalize a string path into a Path object up front.
        if isinstance(path_to_files, str):
            path_to_files = Path(path_to_files)
        if not isinstance(path_to_files, Path):
            raise ValueError(
                f'path_to_files must be a str or Path object, not type {type(path_to_files)}'
            )

        self.name = name
        documents, metadata_fields = self._load_documents_and_metadata(
            path_to_files,
            csv_path,
            ignore_warnings=ignore_warnings,
        )
        self.documents = documents
        self.metadata_fields = metadata_fields

        if pickle_on_load is not None:
            common.store_pickle(self, pickle_on_load)
Exemplo n.º 6
0
def corpus_pronoun_freq(corp, genders, pickle_filepath=None):
    """
    Counts gendered identifiers for every document in a given corpus,
    and finds their relative frequencies

    Returns a dictionary mapping each Document in the Corpus to the relative frequency of
    gendered identifiers in that Document

    :param corp: Corpus object
    :param genders: A list of Gender objects
    :param pickle_filepath: Filepath to save the pickled results.
    :return: dictionary with data organized by Document

    >>> from corpus_analysis.corpus import Corpus
    >>> from gender_analysis.analysis.gender_frequency import corpus_pronoun_freq
    >>> from gender_analysis.common import BINARY_GROUP
    >>> from corpus_analysis.testing.common import TEST_CORPUS_PATH, SMALL_TEST_CORPUS_CSV
    >>> c = Corpus(TEST_CORPUS_PATH, csv_path=SMALL_TEST_CORPUS_CSV, ignore_warnings = True)
    >>> pronoun_freq_dict = corpus_pronoun_freq(c, BINARY_GROUP)
    >>> flatland = c.get_document('title', 'Flatland')
    >>> result = pronoun_freq_dict[flatland]
    >>> pronoun_freq_dict[flatland]
    {'Female': 0.1494252873563218, 'Male': 0.8505747126436781}

    """

    # One per-gender frequency dict per document, keyed by the Document itself.
    relative_freqs = {
        document: doc_pronoun_freq(document, genders)
        for document in corp.documents
    }

    if pickle_filepath:
        store_pickle(relative_freqs, pickle_filepath)

    return relative_freqs
Exemplo n.º 7
0
def dunning_total(counter1, counter2, pickle_filepath=None):
    """
    Runs dunning_individual on words shared by both counter objects
    (-) end of spectrum is words for counter_2
    (+) end of spectrum is words for counter_1
    the larger the magnitude of the number, the more distinctive that word is in its
    respective counter object

    use pickle_filepath to store the result so it only has to be calculated once and can be
    used for multiple analyses.

    :param counter1: Python Counter object
    :param counter2: Python Counter object
    :param pickle_filepath: Filepath to store pickled results; will not save output if None
    :return: Dictionary

    >>> from collections import Counter
    >>> from gender_analysis.analysis.dunning import dunning_total
    >>> female_counter = Counter({'he': 1,  'she': 10, 'and': 10})
    >>> male_counter =   Counter({'he': 10, 'she': 1,  'and': 10})
    >>> results = dunning_total(female_counter, male_counter)

    Results is a dict that maps from terms to results
    Each result dict contains the dunning score...
    >>> results['he']['dunning']
    -8.547243830635558

    ... counts for corpora 1 and 2 as well as total count
    >>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
    (11, 1, 10)

    ... and the same for frequencies
    >>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
    (0.2619047619047619, 0.047619047619047616, 0.47619047619047616)

    """

    # Idiom fix: Counter totals via sum() instead of hand-rolled loops.
    total_words_counter1 = sum(counter1.values())
    total_words_counter2 = sum(counter2.values())
    grand_total = total_words_counter1 + total_words_counter2

    # Iterate counter1 (not the key intersection) to keep the original,
    # deterministic insertion order of the result dict.
    dunning_result = {}
    for word, counter1_wordcount in counter1.items():
        if word not in counter2:
            continue
        counter2_wordcount = counter2[word]
        count_total = counter1_wordcount + counter2_wordcount

        # Skip rare words: too few observations for a meaningful score.
        if count_total < 10:
            continue

        dunning_word = dunn_individual_word(total_words_counter1,
                                            total_words_counter2,
                                            counter1_wordcount,
                                            counter2_wordcount)

        dunning_result[word] = {
            'dunning': dunning_word,
            'count_total': count_total,
            'count_corp1': counter1_wordcount,
            'count_corp2': counter2_wordcount,
            'freq_total': count_total / grand_total,
            'freq_corp1': counter1_wordcount / total_words_counter1,
            'freq_corp2': counter2_wordcount / total_words_counter2,
        }

    if pickle_filepath:
        store_pickle(dunning_result, pickle_filepath)

    return dunning_result