def corpus_subject_object_freq(corp, genders, pickle_filepath=None):
    """
    Takes in a Corpus of novels and genders to look for.
    Returns a dictionary of dictionaries, one for each gender.
    Each dictionary maps each Document in the corpus to the proportion of the pronouns
    of the specified gender in that novel that are subject pronouns.

    :param corp: Corpus object
    :param genders: a list of Gender objects to compare
    :param pickle_filepath: Location to store results; will not write a file if None
    :return: dictionary of results, by document and then by gender.

    >>> from corpus_analysis.corpus import Corpus
    >>> from gender_analysis.analysis.gender_frequency import corpus_subject_object_freq
    >>> from gender_analysis.common import MALE, FEMALE, BINARY_GROUP
    >>> from corpus_analysis.testing.common import TEST_CORPUS_PATH, SMALL_TEST_CORPUS_CSV
    >>> corpus = Corpus(TEST_CORPUS_PATH, csv_path=SMALL_TEST_CORPUS_CSV, ignore_warnings = True)
    >>> pronoun_freqs = corpus_subject_object_freq(corpus, BINARY_GROUP)
    >>> result = pronoun_freqs.popitem()
    >>> result[1][FEMALE]
    {'subj': 0.4872013651877133, 'obj': 0.5127986348122866}
    """
    relative_freq = {
        document: document_subject_object_freq(document, genders)
        for document in corp.documents
    }

    if pickle_filepath:
        for gender in genders:
            # BUG FIX: the original called Path.join(...), which does not exist on
            # pathlib.Path and raised AttributeError whenever pickling was requested.
            # Build the per-gender output path with the pathlib "/" operator instead.
            gender_path = Path(pickle_filepath) / gender.label
            store_pickle(relative_freq, gender_path)

    return relative_freq
def generate_dependency_tree(document, genders=None, pickle_filepath=None):
    # pylint: disable=too-many-locals
    """
    This function returns the dependency tree for a given document.

    This can optionally be reduced such that it will only analyze sentences that involve
    specified genders' subject/object pronouns.

    :param document: Document we are interested in
    :param genders: a collection of genders that will be used to filter out sentences that do not \
        involve the provided genders. If set to `None`, all sentences are parsed (default).
    :param pickle_filepath: filepath to store pickled dependency tree, will not write a file if None
    :return: dependency tree, represented as a nested list
    """
    parser = _get_parser_download_if_not_present()
    sentences = sent_tokenize(document.text.lower().replace("\n", " "))

    # Optionally filter out sentences that do not mention any of the requested
    # genders' subject/object pronouns.
    if genders is not None:
        filtered_sentences = []

        # Union of every subject and object pronoun across the requested genders.
        pronoun_filter = set()
        for gender in genders:
            pronoun_filter = pronoun_filter | gender.obj | gender.subj

        for sentence in sentences:
            # BUG FIX: the original initialized add_sentence = True and only ever
            # re-assigned True, so the filter was a no-op and every sentence was kept.
            # A sentence is kept only if it contains at least one filter pronoun.
            add_sentence = False
            for word in word_tokenize(sentence):
                if word in pronoun_filter:
                    add_sentence = True
                    break
            if add_sentence:
                filtered_sentences.append(sentence)

        sentences = filtered_sentences

    result = parser.raw_parse_sents(sentences)

    # dependency triples of the form ((head word, head tag), rel, (dep word, dep tag))
    # link defining dependencies: https://nlp.stanford.edu/software/dependencies_manual.pdf
    tree = list(result)
    tree_list = []
    i = 0
    for sentence in tree:
        tree_list.append([])
        triples = list(next(sentence).triples())
        for triple in triples:
            tree_list[i].append(triple)
        i += 1
    tree = tree_list

    if pickle_filepath is not None:
        common.store_pickle(tree, pickle_filepath)

    return tree
def store(self, pickle_filepath: str = 'gender_proximity_analysis.pgz') -> None:
    """
    Saves self to a pickle file.

    If a pickle already exists at the target path, the user is asked interactively
    whether to overwrite it; a fresh path is written without prompting.

    :param pickle_filepath: filepath to save the output.
    :return: None, saves results as pickled file with name 'gender_tokens_analysis'
    """
    try:
        # Probing the path: load_pickle raises IOError when no prior results exist.
        load_pickle(pickle_filepath)
        answer = input("results already stored. overwrite previous analysis? (y/n)")
        if answer == 'y':
            store_pickle(self, pickle_filepath)
    except IOError:
        # Nothing stored yet — write without asking.
        store_pickle(self, pickle_filepath)
def store_raw_results(results, pickle_filepath='pronoun_adj_raw_analysis.pgz'):
    """
    Saves the results from run_adj_analysis to a pickle file.

    If a pickle already exists at the target path, the user is asked interactively
    whether to overwrite it; a fresh path is written without prompting.

    :param results: dictionary of results from run_adj_analysis
    :param pickle_filepath: filepath to save the output
    :return: None, saves results as pickled file with name 'pronoun_adj_raw_analysis'
    """
    try:
        # Probing the path: load_pickle raises IOError when no prior results exist.
        load_pickle(pickle_filepath)
        answer = input(
            "results already stored. overwrite previous analysis? (y/n)")
        if answer == 'y':
            store_pickle(results, pickle_filepath)
    except IOError:
        # Nothing stored yet — write without asking.
        store_pickle(results, pickle_filepath)
def __init__(self, path_to_files, name=None, csv_path=None,
             pickle_on_load=None, ignore_warnings=False):
    """
    Initialize the corpus from a directory of documents.

    :param path_to_files: str or Path pointing at the document files
    :param name: optional display name for the corpus
    :param csv_path: optional path to a metadata CSV
    :param pickle_on_load: optional filepath; when given, the constructed corpus
        is pickled there immediately after loading
    :param ignore_warnings: suppress warnings during document/metadata loading
    :raises ValueError: if path_to_files is neither a str nor a Path
    """
    # Normalize a string path to a Path; anything else is rejected outright.
    if isinstance(path_to_files, str):
        path_to_files = Path(path_to_files)
    elif not isinstance(path_to_files, Path):
        raise ValueError(
            f'path_to_files must be a str or Path object, not type {type(path_to_files)}'
        )

    self.name = name
    self.documents, self.metadata_fields = self._load_documents_and_metadata(
        path_to_files,
        csv_path,
        ignore_warnings=ignore_warnings,
    )

    if pickle_on_load is not None:
        common.store_pickle(self, pickle_on_load)
def corpus_pronoun_freq(corp, genders, pickle_filepath=None):
    """
    Counts gendered identifiers for every document in a given corpus,
    and finds their relative frequencies.

    Returns a dictionary mapping each Document in the Corpus to the relative
    frequency of gendered identifiers in that Document.

    :param corp: Corpus object
    :param genders: A list of Gender objects
    :param pickle_filepath: Filepath to save the pickled results.
    :return: dictionary with data organized by Document

    >>> from corpus_analysis.corpus import Corpus
    >>> from gender_analysis.analysis.gender_frequency import corpus_pronoun_freq
    >>> from gender_analysis.common import BINARY_GROUP
    >>> from corpus_analysis.testing.common import TEST_CORPUS_PATH, SMALL_TEST_CORPUS_CSV
    >>> c = Corpus(TEST_CORPUS_PATH, csv_path=SMALL_TEST_CORPUS_CSV, ignore_warnings = True)
    >>> pronoun_freq_dict = corpus_pronoun_freq(c, BINARY_GROUP)
    >>> flatland = c.get_document('title', 'Flatland')
    >>> result = pronoun_freq_dict[flatland]
    >>> pronoun_freq_dict[flatland]
    {'Female': 0.1494252873563218, 'Male': 0.8505747126436781}
    """
    # One per-gender frequency dict per document.
    relative_freqs = {
        document: doc_pronoun_freq(document, genders)
        for document in corp.documents
    }

    if pickle_filepath:
        store_pickle(relative_freqs, pickle_filepath)

    return relative_freqs
def dunning_total(counter1, counter2, pickle_filepath=None):
    """
    Runs dunning_individual on words shared by both counter objects.

    (-) end of spectrum is words for counter_2
    (+) end of spectrum is words for counter_1
    the larger the magnitude of the number, the more distinctive that word is in its
    respective counter object

    use pickle_filepath to store the result so it only has to be calculated once and can be
    used for multiple analyses.

    :param counter1: Python Counter object
    :param counter2: Python Counter object
    :param pickle_filepath: Filepath to store pickled results; will not save output if None
    :return: Dictionary

    >>> from collections import Counter
    >>> from gender_analysis.analysis.dunning import dunning_total
    >>> female_counter = Counter({'he': 1, 'she': 10, 'and': 10})
    >>> male_counter = Counter({'he': 10, 'she': 1, 'and': 10})
    >>> results = dunning_total(female_counter, male_counter)

    Results is a dict that maps from terms to results
    Each result dict contains the dunning score...
    >>> results['he']['dunning']
    -8.547243830635558

    ... counts for corpora 1 and 2 as well as total count
    >>> results['he']['count_total'], results['he']['count_corp1'], results['he']['count_corp2']
    (11, 1, 10)

    ... and the same for frequencies
    >>> results['he']['freq_total'], results['he']['freq_corp1'], results['he']['freq_corp2']
    (0.2619047619047619, 0.047619047619047616, 0.47619047619047616)
    """
    # Idiom: Counter totals via sum(values()) instead of manual accumulation loops.
    total_words_counter1 = sum(counter1.values())
    total_words_counter2 = sum(counter2.values())
    grand_total = total_words_counter1 + total_words_counter2

    # dictionary where results will be returned
    dunning_result = {}
    for word, counter1_wordcount in counter1.items():
        # Only words present in BOTH corpora are scored (guard clauses replace nesting).
        if word not in counter2:
            continue
        counter2_wordcount = counter2[word]

        count_total = counter1_wordcount + counter2_wordcount
        # Skip rare words: too few total occurrences for a meaningful score.
        if count_total < 10:
            continue

        dunning_word = dunn_individual_word(total_words_counter1,
                                            total_words_counter2,
                                            counter1_wordcount,
                                            counter2_wordcount)

        dunning_result[word] = {
            'dunning': dunning_word,
            'count_total': count_total,
            'count_corp1': counter1_wordcount,
            'count_corp2': counter2_wordcount,
            'freq_total': count_total / grand_total,
            'freq_corp1': counter1_wordcount / total_words_counter1,
            'freq_corp2': counter2_wordcount / total_words_counter2,
        }

    if pickle_filepath:
        store_pickle(dunning_result, pickle_filepath)

    return dunning_result