Exemplo n.º 1
0
def female_characters_author_gender_differences(
        corpus,
        to_pickle=False,
        pickle_filename='dunning_female_chars_author_gender.pgz'):
    """
    Compares the male-author and female-author subcorpora with respect to the words
    associated with the terms in FEM_WORDS (presumably the female-character word list;
    confirm against its definition).

    The actual comparison is delegated to compare_word_association_between_corpus_dunning,
    which receives the male-author subcorpus first and the female-author subcorpus second.

    :param corpus: Corpus object; must have an 'author_gender' metadata field
    :param to_pickle: boolean, False by default. Forwarded unchanged to the comparison helper
    :param pickle_filename: filename forwarded unchanged to the comparison helper
    :return: the value returned by the comparison helper (documented as a dict)
    :raises MissingMetadataError: if 'author_gender' is not in corpus.metadata_fields

    """
    if 'author_gender' not in corpus.metadata_fields:
        raise MissingMetadataError(['author_gender'])

    # Order matters: male-author subcorpus first, female-author subcorpus second.
    subcorpora = [corpus.filter_by_gender(label) for label in ('male', 'female')]

    helper_options = {
        'word_window': None,
        'to_pickle': to_pickle,
        'pickle_filename': pickle_filename,
    }
    return compare_word_association_between_corpus_dunning(
        FEM_WORDS, *subcorpora, **helper_options)
Exemplo n.º 2
0
    def get_field_vals(self, field):
        """
        Collects the distinct values that the documents of this corpus hold for one metadata
        field and returns them in ascending order.

        :param field: name of the metadata field (i.e. 'location', 'author_gender', etc.)
        :return: sorted list of the distinct values
        :raises MissingMetadataError: if the field is not in self.metadata_fields

        >>> from gender_analysis.corpus import Corpus
        >>> from gender_analysis.common import TEST_DATA_PATH
        >>> path = TEST_DATA_PATH / 'sample_novels' / 'texts'
        >>> csvpath = TEST_DATA_PATH / 'sample_novels' / 'sample_novels.csv'
        >>> c = Corpus(path, name='sample_novels', csv_path=csvpath)
        >>> c.get_field_vals('author_gender')
        ['both', 'female', 'male']

        """
        if field not in self.metadata_fields:
            raise MissingMetadataError([field])

        # A set removes duplicates; ordering is imposed afterwards.
        distinct_values = {getattr(doc, field) for doc in self.documents}
        ordered = list(distinct_values)
        ordered.sort()
        return ordered
Exemplo n.º 3
0
    def count_authors_by_gender(self, gender):
        """
        Counts the documents whose 'author_gender' equals the given gender, ignoring case.

        *NOTE:* every document must carry a string 'author_gender' attribute; any
        AttributeError raised during the comparison is reported as missing metadata.

        :param gender: gender identifier to search for in the metadata (i.e. 'female', 'male', etc.)
        :return: Number of documents whose author has the given gender
        :raises MissingMetadataError: if the case-insensitive comparison raises AttributeError

        >>> from gender_analysis.corpus import Corpus
        >>> from gender_analysis.common import TEST_DATA_PATH
        >>> path = TEST_DATA_PATH / 'test_corpus'
        >>> path_to_csv = TEST_DATA_PATH / 'test_corpus' / 'test_corpus.csv'
        >>> c = Corpus(path, csv_path=path_to_csv)
        >>> c.count_authors_by_gender('female')
        7

        """
        matches = 0
        for doc in self.documents:
            try:
                same_gender = doc.author_gender.lower() == gender.lower()
            except AttributeError:
                raise MissingMetadataError(['author_gender'])
            if same_gender:
                matches += 1

        return matches
Exemplo n.º 4
0
def plot_gender_breakdown(corpus, filename=None):
    """
    Creates a pie chart displaying the composition of author genders in the corpus and saves
    it as a PNG file.

    Documents whose 'author_gender' is None are ignored. The values 'both' and 'unknown'
    (also when capitalized) are grouped into a single 'Unknown' slice.

    *NOTE:* Requires that corpus contains a 'author_gender' metadata field.

    :param corpus: Corpus object
    :param filename: Base name of the output file; spaces are replaced by underscores and
        '.png' is appended. If None (or empty), the plot is saved as
        'gender_breakdown_for_<corpus name>.png'
    :return: None
    :raises MissingMetadataError: if 'author_gender' is not in corpus.metadata_fields

    """
    if 'author_gender' not in corpus.metadata_fields:
        raise MissingMetadataError(['author_gender'])

    corpus_name = corpus.name if corpus.name else 'corpus'

    sns.set_color_codes('colorblind')

    # Tally documents per gender, merging the "undetermined" spellings into one bucket.
    undetermined = ('both', 'unknown', 'Both', 'Unknown')
    gendercount = {}
    for doc in corpus.documents:
        author_gender = doc.author_gender
        if author_gender is None:
            continue
        bucket = 'Unknown' if author_gender in undetermined else author_gender
        gendercount[bucket] = gendercount.get(bucket, 0) + 1

    total = sum(gendercount.values())
    slices = [count / total for count in gendercount.values()]

    # Round the percentage itself. Truncating round(fraction, 2) * 100 with int() is off by
    # one for many values because of float error (e.g. 0.29 * 100 == 28.999999999999996).
    labelgenders = [
        f'{bucket}: {round(100 * count / total)}%'.title()
        for bucket, count in gendercount.items()
    ]

    colors = ['c', 'b', 'g']
    plt.figure(figsize=(10, 6))
    plt.pie(slices, colors=colors, labels=labelgenders, textprops={'fontsize': 15})
    plt.title('Gender Breakdown for '+corpus_name.title(), size=18, color='k', weight='bold')
    plt.legend()
    plt.subplots_adjust(left=.1, bottom=.1, right=.9, top=.9)

    if filename:
        plt.savefig(filename.replace(' ', '_')+'.png')
    else:
        plt.savefig('gender_breakdown_for_'+corpus_name.replace(' ', '_')+'.png')
Exemplo n.º 5
0
    def multi_filter(self, characteristic_dict):
        """
        Builds a clone of this corpus that keeps only the documents matching every
        field/value pair in characteristic_dict.

        The 'date' field is compared against int(value); all other fields are compared with
        plain equality against the document attribute of the same name.

        :param characteristic_dict: Dictionary with metadata fields as keys and search terms as values
        :return: Corpus object
        :raises MissingMetadataError: if a key is not in self.metadata_fields
        :raises AttributeError: if the resulting corpus is falsy (no document matched)

        >>> from gender_analysis.corpus import Corpus
        >>> from gender_analysis.common import TEST_DATA_PATH
        >>> path = TEST_DATA_PATH / 'sample_novels' / 'texts'
        >>> path_to_csv = TEST_DATA_PATH / 'sample_novels' / 'sample_novels.csv'
        >>> c = Corpus(path, csv_path=path_to_csv)
        >>> corpus_filter = {'author_gender': 'male'}
        >>> len(c.multi_filter(corpus_filter))
        59

        >>> corpus_filter['filename'] = 'aanrud_longfrock.txt'
        >>> len(c.multi_filter(corpus_filter))
        1
        """

        filtered = self.clone()
        filtered.documents = []

        for field in characteristic_dict:
            if field not in self.metadata_fields:
                raise MissingMetadataError([field])

        def differs(document, field):
            # Dates are stored as integers, so the search term is converted before comparing.
            if field == 'date':
                return document.date != int(characteristic_dict['date'])
            return getattr(document, field) != characteristic_dict[field]

        for candidate in self.documents:
            # Every field is evaluated (no short-circuit), so a bad search term or a missing
            # attribute surfaces regardless of earlier mismatches.
            mismatches = [differs(candidate, field) for field in characteristic_dict]
            if not any(mismatches):
                filtered.documents.append(candidate)

        if not filtered:
            # An empty result usually points at a typo in a field value.
            raise AttributeError('This corpus is empty. You may have mistyped something.')

        return filtered
Exemplo n.º 6
0
def plot_metadata_pie(corpus, filename=None):
    """
    Creates a pie chart indicating, per document, which of the 'author_gender' and
    'country_publication' metadata values are filled in, and saves it as a PNG file.

    An author gender counts as filled when it is truthy and not equal to 'unknown'; a
    country counts as filled when it is truthy.

    *NOTE:* Requires that corpus contains 'author_gender' and 'country_publication' metadata fields.

    :param corpus: Corpus object
    :param filename: Base name of the output file; spaces are replaced by underscores and
        '.png' is appended. If None (or empty), the plot is saved as
        'percentage_acquired_metadata_for_<corpus name>.png'
    :return: None
    :raises MissingMetadataError: if either required field is not in corpus.metadata_fields
    :raises ZeroDivisionError: if len(corpus) is 0

    """

    if ('author_gender' not in corpus.metadata_fields
            or 'country_publication' not in corpus.metadata_fields):
        raise MissingMetadataError(['author_gender', 'country_publication'])

    name = corpus.name if corpus.name else 'corpus'

    # Pre-seeded so that all four categories appear, in this order, even when empty.
    counter = Counter({'Both Country and Gender': 0, 'Author Gender Only': 0,
                       'Country Only': 0, 'Neither': 0})
    num_documents = len(corpus)
    for doc in corpus.documents:
        has_gender = bool(doc.author_gender) and doc.author_gender != 'unknown'
        has_country = bool(doc.country_publication)
        if has_gender and has_country:
            counter['Both Country and Gender'] += 1
        elif has_gender:
            counter['Author Gender Only'] += 1
        elif has_country:
            counter['Country Only'] += 1
        else:
            counter['Neither'] += 1

    # Round the percentage itself. Truncating round(fraction, 2) * 100 with int() is off by
    # one for many values because of float error (e.g. 0.57 * 100 == 56.99999999999999).
    labels = [
        f'{label} {round(100 * number / num_documents)}%'
        for label, number in counter.items()
    ]

    sns.set_color_codes('colorblind')
    colors = ['c', 'b', 'g', 'w']
    plt.figure(figsize=(10, 6))
    plt.pie(counter.values(), colors=colors, labels=labels, textprops={'fontsize': 13})
    plt.title('Percentage Acquired Metadata for ' + name.title(), size=18, color='k',
              weight='bold')
    plt.legend()
    plt.subplots_adjust(left=.1, bottom=.1, right=.9, top=.9)

    if filename:
        plt.savefig(filename.replace(' ', '_') + '.png')
    else:
        plt.savefig('percentage_acquired_metadata_for_' + name.replace(' ', '_') + '.png')
Exemplo n.º 7
0
def dunning_words_by_author_gender(
        corpus,
        display_results=False,
        to_pickle=False,
        pickle_filename='dunning_male_vs_female_authors.pgz'):
    """
    Runs a dunning analysis on the word counts of female-author versus male-author documents.

    Stored results are preferred: the function first tries to load pickle_filename and only
    computes the analysis when loading raises IOError. With display_results=True the results
    are handed to dunning_result_displayer once without a part-of-speech filter and once each
    for verbs, adjectives, pronouns and adverbs.

    :param corpus: Corpus object
    :param display_results: Boolean; reports a visualization of the results if True
    :param to_pickle: Boolean; if True, pickle_filename is passed to dunning_total as
        filename_to_pickle when the results are computed
    :param pickle_filename: Path to pickle object; always tried as the source of stored
        results, and used as the output path when to_pickle is True
    :return: the loaded or computed dunning results (documented as a dict)
    :raises MissingMetadataError: if 'author_gender' is not in corpus.metadata_fields

    """

    if 'author_gender' not in corpus.metadata_fields:
        raise MissingMetadataError(['author_gender'])

    try:
        results = load_pickle(pickle_filename)
    except IOError:
        # Nothing stored yet: build both subcorpora first, then count words in each.
        subcorpora = {label: corpus.filter_by_gender(label) for label in ('male', 'female')}
        wordcounts = {label: sub.get_wordcount_counter() for label, sub in subcorpora.items()}

        # The pickle target is only forwarded when the caller asked for pickling.
        pickle_options = {'filename_to_pickle': pickle_filename} if to_pickle else {}
        results = dunning_total(wordcounts['female'], wordcounts['male'], **pickle_options)

    if display_results:
        display_options = {
            'number_of_terms_to_display': 20,
            'corpus1_display_name': 'Fem Author',
            'corpus2_display_name': 'Male Author',
        }
        for part_of_speech in (None, 'verbs', 'adjectives', 'pronouns', 'adverbs'):
            dunning_result_displayer(results,
                                     part_of_speech_to_include=part_of_speech,
                                     **display_options)
    return results
Exemplo n.º 8
0
def plot_pubyears(corpus, filename=None):
    """
    Creates a histogram of the publication years in the corpus, grouped into 5-year bins, and
    saves it as a PNG file.

    Documents whose 'date' is None are ignored.

    *NOTE:* Requires that corpus contains a 'date' metadata field.

    :param corpus: Corpus object
    :param filename: Base name of the output file; spaces are replaced by underscores and
        '.png' is appended. If None (or empty), the plot is saved as
        'date_of_pub_for_<corpus name>.png'
    :return: None
    :raises MissingMetadataError: if 'date' is not in corpus.metadata_fields
    :raises ValueError: if no document in the corpus has a date

    """

    if 'date' not in corpus.metadata_fields:
        raise MissingMetadataError(['date'])

    pub_years = [doc.date for doc in corpus.documents if doc.date is not None]
    if not pub_years:
        # min()/max() would fail with an opaque message; fail early and explain why.
        raise ValueError('Cannot plot publication years: no document in the corpus has a date.')

    corpus_name = corpus.name if corpus.name else 'corpus'
    first_year, last_year = min(pub_years), max(pub_years)

    sns.set_style('ticks')
    sns.color_palette('colorblind')
    # The figure must exist before the axes are requested; otherwise the axes (and the tick
    # rotation applied below) end up on a separate, implicitly created figure.
    plt.figure(figsize=(10, 6))
    ax1 = plt.subplot2grid((1, 1), (0, 0))

    # Stop at last_year + 6 so the final edge always lies beyond the latest year: this gives
    # at least two edges and keeps every document inside a bin.
    bins = list(range(first_year, last_year + 6, 5))
    plt.hist(pub_years, bins, histtype='bar', rwidth=.8, color='c')
    plt.xlabel('Year', size=15, weight='bold', color='k')
    plt.ylabel('Frequency', size=15, weight='bold', color='k')
    plt.title('Publication Year Concentration for '+corpus_name.title(), size=18, weight='bold',
              color='k')
    plt.yticks(size=15, color='k')
    plt.xticks(list(range(first_year, last_year + 9, 10)), size=15, color='k')
    for label in ax1.xaxis.get_ticklabels():
        label.set_rotation(60)
    plt.subplots_adjust(left=.1, bottom=.18, right=.95, top=.9)

    if filename:
        plt.savefig(filename.replace(' ', '_')+'.png')
    else:
        plt.savefig('date_of_pub_for_'+corpus_name.replace(' ', '_')+'.png')
Exemplo n.º 9
0
    def get_document(self, metadata_field, field_val):
        """
        Looks up the first Document in self.documents whose metadata_field equals field_val.

        Only the first match is returned, so use this when a single match is expected or when
        any match will do. For narrower searches use **get_document_multiple_fields**; to
        collect every match use **subcorpus**.

        When metadata_field is "date", field_val is converted with int() before comparing.

        :param metadata_field: metadata field to search
        :param field_val: search term
        :return: Document Object
        :raises MissingMetadataError: if metadata_field is not in self.metadata_fields
        :raises ValueError: if no document matches

        >>> from gender_analysis.corpus import Corpus
        >>> from gender_analysis.common import TEST_DATA_PATH, MissingMetadataError
        >>> path = TEST_DATA_PATH / 'sample_novels' / 'texts'
        >>> csvpath = TEST_DATA_PATH / 'sample_novels' / 'sample_novels.csv'
        >>> c = Corpus(path, csv_path=csvpath)
        >>> c.get_document("author", "Dickens, Charles")
        <Document (dickens_twocities)>
        >>> c.get_document("date", '1857')
        <Document (bronte_professor)>
        >>> try:
        ...     c.get_document("meme_quality", "over 9000")
        ... except MissingMetadataError as exception:
        ...     print(exception)
        This Corpus is missing the following metadata field:
            meme_quality
        In order to run this function, you must create a new metadata csv
        with this field and run Corpus.update_metadata().

        """

        if metadata_field not in self.metadata_fields:
            raise MissingMetadataError([metadata_field])

        wanted = int(field_val) if metadata_field == "date" else field_val

        # A private sentinel distinguishes "nothing matched" from any real list element.
        nothing = object()
        first_hit = next(
            (doc for doc in self.documents if getattr(doc, metadata_field) == wanted),
            nothing,
        )
        if first_hit is nothing:
            raise ValueError("Document not found")
        return first_hit
Exemplo n.º 10
0
    def guess_author_genders(self):
        """
        Guesses the gender of each document's author and stores it in ``doc.author_gender``
        as 'female', 'male' or 'unknown'.

        The guess is made by ``gender.Detector().get_gender`` from the first space-delimited
        token of ``doc.author``; if the document has a 'country_publication' attribute, it
        is passed along as a second argument. Documents whose author is None are skipped and
        left untouched. 'author_gender' is registered in self.metadata_fields if it is not
        already there.

        :return: None
        :raises MissingMetadataError: if 'author' is not in self.metadata_fields

        """
        if 'author' not in self.metadata_fields:
            raise MissingMetadataError(['author'], 'Cannot guess author gender if no author '
                                                   'metadata is provided.')

        # Register the field only once, so repeated calls (or a corpus that already has the
        # field) do not leave duplicate entries in metadata_fields.
        if 'author_gender' not in self.metadata_fields:
            self.metadata_fields.append('author_gender')

        detector = gender.Detector()
        for doc in self.documents:
            if doc.author is None:
                continue

            # NOTE(review): this is the first token of the author string; for authors stored
            # as "Last, First" that would be the surname, not the given name - confirm.
            first_token = doc.author.split(' ', 1)[0]

            if hasattr(doc, 'country_publication'):
                guess = detector.get_gender(first_token, doc.country_publication)
            else:
                guess = detector.get_gender(first_token)

            if guess in ('female', 'mostly_female'):
                doc.author_gender = 'female'
            elif guess in ('male', 'mostly_male'):
                doc.author_gender = 'male'
            else:  # guess == 'unknown' or guess == 'andy'
                doc.author_gender = 'unknown'
Exemplo n.º 11
0
    def get_document_multiple_fields(self, metadata_dict):
        """
        Looks up the first Document in the corpus whose metadata matches every field/value
        pair of metadata_dict.

        Only the first match is returned, so use this when a single match is expected or when
        any match will do. To collect every match, use **subcorpus**.

        A document that lacks one of the attributes is treated as having None for it.

        :param metadata_dict: Dictionary with metadata fields as keys and search terms as values
        :return: Document object
        :raises MissingMetadataError: if a key is not in self.metadata_fields
        :raises ValueError: if no document matches

        >>> from gender_analysis.corpus import Corpus
        >>> from gender_analysis.common import TEST_DATA_PATH
        >>> path = TEST_DATA_PATH / 'sample_novels' / 'texts'
        >>> csvpath = TEST_DATA_PATH / 'sample_novels' / 'sample_novels.csv'
        >>> c = Corpus(path, csv_path=csvpath)
        >>> c.get_document_multiple_fields({"author": "Dickens, Charles", "author_gender": "male"})
        <Document (dickens_twocities)>
        >>> c.get_document_multiple_fields({"author": "Chopin, Kate", "title": "The Awakening"})
        <Document (chopin_awakening)>

        """

        for name in metadata_dict:
            if name not in self.metadata_fields:
                raise MissingMetadataError([name])

        def satisfies_all(candidate):
            return all(
                not (getattr(candidate, name, None) != wanted)
                for name, wanted in metadata_dict.items()
            )

        # A private sentinel distinguishes "nothing matched" from any real list element.
        nothing = object()
        first_hit = next((doc for doc in self.documents if satisfies_all(doc)), nothing)
        if first_hit is nothing:
            raise ValueError("Document not found")
        return first_hit
Exemplo n.º 12
0
def plot_pubcountries(corpus, filename=None):
    """
    Creates a bar graph of how many documents were published in each country and saves it as
    a PNG file.

    Documents whose 'country_publication' is None are ignored, and the empty-string country
    gets no bar. Countries holding at most 0.1% of the counted documents are merged into an
    'Other' bar, which is always present. Bars are ordered by descending count.

    *NOTE:* Requires that corpus contains a 'country_publication' metadata field.

    :param corpus: Corpus object
    :param filename: Base name of the output file; spaces are replaced by underscores and
        '.png' is appended. If None (or empty), the plot is saved as
        'country_of_pub_for_<corpus name>.png'
    :return: None
    :raises MissingMetadataError: if 'country_publication' is not in corpus.metadata_fields

    """
    if 'country_publication' not in corpus.metadata_fields:
        raise MissingMetadataError(['country_publication'])

    published_in = [doc.country_publication for doc in corpus.documents
                    if doc.country_publication is not None]

    corpus_name = corpus.name if corpus.name else 'corpus'

    sns.set_style('ticks')
    sns.color_palette('colorblind')
    plt.figure(figsize=(10, 6))
    ax1 = plt.subplot2grid((1, 1), (0, 0))

    # Raw tally per country, in order of first appearance.
    tally = {}
    for place in published_in:
        tally[place] = tally.get(place, 0) + 1
    book_total = len(published_in)

    # A country needs more than .1% of the total books to keep its own bar; everything
    # else is folded into 'Other'.
    grouped = {'Other': 0}
    for place, count in tally.items():
        if place == '':
            continue
        if count > (.001*book_total):
            grouped[place] = count
        else:
            grouped['Other'] += count

    ranked = sorted(grouped.items(), key=lambda pair: -pair[1])
    x = [place for place, _ in ranked]
    y = [count for _, count in ranked]

    for tick_label in ax1.xaxis.get_ticklabels():
        tick_label.set_rotation(15)
    plt.bar(x, y, color='c')
    plt.xlabel('Countries', size=15, weight='bold', color='k')
    plt.ylabel('Frequency', size=15, weight='bold', color='k')
    plt.title('Country of Publication for '+corpus_name.title(), size=18, color='k',
              weight='bold')
    plt.xticks(color='k', size=15)
    plt.yticks(color='k', size=15)
    plt.subplots_adjust(left=.1, bottom=.18, right=.95, top=.9)

    target = filename if filename else 'country_of_pub_for_'+corpus_name
    plt.savefig(target.replace(' ', '_')+'.png')
Exemplo n.º 13
0
    def subcorpus(self, metadata_field, field_value):
        """
        Builds a clone of this corpus that keeps only the documents whose metadata_field
        matches field_value.

        For 'date' the document date is compared against int(field_value). For every other
        field the comparison is case-insensitive; documents whose value is None, or whose
        value cannot be lower-cased, are left out.

        :param metadata_field: metadata field to search
        :param field_value: search term
        :return: Corpus object
        :raises MissingMetadataError: if metadata_field is not in self.metadata_fields

        >>> from gender_analysis.corpus import Corpus
        >>> from gender_analysis.common import TEST_DATA_PATH
        >>> path = TEST_DATA_PATH / 'sample_novels' / 'texts'
        >>> csvpath = TEST_DATA_PATH / 'sample_novels' / 'sample_novels.csv'
        >>> corp = Corpus(path, csv_path=csvpath)
        >>> female_corpus = corp.subcorpus('author_gender','female')
        >>> len(female_corpus)
        39
        >>> female_corpus.documents[0].title
        'The Indiscreet Letter'

        >>> male_corpus = corp.subcorpus('author_gender','male')
        >>> len(male_corpus)
        59
        >>> male_corpus.documents[0].title
        'Lisbeth Longfrock'

        >>> eighteen_fifty_corpus = corp.subcorpus('date','1850')
        >>> len(eighteen_fifty_corpus)
        1
        >>> eighteen_fifty_corpus.documents[0].title
        'The Scarlet Letter'

        >>> jane_austen_corpus = corp.subcorpus('author','Austen, Jane')
        >>> len(jane_austen_corpus)
        2
        >>> jane_austen_corpus.documents[0].title
        'Emma'

        >>> england_corpus = corp.subcorpus('country_publication','England')
        >>> len(england_corpus)
        51
        >>> england_corpus.documents[0].title
        'Flatland'

        """

        if metadata_field not in self.metadata_fields:
            raise MissingMetadataError([metadata_field])

        subset = self.clone()
        subset.documents = []

        # Dates are stored as integers, so they get an exact numeric comparison.
        if metadata_field == 'date':
            subset.documents.extend(
                doc for doc in self.documents if doc.date == int(field_value))
            return subset

        for doc in self.documents:
            value = getattr(doc, metadata_field, None)
            if value is None:
                continue
            try:
                is_match = value.lower() == field_value.lower()
            except AttributeError:
                # Values without .lower() simply cannot match.
                continue
            if is_match:
                subset.documents.append(doc)

        return subset