示例#1
0
def overall_analyses():
    """
    Runs a women-vs-men divergence analysis for every combination of dataset
    (journals, dissertations) and analysis type (topics, terms).

    A fresh dataset is loaded for each combination, two author-gender
    sub-corpora are derived from copies of it, and DivergenceAnalysis is run
    sorted by 'dunning' with number_of_terms_or_topics_to_print=12.

    :return: None
    """

    combinations = [(dataset_name, analysis_type)
                    for dataset_name in ['journals', 'dissertations']
                    for analysis_type in ['topics', 'terms']]

    for dataset_name, analysis_type in combinations:

        print('\n\n\n', dataset_name, analysis_type)

        d = JournalsDataset() if dataset_name == 'journals' else DissertationDataset()

        # The gender filters are applied to copies, not to d itself.
        women = d.copy().filter(author_gender='female')
        men = d.copy().filter(author_gender='male')

        div = DivergenceAnalysis(d,
                                 women,
                                 men,
                                 sub_corpus1_name='women',
                                 sub_corpus2_name='men',
                                 analysis_type=analysis_type,
                                 sort_by='dunning')
        div.run_divergence_analysis(number_of_terms_or_topics_to_print=12)
示例#2
0
def draw_heatmap():
    """
    Draws a clustered heatmap of the pairwise correlations between the weights
    of topics 1-90 in journal articles filtered to author_gender='male'.

    The diagonal (self-correlation) is set to 0 before plotting and the color
    scale is clipped to [-0.25, 0.25].

    :return: None
    """

    journals = JournalsDataset()
    journals.filter(author_gender='male')

    print(len(journals))

    topic_ids = list(range(1, 91))
    topic_columns = [f'topic.{topic_id}' for topic_id in topic_ids]
    topic_names = [journals.topics[topic_id]['name'] for topic_id in topic_ids]

    # Replace the 'topic.N' column labels with the human readable topic names
    weights = journals.df[topic_columns].rename(
        columns=dict(zip(topic_columns, topic_names)))

    correlations = weights.corr()

    # Blank out the diagonal so self-correlations do not dominate the plot
    for position in range(90):
        correlations.iat[position, position] = 0.0

    sns.clustermap(
        correlations,
        figsize=(20, 20),
        row_cluster=True,
        col_cluster=True,
        cmap='vlag',
        vmin=-0.25,
        vmax=0.25,
        method='ward',
        xticklabels=topic_names,
        yticklabels=topic_names)
    plt.show()
def get_viz_data_for_gender_broadening():
    """
    Generates the per-term visualization data ('gender_top_decile') for journal
    articles scoring at or above the 90th percentile for topic 61.

    :return: None
    """

    terms_of_interest = [
        'work', 'family', 'percent', 'medical', 'age', 'table',
        'married',
        'gender', 'white', 'black', 'race',
        'war', 'nation', 'colonial',
        'african', 'british',
    ]

    top_decile = JournalsDataset()
    top_decile.topic_score_filter(topic_id=61, min_percentile_score=90)

    get_individual_topic_viz_data(d=top_decile,
                                  topic_name='gender_top_decile',
                                  terms=terms_of_interest)
示例#4
0
def get_distinctive_terms_for_correlated_topics(topic_id,
                                                correlated_topics_list):
    """
    For each correlated topic, runs a term-level divergence analysis between
    the articles in the top 5 percent for topic_id and the subset of them that
    is also in the top 5 percent for the correlated topic.

    :param topic_id: id of the topic that defines the base corpus
    :param correlated_topics_list: iterable of topic ids to compare against
    :return: None
    """

    base = JournalsDataset()
    base.topic_score_filter(topic_id=topic_id, min_percentile_score=95)

    for correlated_id in correlated_topics_list:
        overlap = base.copy().topic_score_filter(topic_id=correlated_id,
                                                 min_percentile_score=95)
        print(correlated_id, len(overlap.df))

        # The base corpus is passed both as master dataset and first sub-corpus
        DivergenceAnalysis(base, base, overlap,
                           analysis_type='terms').run_divergence_analysis()
示例#5
0
def show_male_female_publications_over_time(dataset='journals'):
    """
    Plots the share of female-authored documents over time, based on 5-year
    centered rolling means of the yearly male and female document counts.

    The figure is saved to visualizations/dataset_summaries/male_female_articles.png
    and then shown.

    NOTE(review): the title reads 'Articles by men (blue) and women (red)' but
    the only line drawn is the female share, in blue. The output filename is
    also the same for journals and dissertations. Confirm whether either is
    intended.

    :param dataset: 'journals' loads the journals dataset; any other value
                    loads the dissertations dataset filtered to start_year=1980
    :return: (rolling_male, rolling_female, x) -- two numpy arrays of smoothed
             counts and the matching list of years
    """

    if dataset == 'journals':
        d = JournalsDataset()
    else:
        d = DissertationDataset()
        d.filter(start_year=1980)

    # One counter per gender; rows with any other gender value are ignored
    counts_by_gender = {'male': Counter(), 'female': Counter()}
    for gender, year in zip(d.df.m_author_genders, d.df.m_year):
        if gender in counts_by_gender:
            counts_by_gender[gender][year] += 1

    all_years = list(range(d.start_year, d.end_year + 1))

    def smoothed_counts(counter):
        yearly = [counter[year] for year in all_years]
        rolling = pd.DataFrame(yearly).rolling(center=True, window=5).mean()
        # Trim the first 2 and the last 5 years
        return np.array(rolling[0].tolist()[2:-5])

    rolling_female = smoothed_counts(counts_by_gender['female'])
    rolling_male = smoothed_counts(counts_by_gender['male'])
    x = all_years[2:-5]

    plt.figure(figsize=(6, 6))
    plt.plot(x, rolling_female / (rolling_female + rolling_male), color='blue')
    plt.title('Articles by men (blue) and women (red)')

    output_path = Path(BASE_PATH, 'visualizations', 'dataset_summaries',
                       'male_female_articles.png')
    plt.savefig(output_path)
    plt.show()

    return rolling_male, rolling_female, x
示例#6
0
def plot_bechdel(term='she', dataset=None):
    """
    Plots, per year, the share of male- and female-authored articles that use
    `term` at least once (5-year centered rolling mean) and saves the figure to
    visualizations/bechdel/bechdel_{term}.png before showing it.

    :param term: str, the term to look up; dataset.df must have a column with
                 this name when a dataset is passed in
    :param dataset: dataset to analyze. If None, a JournalsDataset is loaded and
                    the frequency column for `term` is generated and stored in
                    its df. A dataset passed by the caller is used as is.
    :return: (rolling_mean_male, rolling_mean_female, x) -- two lists of
             smoothed shares and the matching list of years
    """

    # Explicit None check: datasets are used with len() elsewhere in this code
    # base, so `not dataset` could treat an empty caller-supplied dataset as
    # missing and silently swap in the full journals dataset.
    if dataset is None:
        dataset = JournalsDataset()
        dataset.get_vocabulary_and_document_term_matrix(vocabulary=[term],
                                                        use_frequencies=True,
                                                        store_in_df=True)
    df = dataset.df

    def share_using_term(articles):
        # NOTE: raises ZeroDivisionError if there are no articles for this
        # gender and year; the small constant is added after the division and
        # does not guard against that.
        articles_with_term = articles[articles[term] > 0]
        return len(articles_with_term) / len(articles) + 0.0000001

    male_data = []
    female_data = []

    for year in range(dataset.start_year, dataset.end_year + 1):
        print(year)
        in_year = df.m_year == year
        male_data.append(
            share_using_term(df[in_year & (df.m_author_genders == 'male')]))
        female_data.append(
            share_using_term(df[in_year & (df.m_author_genders == 'female')]))

    def smooth(values):
        # 5-year centered rolling mean, dropping the first 2 and last 5 years
        return pd.DataFrame(values).rolling(
            center=True, window=5).mean()[0].tolist()[2:-5]

    fig = plt.figure(figsize=(6, 6))
    gs = gridspec.GridSpec(nrows=1, ncols=1, figure=fig)
    ax = fig.add_subplot(gs[0, 0])

    # Same years that survive the [2:-5] trimming in smooth()
    x = list(range(dataset.start_year + 2, dataset.end_year - 4))
    print(x)
    rolling_mean_male = smooth(male_data)
    rolling_mean_female = smooth(female_data)

    ax.plot(x, rolling_mean_male, color='blue')
    ax.plot(x, rolling_mean_female, color='red')

    ax.set(ylim=(0, 1))
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))

    plt.title(f'Percentage of articles using the word "{term}" at least once')
    plt.savefig(
        Path(BASE_PATH, 'visualizations', 'bechdel', f'bechdel_{term}.png'))
    plt.show()

    return rolling_mean_male, rolling_mean_female, x
示例#7
0
def draw_journal_and_dissertation_overview():
    """
    Draws the gender frequency scatterplot for the dissertations and journals
    datasets, once with and once without absolute weights.

    NOTE(review): the journals title says 1950-2010 while the filter starts at
    1951 -- confirm which one is intended.

    :return: None
    """

    for use_absolute_weights in [True, False]:
        for dataset_name in ['dissertations', 'journals']:

            if dataset_name == 'journals':
                dataset = JournalsDataset()
                first_year = 1951
                title = 'Journals, 1950-2010'
            else:
                dataset = DissertationDataset()
                first_year = 1980
                title = 'Dissertations, 1980-2010'
            dataset.filter(start_year=first_year, end_year=2010)

            # Plots with absolute weights get their own filename and title
            suffix = '_absolute_weights' if use_absolute_weights else ''
            filename = f'topic_scatter_{dataset_name}{suffix}.png'
            if use_absolute_weights:
                title += ', Absolute Weights'

            draw_gender_frequency_scatterplot(
                dataset,
                figsize=12,
                show_labels=True,
                transparent_image=False,
                dynamic_y_coords=False,
                filename=filename,
                show_plot=True,
                title=title,
                use_absolute_weights=use_absolute_weights)
示例#8
0
def get_percentile_data(topic_id):
    """
    For four topic weight ranges, computes per year the share of male- and
    female-authored journal articles whose weight for topic_id falls in that
    range, smooths the shares with a 5-year centered rolling mean and writes
    them to visualizations/plotly_data/{topic_name}_percentiles.csv.

    NOTE: a year without any articles for one gender raises ZeroDivisionError.

    :param topic_id: id of the topic to analyze
    :return: None
    """

    d = JournalsDataset()
    men = d.copy().filter(author_gender='male')
    women = d.copy().filter(author_gender='female')
    years = list(range(d.start_year, d.end_year + 1))
    topic_weight_ranges = [(0, 0.001), (0.001, 0.01), (0.01, 0.1), (0.1, 1)]

    def share_in_range(gender_dataset, year, weight_min, weight_max):
        # Work on a per-year copy; the total is taken before the weight filter
        year_subset = gender_dataset.copy().filter(start_year=year,
                                                   end_year=year)
        total_articles = len(year_subset)
        articles_in_range = len(
            year_subset.topic_score_filter(topic_id=topic_id,
                                           min_topic_weight=weight_min,
                                           max_topic_weight=weight_max))
        return articles_in_range / total_articles

    def smooth(values):
        return pd.DataFrame(values).rolling(
            center=True, window=5).mean()[0].tolist()[2:-5]

    output_df = pd.DataFrame()

    for weight_min, weight_max in topic_weight_ranges:
        data_men = []
        data_women = []
        for year in years:
            data_men.append(share_in_range(men, year, weight_min, weight_max))
            data_women.append(
                share_in_range(women, year, weight_min, weight_max))

        output_df[f'men_{weight_min}-{weight_max}'] = smooth(data_men)
        output_df[f'women_{weight_min}-{weight_max}'] = smooth(data_women)

    # Years are trimmed the same way as the smoothed series
    output_df['years'] = years[2:-5]
    topic_name = d.topics[topic_id]['name'].replace(' ', '_')
    csv_path = Path(BASE_PATH, 'visualizations', 'plotly_data',
                    f'{topic_name}_percentiles.csv')
    output_df.to_csv(csv_path)
示例#9
0
def draw_set_of_gender_frequency_scatterplots():
    """
    Draws a set of three gender frequency scatterplots for the journals dataset:

    - a large labeling copy that makes it easy to distinguish all of the labels
    - a transparent base layer without labels
    - a small version with all labels

    The loop is set up for two configurations (equal samples dataset first,
    then the default dataset), but it ends with a `break`.

    NOTE(review): because of that `break`, only the equal samples configuration
    is ever drawn -- confirm whether this is intentional.

    :return:
    """

    # (figsize, show_labels, transparent_image, filename stem)
    plot_variants = [
        (36, True, False, 'gfs_labeling_copy'),
        (12, False, True, 'gfs_transparent_base_layer'),
        (12, True, False, 'gfs_standard_all_labels'),
    ]

    configurations = [('_eq_samples_dataset', True), ('', False)]

    for name, use_equal_samples_dataset in configurations:

        if use_equal_samples_dataset:
            dataset = JournalsDataset(
                use_equal_samples_dataset=use_equal_samples_dataset)
        else:
            dataset = JournalsDataset()

        for figsize, show_labels, transparent_image, stem in plot_variants:
            draw_gender_frequency_scatterplot(
                dataset,
                figsize=figsize,
                show_labels=show_labels,
                transparent_image=transparent_image,
                filename=f'{stem}{name}.png')

        # Stops after the first configuration (see NOTE in the docstring)
        break
示例#10
0
def ginis():
    """
    Computes, for every year of the journals dataset, the gini value for topic
    61, the mean gini value across all topics whose name does not start with
    'Noise', and the mean weight of topic 61.

    The three series are smoothed with a 7-year centered rolling mean, plotted
    (the two gini series only) and written to visualizations/plotly_data/gini.csv.

    :return: None
    """

    d_all = JournalsDataset()

    all_years = list(range(d_all.start_year, d_all.end_year + 1))
    non_noise_topic_ids = [
        topic_id for topic_id in range(1, 91)
        if not d_all.topics[topic_id]['name'].startswith('Noise')
    ]

    def smooth(values):
        # 7-year centered rolling mean, dropping the first 3 and last 5 years
        return pd.DataFrame(values).rolling(
            center=True, window=7).mean()[0].tolist()[3:-5]

    ginis_gender = []
    ginis_all_topics = []
    gender_topics = []
    for year in all_years:
        year_d = d_all.copy().filter(start_year=year, end_year=year)

        ginis_gender.append(gini(year_d, topic_id=61))
        gender_topics.append(year_d.df['topic.61'].mean())

        per_topic_ginis = [
            gini(year_d, topic_id=topic_id) for topic_id in non_noise_topic_ids
        ]
        ginis_all_topics.append(np.array(per_topic_ginis).mean())

    gini_gender_rolling = smooth(ginis_gender)
    gini_all_topics_rolling = smooth(ginis_all_topics)
    gender_rolling = smooth(gender_topics)
    years = all_years[3:-5]

    plt.plot(years, gini_all_topics_rolling)
    plt.plot(years, gini_gender_rolling)
    plt.title("Gini values for women and gender in all articles")
    plt.show()

    df = pd.DataFrame({
        'years': years,
        'gini_gender': gini_gender_rolling,
        'gini_all_topics': gini_all_topics_rolling,
        'gender_rolling': gender_rolling,
    })
    df.to_csv(Path(BASE_PATH, 'visualizations', 'plotly_data', 'gini.csv'))
示例#11
0
    def get_default_vocabulary():
        """
        Loads the default vocabulary from data/journal_csv/default_vocabulary.pickle.

        If the pickle does not exist yet, it is generated first from the journal
        dataset: up to 999 terms (stop words excluded) plus the term 'gay'.

        :return: the unpickled vocabulary
        """

        pickle_path = Path(BASE_PATH, 'data', 'journal_csv', 'default_vocabulary.pickle')

        vocabulary_is_missing = not pickle_path.exists()
        if vocabulary_is_missing:
            print("generating new default vocabulary with 1000 non-stop word terms.")
            journals = JournalsDataset()
            _, new_vocabulary = journals.get_vocabulary_and_document_term_matrix(
                max_features=999, exclude_stop_words=True
            )
            # 'gay' is always added on top of the 999 generated terms
            new_vocabulary.append('gay')
            with open(pickle_path, 'wb') as outfile:
                pickle.dump(new_vocabulary, outfile)

        # Always read back from disk, also right after generating the file
        with open(pickle_path, 'rb') as infile:
            loaded_vocabulary = pickle.load(infile)
        return loaded_vocabulary
示例#12
0
def get_data():
    """
    Collects the bechdel and ngram series for 'women', 'gender' and 'topic.61'
    and writes them column-wise to visualizations/gender_women.csv.

    Each call to plot_bechdel and ngram_plot also draws its own plot.

    :return: None
    """

    d = JournalsDataset()
    d.get_vocabulary_and_document_term_matrix(vocabulary=['women', 'gender'],
                                              use_frequencies=True,
                                              store_in_df=True)

    from gender_history.visualizations.bechdel_plot import plot_bechdel

    b_women_male, b_women_female, _ = plot_bechdel(term='women', dataset=d)
    b_gender_male, b_gender_female, _ = plot_bechdel(term='gender', dataset=d)

    f_gender_male, f_gender_female, x = ngram_plot('gender')

    f_women_male, f_women_female, _ = ngram_plot('women')
    f_topic_male, f_topic_female, _ = ngram_plot('topic.61')

    header = [
        'topic_male',
        'topic_female',
        'women_male',
        'women_female',
        'gender_male',
        'gender_female',
        'bechdel_women_male',
        'bechdel_women_female',
        'bechdel_gender_male',
        'bechdel_gender_female',
        'x',
    ]
    # Column order must match the header above. zip stops at the shortest
    # series; all series are expected to have the same length.
    rows = zip(f_topic_male, f_topic_female, f_women_male, f_women_female,
               f_gender_male, f_gender_female, b_women_male, b_women_female,
               b_gender_male, b_gender_female, x)

    # newline='' is required by the csv module; without it the writer's own
    # '\r\n' row terminators get translated again and produce blank lines on
    # platforms that use '\r\n' line endings.
    with open(Path(BASE_PATH, 'visualizations', 'gender_women.csv'),
              'w', newline='') as outfile:
        csv_writer = csv.writer(outfile)
        csv_writer.writerow(header)
        csv_writer.writerows(rows)
示例#13
0
def analysis_nazi_history():
    """
    Runs a topic-level women-vs-men divergence analysis on the journal articles
    scoring in the top 5% for topic 29 (Nazi Germany), then prints articles for
    the top topics.

    :return: None
    """

    corpus = JournalsDataset()

    # retain only the articles scoring in the top 5% for topic 29 (Nazi Germany)
    corpus.topic_score_filter(29, min_percentile_score=95)

    # One sub-corpus per author gender, each derived from a copy of the corpus
    women = corpus.copy().filter(author_gender='female')
    men = corpus.copy().filter(author_gender='male')

    div = DivergenceAnalysis(
        corpus,
        women,
        men,
        sub_corpus1_name='women',
        sub_corpus2_name='men',
        analysis_type='topics',
        sort_by='dunning',
        compare_to_overall_weights=True)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)

    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=5)
示例#14
0
def ngram_plot(token):
    """
    Plots the yearly mean value of `token` in journal articles by men (blue)
    and women (red), smoothed with a 7-year centered rolling mean.

    Tokens starting with 'topic.' are read directly from the dataframe; for any
    other token a frequency column is generated and stored in the dataframe
    first.

    NOTE(review): a centered window of 7 yields NaN for the first three years,
    but only the first two are trimmed ([2:-5]), so the first returned value of
    each series is NaN. get_data() relies on the current series length, so
    changing the slice needs to be coordinated with it.

    :param token: str, a term or a 'topic.N' column name
    :return: (rolling_male, rolling_female, x) -- two lists of smoothed means
             and the matching list of years
    """

    d = JournalsDataset()

    male = d.copy().filter(author_gender='male')
    female = d.copy().filter(author_gender='female')

    if not token.startswith('topic.'):
        for gender_subset in (male, female):
            gender_subset.get_vocabulary_and_document_term_matrix(
                vocabulary=[token], store_in_df=True, use_frequencies=True)

    all_years = list(range(d.start_year, d.end_year + 1))

    def smoothed_yearly_means(gender_subset):
        frame = gender_subset.df
        yearly_means = [
            frame[frame.m_year == year][token].mean() for year in all_years
        ]
        return pd.DataFrame(yearly_means).rolling(
            center=True, window=7).mean()[0].tolist()[2:-5]

    rolling_male = smoothed_yearly_means(male)
    rolling_female = smoothed_yearly_means(female)
    x = all_years[2:-5]

    plt.figure(figsize=(6, 6))
    plt.plot(x, rolling_male, color='blue')
    plt.plot(x, rolling_female, color='red')

    plt.title(f'{token} in articles by men (blue) and women (red)')

    plt.show()

    return rolling_male, rolling_female, x
示例#15
0
def analysis_sexuality_time_and_gender():
    """
    Runs a term-level divergence analysis (default vocabulary, sorted by
    'dunning') between male- and female-authored journal articles and prints
    articles for the top terms.

    NOTE(review): the sub-corpora are labeled 'men early' and 'women late', but
    no time filter is applied here -- confirm whether the labels are stale.

    :return: None
    """

    corpus = JournalsDataset()
    men = corpus.copy().filter(author_gender='male')
    women = corpus.copy().filter(author_gender='female')

    print(len(men), len(women), len(corpus))

    analysis_options = {
        'sub_corpus1_name': 'men early',
        'sub_corpus2_name': 'women late',
        'analysis_type': 'terms',
        'sort_by': 'dunning',
        'compare_to_overall_weights': False,
        'use_default_vocabulary': True,
    }
    div = DivergenceAnalysis(corpus, men, women, **analysis_options)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=500)

    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=10)
示例#16
0
def analysis_gender_time():
    """
    Compares 1970-1989 with 1990-2009 within the journal articles scoring at or
    above the 90th percentile for topic 61, using a term-level divergence
    analysis with the default vocabulary, sorted by 'dunning'.

    :return: None
    """

    corpus = JournalsDataset()
    corpus.topic_score_filter(topic_id=61, min_percentile_score=90)

    earlier = corpus.copy().filter(start_year=1970, end_year=1989)
    later = corpus.copy().filter(start_year=1990, end_year=2009)

    print(len(earlier), len(later), len(corpus))

    analysis_options = {
        'sub_corpus1_name': '1970-1989',
        'sub_corpus2_name': '1990-2009',
        'analysis_type': 'terms',
        'sort_by': 'dunning',
        'compare_to_overall_weights': False,
        'use_default_vocabulary': True,
    }
    div = DivergenceAnalysis(corpus, earlier, later, **analysis_options)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=20)

    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=5)
示例#17
0
def get_number_of_male_and_female_authored_articles_by_year(
        start_year, end_year, dataset_name) -> (list, list, list):
    """
    Returns the yearly shares of male- and female-authored articles together
    with the list of years, for the male/female subplot.

    Yearly counts are smoothed with a 5-year centered rolling mean
    (min_periods=1) over the whole dataset, cut to start_year..end_year and then
    normalized so that the male and female values of each year add up to 1.

    :param start_year: first year to return (inclusive)
    :param end_year: last year to return (inclusive)
    :param dataset_name: 'journals' loads the journals dataset, any other value
                         the dissertations dataset
    :return: (male_data, female_data, years) -- two numpy arrays and a list
    """

    d = JournalsDataset() if dataset_name == 'journals' else DissertationDataset()

    first_dataset_year = d.start_year
    number_of_years = d.end_year - first_dataset_year + 1

    # Yearly counts per gender; other gender values are not counted
    counts = {
        'male': [0] * number_of_years,
        'female': [0] * number_of_years,
    }
    for gender, year in zip(d.df.m_author_genders, d.df.m_year):
        if gender in counts:
            counts[gender][year - first_dataset_year] += 1

    def smooth(values):
        return pd.DataFrame(values).rolling(
            center=True, window=5, min_periods=1).mean()[0].tolist()

    # Positions of the requested year range within the full dataset range
    first = start_year - first_dataset_year
    last = end_year + 1 - first_dataset_year

    male_data = np.array(smooth(counts['male'])[first:last])
    female_data = np.array(smooth(counts['female'])[first:last])

    totals = male_data + female_data
    male_data = male_data / totals
    female_data = female_data / totals

    years = list(range(start_year, end_year + 1))

    assert len(male_data) == len(female_data) == len(years)

    return male_data, female_data, years
示例#18
0
def draw_all_years():
    """
    Draws one gender frequency scatterplot per decade (1960s to 2000s) and
    dataset. Dissertations are only plotted for decades starting in 1980 or
    later.

    :return: None
    """

    for start_year in range(1960, 2010, 10):
        for dataset_name in ['dissertations', 'journals']:

            if dataset_name == 'journals':
                dataset = JournalsDataset()
            else:
                # Decide whether to skip before loading, so that no
                # dissertation dataset is built only to be thrown away.
                if start_year < 1980:
                    continue
                dataset = DissertationDataset()

            end_year = start_year + 9
            dataset.filter(start_year=start_year, end_year=end_year)

            draw_gender_frequency_scatterplot(
                dataset,
                figsize=12,
                show_labels=True,
                transparent_image=False,
                dynamic_y_coords=False,
                filename=
                f'topic_scatter_{dataset_name}_{start_year}-{end_year}.png',
                show_plot=True,
                title=f'{dataset_name.capitalize()}, {start_year}s')
示例#19
0
def generate_lorenz_curves():
    """
    Shows one Lorenz curve plot per decade (1960s to 2000s) comparing all
    journal articles (green) with the articles scoring at or above the 90th
    percentile for topic 61 (blue). The rounded gini values of both groups are
    printed in the plot title.

    :return: None
    """

    all_articles = JournalsDataset()
    top_decile = all_articles.copy().topic_score_filter(topic_id=61,
                                                        min_percentile_score=90)

    for start_year in range(1960, 2010, 10):
        decade = {'start_year': start_year, 'end_year': start_year + 9}
        year_d = all_articles.copy().filter(**decade)
        year_topic_d = top_decile.copy().filter(**decade)

        lorenz_all = get_lorenz_data(year_d)
        gini_all = np.round(gini(year_d), 3)
        lorenz_topic = get_lorenz_data(year_topic_d)
        gini_topic = np.round(gini(year_topic_d), 3)

        fig, ax = plt.subplots(figsize=[6, 6])

        # Lorenz curves as scatter plots, x positions evenly spaced over [0, 1]
        for curve, marker, color in ((lorenz_all, 'x', 'darkgreen'),
                                     (lorenz_topic, '+', 'blue')):
            ax.scatter(np.arange(curve.size) / (curve.size - 1),
                       curve,
                       marker=marker,
                       color=color,
                       s=100)

        # Diagonal: line of perfect equality
        ax.plot([0, 1], [0, 1], color='k')

        title = (
            f'Lorenz Curve for the Women and Gender Topic in the {start_year}s\n'
            f'Green: all articles. Blue: top decile for women and gender.\n'
            f'Gini coefficient.  all articles = {gini_all}. top decile = {gini_topic}'
        )
        ax.set_title(title)
        plt.show()
示例#20
0
    def get_default_vocabulary(self, no_terms=1000):
        """
        Returns the default vocabulary with no_terms terms, i.e. a vocabulary
        generated from a journal dataset without any filters.

        The vocabulary is cached in data/dtms/vocabulary_{no_terms}.pickle; it
        is only generated (and then written to that file) when the cache file
        does not exist yet.

        :param no_terms: passed as max_features when generating the vocabulary
        :return: the vocabulary
        """

        cache_path = Path(BASE_PATH, 'data', 'dtms',
                          f'vocabulary_{no_terms}.pickle')

        if not cache_path.exists():
            print(f"Generating new standard vocabulary with {no_terms} terms.")
            # Imported inside the method, not at module level
            from gender_history.datasets.dataset_journals import JournalsDataset
            journals = JournalsDataset()
            _, vocabulary = journals.get_vocabulary_and_document_term_matrix(
                max_features=no_terms)
            with open(cache_path, 'wb') as outfile:
                pickle.dump(vocabulary, outfile)
            return vocabulary

        with open(cache_path, 'rb') as infile:
            return pickle.load(infile)
示例#21
0
def plot_all_topics_and_general_approaches():
    """
    Creates a topic percentile plot (not shown on screen) for every
    'gen_approach...' and every 'topic.N' column of the equal samples journals
    dataset. Every column name is printed, also those that are not plotted.

    :return: None
    """

    dataset = JournalsDataset(use_equal_samples_dataset=True)

    for column in dataset.df.columns:
        print(column)

        if column.startswith('gen_approach'):
            # Characters from position 13 on are used as the approach name
            selection_name = f'General Approach {column[13:]}'
            filename = f'{column.replace(" ", "_")}.png'
        elif column.startswith('topic.'):
            topic_no = int(column[6:])
            topic_name = dataset.topics[topic_no]['name']
            selection_name = f'Topic {topic_name}'
            filename = f'{topic_no}_{topic_name.replace(" ", "_")}.png'
        else:
            continue

        topic_percentile_plot(dataset,
                              selection_column=column,
                              selection_name=selection_name,
                              filename=filename,
                              show_plot=False)
示例#22
0
def get_1percent_ratios():
    """
    For every topic and general approach column, takes the articles at or above
    the 99th percentile of that column and prints the female/male ratio of
    their authors. Finally prints the 10 highest scaled ratios.

    NOTE(review): the scaled ratio multiplies by 7506 / 2016 -- presumably the
    overall male and female article counts; confirm and consider deriving them
    from the dataset. A column without any male-authored article in its top 1%
    raises ZeroDivisionError.

    :return: None
    """

    dataset = JournalsDataset()
    df = dataset.df
    results = {}

    for column in df.columns:
        if not column.startswith(('topic', 'gen_a')):
            continue

        threshold = df[column].quantile(0.99)
        top1p = df[df[column] >= threshold]
        male = len(top1p[top1p.m_author_genders == 'male'])
        female = len(top1p[top1p.m_author_genders == 'female'])

        # Topic columns are reported by topic name, others by column name
        if column.startswith('topic'):
            name = dataset.topics[int(column[6:])]['name']
        else:
            name = column

        print(column, name, female / male, male, female)

        results[name] = female / male * 7506 / 2016

    ranked = sorted(results.items(), key=lambda item: item[1], reverse=True)
    for name_and_ratio in ranked[:10]:
        print(name_and_ratio)
示例#23
0
def analysis_military_history():
    """
    Runs a women-vs-men divergence analysis on the articles scoring in the top
    10% for topic 31 (military history) and prints articles for the top topics.

    The loops currently cover only the journals dataset and the 'topics'
    analysis type.

    :return: None
    """

    for dataset_name in ['journals']:
        for analysis_type in ['topics']:

            print('\n\n\n', dataset_name, analysis_type)

            d = JournalsDataset() if dataset_name == 'journals' else DissertationDataset()

            # Only the topic analysis is compared to the overall weights
            compare_to_overall_weights = analysis_type == 'topics'

            # retain only the articles scoring in the top 10% for topic 31 (military history)
            d.topic_score_filter(31, min_percentile_score=90)

            # One sub-corpus per author gender, each derived from a copy of d
            women = d.copy().filter(author_gender='female')
            men = d.copy().filter(author_gender='male')

            div = DivergenceAnalysis(
                d,
                women,
                men,
                sub_corpus1_name='women',
                sub_corpus2_name='men',
                analysis_type=analysis_type,
                sort_by='dunning',
                compare_to_overall_weights=compare_to_overall_weights)
            div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)

            div.print_articles_for_top_topics(top_terms_or_topics=10,
                                              articles_per_term_or_topic=5)
def get_individual_topic_viz_data(terms, topic_name, d: JournalsDataset=None):
    """
    Computes the yearly mean frequency of each term, smooths it with a 5-year
    centered rolling mean and writes the result to
    visualizations/plotly_data/{topic_name}_topic_data.csv.

    The frequency columns for `terms` are generated on `d` itself and stored in
    its df, so a dataset passed in by the caller is modified.

    :param terms: list of terms to track
    :param topic_name: str, used as prefix of the csv filename
    :param d: dataset to analyze; if None, a new JournalsDataset is loaded
    :return: None
    """

    # Explicit None check: datasets are used with len() elsewhere in this code
    # base, so `not d` could treat an empty, already filtered dataset as missing
    # and silently replace it with the complete journals dataset.
    if d is None:
        d = JournalsDataset()
    d.get_vocabulary_and_document_term_matrix(vocabulary=terms, use_frequencies=True,
                                              store_in_df=True)
    men = d.copy().filter(author_gender='male')
    women = d.copy().filter(author_gender='female')

    years = list(range(d.start_year, d.end_year + 1))
    data = defaultdict(list)

    for year in years:

        men_year = men.copy().filter(start_year=year, end_year=year)
        women_year = women.copy().filter(start_year=year, end_year=year)
        all_year = d.copy().filter(start_year=year, end_year=year)

        print(year, len(all_year))

        for term in terms:
            data[f'men_{term}'].append(men_year.df[term].mean())
            data[f'women_{term}'].append(women_year.df[term].mean())
            data[f'all_{term}'].append(all_year.df[term].mean())

    # NOTE(review): only the 'all_' series are exported. The 'men_' and
    # 'women_' series are collected above but currently not written to the csv.
    avg_data = {}
    for term in terms:
        # 5-year centered rolling mean, dropping the first 2 and last 5 years
        avg_data[f'all_{term}'] = pd.DataFrame(data[f'all_{term}']).rolling(
            center=True, window=5).mean()[0].tolist()[2:-5]

    avg_data['year'] = years[2:-5]

    df = pd.DataFrame.from_dict(avg_data, orient='index').transpose()
    # Convert the year column back to int after the from_dict/transpose round trip
    df['year'] = df['year'].astype(int)

    df.to_csv(Path(BASE_PATH, 'visualizations', 'plotly_data', f'{topic_name}_topic_data.csv'))
示例#25
0
def analysis_term_gender():
    """
    Runs a term-level male-vs-female divergence analysis, sorted by
    'frequency_score' and without the default vocabulary, on the journal
    articles that pass the term filter for 'gender' with min_count 10.

    :return: None
    """

    corpus = JournalsDataset()
    corpus.filter(term_filter={'term': 'gender', 'min_count': 10})

    men = corpus.copy().filter(author_gender='male')
    women = corpus.copy().filter(author_gender='female')

    print(len(men), len(women), len(corpus))

    analysis_options = {
        'sub_corpus1_name': 'male',
        'sub_corpus2_name': 'female',
        'analysis_type': 'terms',
        'sort_by': 'frequency_score',
        'compare_to_overall_weights': False,
        'use_default_vocabulary': False,
    }
    div = DivergenceAnalysis(corpus, men, women, **analysis_options)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=20)
示例#26
0
    # # dataset.filter(start_year=2000)
    #
    # dataset = JournalsDataset()
    '''
    # post 2000
    1534 articles by men, 1040 mention womaen   68%
    751 by women, 633 mention womaen            84%
    
    # before 1970
    1633 by men, 526 mention womaen             32%
    140 by women, 63 mention womaen             45%
    


    '''
    dataset = JournalsDataset()
    # dataset.get_vocabulary_and_document_term_matrix(max_features=10000, use_frequencies=True,
    #                                                 store_in_df=True)
    # dataset.df['womaen'] = dataset.df['women'] + dataset.df['woman']

    # embed()

    # topic_percentile_plot(dataset=dataset, selection_column="gender",
    #                       selection_name='term: gender',
    #                       filename='term_gender.png')
    topic_percentile_plot(dataset=dataset,
                          selection_column="topic.71",
                          selection_name='Women and Gender',
                          filename='61_percentiles.png')
    # topic_percentile_plot(dataset=dataset, selection_column="gender",
    #                       selection_name='term: gender',
示例#27
0
def draw_scatterplots_of_journals():
    """
    Draws gender frequency scatter plots for three kinds of journal selections:
    every journal on its own, all journals except History and Theory, and
    the combination of JAH + AHR.

    :return:
    """

    def plot_selection(journals, filename):
        # every plot starts from a fresh dataset restricted to the given journals
        selection = JournalsDataset()
        selection.filter_by_journal(journals)
        draw_gender_frequency_scatterplot(selection,
                                          figsize=36,
                                          show_labels=True,
                                          transparent_image=False,
                                          filename=filename,
                                          dynamic_y_coords=True)

    all_journals = {
        'Comparative Studies in Society and History',
        'The Journal of Modern History',
        'The Journal of American History',
        'Journal of World History',
        'The Journal of Interdisciplinary History',
        'Journal of Social History',
        'The American Historical Review',
        'Reviews in American History',
        'History and Theory',
        'Ethnohistory',
    }

    # one plot per journal
    for journal_name in all_journals:
        plot_selection(
            [journal_name],
            f'single_journal{journal_name.replace(" ", "_")}.png')

    # all except history and theory
    all_journals.remove('History and Theory')
    plot_selection(list(all_journals), 'all_except_history_and_theory.png')

    # AHR and JAH
    plot_selection(
        ['The Journal of American History', 'The American Historical Review'],
        'ahr_and_jah.png')
示例#28
0
def load_master_viz_data(mode, smoothing=5):
    """
    Loads the master visualization data for all terms or all topics. The data is read
    from a pickle cache if one exists; otherwise it is calculated from the journals
    dataset and stored in the cache.

    The result maps every term or topic column to a dict consisting of:
    - year          (list of years in the dataset)
    - freq_score    (list of yearly frequency scores, i.e.
                     freq_female / (freq_female + freq_male), or 0.5 if both are zero)
    - freq          (list of yearly mean values of the column across all articles)
    - freq_male     (list of yearly mean values across male-authored articles)
    - freq_female   (list of yearly mean values across female-authored articles)
    - mean_freq_score, mean_freq, freq_score_range (floats)

    Every yearly value is calculated over the articles with
    year - smoothing <= m_year <= year + smoothing.

    NOTE: the cache file name only depends on mode. An existing cache is returned as
    is, regardless of the smoothing value passed in.

    :param mode: str, either 'terms' or 'topics'
    :param smoothing: int, number of years included before and after each year
    :return: dict
    :raises ValueError: if mode is neither 'terms' nor 'topics'
    """
    if mode not in {'terms', 'topics'}:
        raise ValueError('mode has to be either "terms" or "topics".')

    master_viz_path = Path(BASE_PATH, 'data', 'dtms',
                           f'viz_data_{mode}.pickle')

    # pickle can execute arbitrary code on load, so the cache file has to be trusted
    if master_viz_path.exists():
        with open(master_viz_path, 'rb') as infile:
            return pickle.load(infile)

    print(f"Creating new master viz dataset for {mode}.")
    dataset = JournalsDataset()
    if mode == 'terms':
        _, master_vocabulary = dataset.get_vocabulary_and_document_term_matrix(
            max_features=100000)
        # load text info and turn it into term frequencies
        dataset.get_vocabulary_and_document_term_matrix(
            vocabulary=master_vocabulary,
            use_frequencies=True,
            store_in_df=True)
    else:
        # the 90 topic columns plus every general approach column
        master_vocabulary = [f'topic.{i}' for i in range(1, 91)]
        master_vocabulary.extend(column for column in dataset.df.columns
                                 if column.startswith('gen_approach_'))

    data = {t: defaultdict(list) for t in master_vocabulary}

    df = dataset.df

    # create time slices for every year
    for year in range(dataset.start_year, dataset.end_year + 1):
        print(year)
        time_slice = df[(df.m_year >= year - smoothing)
                        & (df.m_year <= year + smoothing)]
        time_slice_female = time_slice[time_slice.m_author_genders == 'female']
        time_slice_male = time_slice[time_slice.m_author_genders == 'male']

        for t in master_vocabulary:
            freq_both = time_slice[t].mean()
            freq_female = time_slice_female[t].mean()
            freq_male = time_slice_male[t].mean()

            # if a term doesn't appear, it is neutral
            # (a slice without any articles has a NaN mean, which yields a NaN score)
            if (freq_male + freq_female) == 0:
                freq_score = 0.5
            else:
                freq_score = freq_female / (freq_female + freq_male)

            data[t]['year'].append(year)
            data[t]['freq_score'].append(freq_score)
            data[t]['freq'].append(freq_both)
            data[t]['freq_male'].append(freq_male)
            data[t]['freq_female'].append(freq_female)

    # Summary statistics. The NaN-aware reductions skip years without data; the builtin
    # max / min return order-dependent results as soon as one value is NaN.
    for t in master_vocabulary:
        freq_scores = data[t]['freq_score']
        data[t]['mean_freq_score'] = np.nanmean(freq_scores)
        data[t]['mean_freq'] = np.nanmean(data[t]['freq'])
        data[t]['freq_score_range'] = np.nanmax(freq_scores) - np.nanmin(
            freq_scores)

    with open(master_viz_path, 'wb') as outfile:
        pickle.dump(data, outfile)
    return data
示例#29
0
        center=True, window=5).mean()[0].tolist()[2:-5]
    rolling_mean_female = pd.DataFrame(female_data).rolling(
        center=True, window=5).mean()[0].tolist()[2:-5]

    ax.plot(x, rolling_mean_male, color='blue')
    ax.plot(x, rolling_mean_female, color='red')

    ax.set(ylim=(0, 1))
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))

    plt.title(f'Percentage of articles using the word "{term}" at least once')
    plt.savefig(
        Path(BASE_PATH, 'visualizations', 'bechdel', f'bechdel_{term}.png'))
    plt.show()

    return rolling_mean_male, rolling_mean_female, x


if __name__ == '__main__':

    # store term frequencies in the dataframe of the dataset (max_features=10000)
    # before drawing the plots
    dataset = JournalsDataset()
    dataset.get_vocabulary_and_document_term_matrix(
        max_features=10000, use_frequencies=True, store_in_df=True)

    # further candidate terms that are currently not plotted: 'sex', 'sexuality'
    for bechdel_term in ('she', 'women', 'gender'):
        plot_bechdel(term=bechdel_term, dataset=dataset)
def plot_topic_frequency_with_6_terms(topic_id, term_list, store_plot=True):
    """
    Plots the topic weight over time with the frequency charts of up to 6 key terms
    The term_list can be selected from term prob, frex, and divergence analysis.

    The figure uses a 2 x 6 grid: the topic weight plot fills the 2 x 2 section on the
    left, the term plots fill the next three columns row by row, and the narrow last
    column holds the colorbar.

    :param topic_id: int
    :param term_list: iterable of str, at most 6 terms
    :param store_plot: bool, if True the figure is saved in the
                       visualizations/topic_frequency_plots folder under BASE_PATH
    :return:
    :raises ValueError: if term_list contains more than 6 terms
    """

    # The grid only has room for 2 x 3 term plots. Fail before loading any data instead
    # of with an IndexError from the grid after half of the figure has been drawn.
    terms = list(term_list)
    if len(terms) > 6:
        raise ValueError(
            f'term_list can contain at most 6 terms but {len(terms)} were passed.')

    dataset = JournalsDataset()
    master_viz_data = load_master_viz_data(mode='terms')
    topic_name = dataset.topics[topic_id]["name"]

    fig = plt.figure(figsize=(5 * 12, 2 * 12))
    gs = gridspec.GridSpec(nrows=2,
                           ncols=6,
                           figure=fig,
                           wspace=0.2, hspace=0.2,
                           # final 0.5 is to draw the colorbar into
                           width_ratios=[5, 5, 5, 5, 5, 0.5]
                           )

    # draw the topic weight plot into the first 2x2 chart section
    ax_topic = fig.add_subplot(gs[0:2, 0:2])
    create_ngram_plot(subplot_ax=ax_topic,
                      term_or_topic_list=[f'topic.{topic_id}'],
                      plot_title="Overall Topic Weight",
                      scale_factor=2)
    # slightly reduce title font size and padding and add y axis label
    ax_topic.set_ylabel('Mean topic weight', fontsize=28)
    ax_topic.set_title(label='Overall Topic Weight', weight='bold', fontsize=32, pad=30)

    # add the terms, three per row, starting in the third grid column
    row_leading_axes = []
    for idx, term in enumerate(terms):
        row, position_in_row = divmod(idx, 3)
        col = position_in_row + 2
        print(row, col, term)
        ax = fig.add_subplot(gs[row, col])
        create_ngram_plot(
            subplot_ax=ax,
            term_or_topic_list=[term],
            plot_title=term.capitalize(),
            master_viz_data=master_viz_data
        )
        if position_in_row == 0:
            row_leading_axes.append(ax)

    # Draw colorbar
    lc = LineCollection([], cmap='coolwarm', norm=plt.Normalize(0.0, 1.0))
    cbar_ax = fig.add_subplot(gs[:, 5])
    cbar = fig.colorbar(lc,
                        cax=cbar_ax,
                        ticks=[0.025, 0.975],
                        fraction=0.03)
    cbar.ax.set_yticklabels(['Only men \nuse a term',
                             'Only women \nuse a term'])
    cbar.ax.tick_params(labelsize=28)

    # Draw title
    title = f'{topic_name}: Overall Weight and Key Terms'
    fig.suptitle(title, fontsize=60, weight='bold')

    # Add y axis labels for the first term plot of every row. The axes are tracked
    # explicitly because positional lookups in fig.get_axes() hit the colorbar (or
    # nothing at all) when fewer than 4 terms are plotted.
    for leading_ax in row_leading_axes:
        leading_ax.set_ylabel('Mean term frequency', fontsize=14)

    if store_plot:
        filename = f'{topic_id}_{topic_name}.png'
        plt.savefig(Path(BASE_PATH, 'visualizations', 'topic_frequency_plots', filename))

    plt.show()