def overall_analyses():
    """
    Run the women/men divergence analysis for every dataset and analysis type.

    For the journal and the dissertation dataset, and for each of the analysis
    types 'topics' and 'terms', a fresh dataset is loaded, split into a female-
    and a male-authored sub-corpus and handed to DivergenceAnalysis (sorted by
    'dunning'), asking for 12 terms or topics to be printed.

    :return: None
    """
    dataset_classes = [
        ('journals', JournalsDataset),
        ('dissertations', DissertationDataset),
    ]
    for dataset_name, dataset_class in dataset_classes:
        for analysis_type in ('topics', 'terms'):
            print('\n\n\n', dataset_name, analysis_type)

            full_corpus = dataset_class()

            # One sub-corpus per author gender, each filtered from its own copy
            women = full_corpus.copy().filter(author_gender='female')
            men = full_corpus.copy().filter(author_gender='male')

            analysis = DivergenceAnalysis(
                full_corpus, women, men,
                sub_corpus1_name='women',
                sub_corpus2_name='men',
                analysis_type=analysis_type,
                sort_by='dunning',
            )
            analysis.run_divergence_analysis(
                number_of_terms_or_topics_to_print=12)
def draw_heatmap():
    """
    Draw a clustered heatmap of the correlations between the 90 topic columns.

    Uses only the male-authored articles of the journals dataset. The diagonal
    of the correlation matrix is set to 0 before plotting and the color scale
    is limited to [-0.25, 0.25].

    :return: None
    """
    dataset = JournalsDataset()
    dataset.filter(author_gender='male')
    print(len(dataset))

    topic_ids = range(1, 91)
    topic_columns = [f'topic.{topic_id}' for topic_id in topic_ids]
    topic_names = [dataset.topics[topic_id]['name'] for topic_id in topic_ids]

    # Label the columns with the topic names instead of their 'topic.N' ids
    named_topics = dataset.df[topic_columns].rename(
        columns=dict(zip(topic_columns, topic_names)))

    correlations = named_topics.corr()
    # Blank out the self-correlations on the diagonal
    for position in range(90):
        correlations.iat[position, position] = 0.0

    sns.clustermap(
        correlations,
        figsize=(20, 20),
        row_cluster=True,
        col_cluster=True,
        cmap='vlag',
        vmin=-0.25,
        vmax=0.25,
        method='ward',
        xticklabels=topic_names,
        yticklabels=topic_names,
    )
    plt.show()
def get_viz_data_for_gender_broadening():
    """
    Export term data for the articles in the top decile of topic 61.

    Restricts the journals dataset to articles with a percentile score of at
    least 90 for topic 61 and passes it, together with a fixed list of terms,
    to get_individual_topic_viz_data under the name 'gender_top_decile'.

    :return: None
    """
    terms = [
        'work', 'family', 'percent', 'medical', 'age', 'table',
        'married', 'gender', 'white', 'black', 'race',
        'war', 'nation', 'colonial', 'african', 'british',
    ]

    top_decile = JournalsDataset()
    top_decile.topic_score_filter(topic_id=61, min_percentile_score=90)

    get_individual_topic_viz_data(
        terms=terms,
        topic_name='gender_top_decile',
        d=top_decile,
    )
def get_distinctive_terms_for_correlated_topics(topic_id, correlated_topics_list):
    """
    Run a terms divergence analysis between a topic's top articles and the
    subset that also scores highly on each correlated topic.

    The journals dataset is first limited to articles with a percentile score
    of at least 95 for ``topic_id``. For every id in ``correlated_topics_list``
    a copy is filtered the same way for that second topic and compared against
    the base selection.

    :param topic_id: id of the main topic
    :param correlated_topics_list: iterable of topic ids to compare against
    :return: None
    """
    base = JournalsDataset()
    base.topic_score_filter(topic_id=topic_id, min_percentile_score=95)

    for correlated_id in correlated_topics_list:
        base_copy = base.copy()
        both_topics = base_copy.topic_score_filter(
            topic_id=correlated_id, min_percentile_score=95)
        print(correlated_id, len(both_topics.df))

        # The base selection serves both as master corpus and as first sub-corpus
        analysis = DivergenceAnalysis(base, base, both_topics,
                                      analysis_type='terms')
        analysis.run_divergence_analysis()
def show_male_female_publications_over_time(dataset='journals'):
    """
    Plot the share of female-authored documents per year from 1980 onwards.

    Yearly counts of male- and female-authored documents are smoothed with a
    centered 5-year rolling mean; the plotted line is
    female / (female + male). The figure is saved to
    visualizations/dataset_summaries/male_female_articles.png and shown.

    :param dataset: 'journals' loads the journals dataset, any other value the
        dissertation dataset
    :return: (rolling_male, rolling_female, x) - two numpy arrays of smoothed
        counts and the list of years they belong to
    """
    d = JournalsDataset() if dataset == 'journals' else DissertationDataset()
    d.filter(start_year=1980)

    male_counter = Counter()
    female_counter = Counter()
    for gender, year in zip(d.df.m_author_genders, d.df.m_year):
        if gender == 'male':
            male_counter[year] += 1
        elif gender == 'female':
            female_counter[year] += 1

    years = list(range(d.start_year, d.end_year + 1))

    def smoothed(counter):
        # Centered 5-year mean; the first 2 and last 5 entries are dropped
        per_year = [counter[year] for year in years]
        rolling = pd.DataFrame(per_year).rolling(center=True, window=5).mean()
        return np.array(rolling[0].tolist()[2:-5])

    rolling_female = smoothed(female_counter)
    rolling_male = smoothed(male_counter)
    x = years[2:-5]

    plt.figure(figsize=(6, 6))
    # NOTE(review): only the female share is drawn (in blue), although the
    # title below describes separate lines for men and women - confirm.
    female_share = rolling_female / (rolling_female + rolling_male)
    plt.plot(x, female_share, color='blue')
    plt.title('Articles by men (blue) and women (red)')

    output_path = Path(BASE_PATH, 'visualizations', 'dataset_summaries',
                       'male_female_articles.png')
    plt.savefig(output_path)
    plt.show()

    return rolling_male, rolling_female, x
def plot_bechdel(term='she', dataset=None):
    """
    Plot, per year, the share of male- and female-authored articles that use
    ``term`` at least once.

    The yearly shares are smoothed with a centered 5-year rolling mean (first 2
    and last 5 years dropped). The figure is saved to
    visualizations/bechdel/bechdel_<term>.png and shown.

    :param term: str, the term to look for
    :param dataset: dataset to analyze; a new JournalsDataset is loaded only
        when this is None
    :return: (rolling_mean_male, rolling_mean_female, x) - two lists of smoothed
        shares and the list of years they belong to
    """
    # Compare against None explicitly: datasets support len(), so a truthiness
    # test would silently swap an empty (fully filtered) dataset for a new one.
    if dataset is None:
        dataset = JournalsDataset()

    # Only compute the term frequencies when the caller has not done so already.
    if term not in dataset.df.columns:
        dataset.get_vocabulary_and_document_term_matrix(
            vocabulary=[term], use_frequencies=True, store_in_df=True)

    df = dataset.df

    def share_using_term(articles):
        # A year without any articles has no defined share; report NaN instead
        # of raising ZeroDivisionError.
        if len(articles) == 0:
            return float('nan')
        articles_with_term = len(articles[articles[term] > 0])
        # The small constant is added to the share itself, as before.
        return articles_with_term / len(articles) + 0.0000001

    male_data = []
    female_data = []
    for year in range(dataset.start_year, dataset.end_year + 1):
        print(year)
        in_year = df.m_year == year
        male_data.append(
            share_using_term(df[in_year & (df.m_author_genders == 'male')]))
        female_data.append(
            share_using_term(df[in_year & (df.m_author_genders == 'female')]))

    fig = plt.figure(figsize=(6, 6))
    gs = gridspec.GridSpec(nrows=1, ncols=1, figure=fig)
    ax = fig.add_subplot(gs[0, 0])

    # Same years as the [2:-5] slice applied to the rolling means below
    x = list(range(dataset.start_year + 2, dataset.end_year - 4))
    print(x)

    rolling_mean_male = pd.DataFrame(male_data).rolling(
        center=True, window=5).mean()[0].tolist()[2:-5]
    rolling_mean_female = pd.DataFrame(female_data).rolling(
        center=True, window=5).mean()[0].tolist()[2:-5]

    ax.plot(x, rolling_mean_male, color='blue')
    ax.plot(x, rolling_mean_female, color='red')
    ax.set(ylim=(0, 1))
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))

    plt.title(f'Percentage of articles using the word "{term}" at least once')
    plt.savefig(
        Path(BASE_PATH, 'visualizations', 'bechdel', f'bechdel_{term}.png'))
    plt.show()

    return rolling_mean_male, rolling_mean_female, x
def draw_journal_and_dissertation_overview():
    """
    Draw the gender frequency scatterplots for dissertations and journals,
    once with and once without absolute weights.

    Journals are limited to 1951-2010 and dissertations to 1980-2010 before
    plotting.

    :return: None
    """
    for use_absolute_weights in (True, False):
        for dataset_name in ('dissertations', 'journals'):
            if dataset_name == 'journals':
                dataset = JournalsDataset()
                dataset.filter(start_year=1951, end_year=2010)
                title = 'Journals, 1950-2010'
            else:
                dataset = DissertationDataset()
                dataset.filter(start_year=1980, end_year=2010)
                title = 'Dissertations, 1980-2010'

            if use_absolute_weights:
                filename = f'topic_scatter_{dataset_name}_absolute_weights.png'
                title += ', Absolute Weights'
            else:
                filename = f'topic_scatter_{dataset_name}.png'

            draw_gender_frequency_scatterplot(
                dataset,
                figsize=12,
                show_labels=True,
                transparent_image=False,
                dynamic_y_coords=False,
                filename=filename,
                show_plot=True,
                title=title,
                use_absolute_weights=use_absolute_weights,
            )
def get_percentile_data(topic_id):
    """
    Export, per year and author gender, the share of articles whose weight for
    ``topic_id`` falls into each of four weight ranges.

    The ranges are (0, 0.001), (0.001, 0.01), (0.01, 0.1) and (0.1, 1). Every
    yearly series is smoothed with a centered 5-year rolling mean (first 2 and
    last 5 years dropped) and the result is written to
    visualizations/plotly_data/<topic name>_percentiles.csv.

    :param topic_id: id of the topic to analyze
    :return: None
    """
    d = JournalsDataset()
    men = d.copy().filter(author_gender='male')
    women = d.copy().filter(author_gender='female')

    years = list(range(d.start_year, d.end_year + 1))
    topic_weight_ranges = [(0, 0.001), (0.001, 0.01), (0.01, 0.1), (0.1, 1)]

    def share_in_range(gender_dataset, year, weight_min, weight_max):
        # Share of one year's articles whose topic weight lies in the range
        year_slice = gender_dataset.copy().filter(start_year=year,
                                                  end_year=year)
        articles_in_year = len(year_slice)
        articles_in_weight = len(
            year_slice.topic_score_filter(topic_id=topic_id,
                                          min_topic_weight=weight_min,
                                          max_topic_weight=weight_max))
        return articles_in_weight / articles_in_year

    def smoothed(values):
        return pd.DataFrame(values).rolling(
            center=True, window=5).mean()[0].tolist()[2:-5]

    output_df = pd.DataFrame()
    for weight_min, weight_max in topic_weight_ranges:
        data_men = []
        data_women = []
        for year in years:
            data_men.append(share_in_range(men, year, weight_min, weight_max))
            data_women.append(
                share_in_range(women, year, weight_min, weight_max))

        output_df[f'men_{weight_min}-{weight_max}'] = smoothed(data_men)
        output_df[f'women_{weight_min}-{weight_max}'] = smoothed(data_women)

    output_df['years'] = years[2:-5]

    topic_name = d.topics[topic_id]['name'].replace(' ', '_')
    output_df.to_csv(
        Path(BASE_PATH, 'visualizations', 'plotly_data',
             f'{topic_name}_percentiles.csv'))
def draw_set_of_gender_frequency_scatterplots():
    """
    Draw a set of three gender frequency scatterplots for the journals dataset:

    - a large labeling copy that makes it easy to distinguish all of the labels
    - a transparent base layer without labels
    - a small version with all labels (though they look jumbled).

    The loop lists both the equal-samples dataset and the regular dataset, but
    it is left after the first entry, so only the equal-samples plots
    (filename suffix '_eq_samples_dataset') are produced.

    :return: None
    """
    dataset_variants = [('_eq_samples_dataset', True), ('', False)]
    for name, use_equal_samples_dataset in dataset_variants:
        dataset = (
            JournalsDataset(use_equal_samples_dataset=use_equal_samples_dataset)
            if use_equal_samples_dataset
            else JournalsDataset()
        )

        # (figsize, show_labels, transparent_image, filename)
        plot_variants = [
            (36, True, False, f'gfs_labeling_copy{name}.png'),
            (12, False, True, f'gfs_transparent_base_layer{name}.png'),
            (12, True, False, f'gfs_standard_all_labels{name}.png'),
        ]
        for figsize, show_labels, transparent_image, filename in plot_variants:
            draw_gender_frequency_scatterplot(
                dataset,
                figsize=figsize,
                show_labels=show_labels,
                transparent_image=transparent_image,
                filename=filename,
            )

        # Stop after the first dataset variant
        break
def ginis():
    """
    Compute yearly gini values for topic 61 and for all non-noise topics, plot
    them and export them to visualizations/plotly_data/gini.csv.

    For each year the function collects gini(...) for topic 61, the mean
    'topic.61' weight, and the mean gini over all topics whose name does not
    start with 'Noise'. The three series are smoothed with a centered 7-year
    rolling mean (first 3 and last 5 years dropped).

    :return: None
    """
    d_all = JournalsDataset()

    # Topics named 'Noise...' are left out of the all-topics average
    regular_topic_ids = [
        topic_id for topic_id in range(1, 91)
        if not d_all.topics[topic_id]['name'].startswith('Noise')
    ]

    ginis_gender = []
    ginis_all_topics = []
    gender_topics = []
    for year in range(d_all.start_year, d_all.end_year + 1):
        year_d = d_all.copy().filter(start_year=year, end_year=year)
        ginis_gender.append(gini(year_d, topic_id=61))
        gender_topics.append(year_d.df['topic.61'].mean())

        ginis_of_year = [gini(year_d, topic_id=topic_id)
                         for topic_id in regular_topic_ids]
        ginis_all_topics.append(np.array(ginis_of_year).mean())

    def smoothed(values):
        return pd.DataFrame(values).rolling(
            center=True, window=7).mean()[0].tolist()[3:-5]

    gini_gender_rolling = smoothed(ginis_gender)
    gini_all_topics_rolling = smoothed(ginis_all_topics)
    gender_rolling = smoothed(gender_topics)
    years = list(range(d_all.start_year, d_all.end_year + 1))[3:-5]

    plt.plot(years, gini_all_topics_rolling)
    plt.plot(years, gini_gender_rolling)
    plt.title("Gini values for women and gender in all articles")
    plt.show()

    df = pd.DataFrame()
    df['years'] = years
    df['gini_gender'] = gini_gender_rolling
    df['gini_all_topics'] = gini_all_topics_rolling
    df['gender_rolling'] = gender_rolling
    df.to_csv(Path(BASE_PATH, 'visualizations', 'plotly_data', 'gini.csv'))
def get_default_vocabulary():
    """
    Load the default vocabulary of the journal dataset, creating it on first use.

    When data/journal_csv/default_vocabulary.pickle does not exist yet, a
    vocabulary is generated from the journals dataset with max_features=999 and
    stop words excluded, the term 'gay' is appended, and the list is pickled.
    The pickle file is then read back and returned.

    :return: list
    """
    vocabulary_path = (Path(BASE_PATH) / 'data' / 'journal_csv'
                       / 'default_vocabulary.pickle')

    if not vocabulary_path.exists():
        print("generating new default vocabulary with 1000 non-stop word terms.")
        journals = JournalsDataset()
        _, terms = journals.get_vocabulary_and_document_term_matrix(
            max_features=999,
            exclude_stop_words=True,
        )
        # 999 generated terms plus this one give the 1000 announced above
        terms.append('gay')
        with vocabulary_path.open('wb') as outfile:
            pickle.dump(terms, outfile)

    with vocabulary_path.open('rb') as infile:
        return pickle.load(infile)
def get_data():
    """
    Collect the bechdel and frequency series for 'women', 'gender' and
    'topic.61' and write them to visualizations/gender_women.csv.

    The bechdel series come from plot_bechdel and the frequency series from
    ngram_plot, each for male and female authors. One CSV row is written per
    entry of the 'gender' frequency series, with the year in the last column.

    :return: None
    """
    d = JournalsDataset()
    d.get_vocabulary_and_document_term_matrix(vocabulary=['women', 'gender'],
                                              use_frequencies=True,
                                              store_in_df=True)

    from gender_history.visualizations.bechdel_plot import plot_bechdel
    b_women_male, b_women_female, _ = plot_bechdel(term='women', dataset=d)
    b_gender_male, b_gender_female, _ = plot_bechdel(term='gender', dataset=d)

    f_gender_male, f_gender_female, x = ngram_plot('gender')
    f_women_male, f_women_female, _ = ngram_plot('women')
    f_topic_male, f_topic_female, _ = ngram_plot('topic.61')

    header = [
        'topic_male', 'topic_female',
        'women_male', 'women_female',
        'gender_male', 'gender_female',
        'bechdel_women_male', 'bechdel_women_female',
        'bechdel_gender_male', 'bechdel_gender_female',
        'x',
    ]
    # Same order as the header
    columns = [
        f_topic_male, f_topic_female,
        f_women_male, f_women_female,
        f_gender_male, f_gender_female,
        b_women_male, b_women_female,
        b_gender_male, b_gender_female,
        x,
    ]

    # The csv module requires newline='' for files handed to csv.writer;
    # without it, platforms that translate '\n' on write get '\r\r\n' row endings.
    with open(Path(BASE_PATH, 'visualizations', 'gender_women.csv'), 'w',
              newline='') as outfile:
        csv_writer = csv.writer(outfile)
        csv_writer.writerow(header)
        # Index-based on purpose: a series shorter than the 'gender' one raises
        # instead of being silently truncated.
        for i in range(len(f_gender_male)):
            csv_writer.writerow([column[i] for column in columns])
def analysis_nazi_history():
    """
    Compare female- and male-authored journal articles within topic 29
    (Nazi Germany) by topics.

    Only articles with a percentile score of at least 95 for topic 29 are
    kept. The divergence analysis is sorted by 'dunning' and compared to the
    overall weights; 10 topics and 5 articles for each of them are printed.

    :return: None
    """
    d = JournalsDataset()

    # retain only the articles scoring in the top 5% for topic 29 (Nazi Germany)
    d.topic_score_filter(29, min_percentile_score=95)

    # One sub-dataset per author gender
    women = d.copy().filter(author_gender='female')
    men = d.copy().filter(author_gender='male')

    div = DivergenceAnalysis(
        d, women, men,
        sub_corpus1_name='women',
        sub_corpus2_name='men',
        analysis_type='topics',
        sort_by='dunning',
        compare_to_overall_weights=True,
    )
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=5)
def ngram_plot(token):
    """
    Plot the smoothed yearly mean of ``token`` in articles by men and by women.

    ``token`` is either a column that already exists in the dataset (names
    starting with 'topic.') or a term, in which case its frequencies are first
    stored in the male and female sub-datasets.

    :param token: str, term or 'topic.<id>' column name
    :return: (rolling_male, rolling_female, x) - two lists of smoothed yearly
        means and the list of years they belong to
    """
    d = JournalsDataset()
    by_gender = {
        'male': d.copy().filter(author_gender='male'),
        'female': d.copy().filter(author_gender='female'),
    }

    if not token.startswith('topic.'):
        for gender in ('male', 'female'):
            by_gender[gender].get_vocabulary_and_document_term_matrix(
                vocabulary=[token], store_in_df=True, use_frequencies=True)

    years = list(range(d.start_year, d.end_year + 1))

    def smoothed_yearly_means(sub_dataset):
        frame = sub_dataset.df
        yearly = [frame[frame.m_year == year][token].mean() for year in years]
        # NOTE(review): a centered window of 7 leaves three leading NaN values,
        # so the [2:-5] slice keeps one of them - confirm this is intended.
        return pd.DataFrame(yearly).rolling(
            center=True, window=7).mean()[0].tolist()[2:-5]

    rolling_male = smoothed_yearly_means(by_gender['male'])
    rolling_female = smoothed_yearly_means(by_gender['female'])
    x = years[2:-5]

    plt.figure(figsize=(6, 6))
    plt.plot(x, rolling_male, color='blue')
    plt.plot(x, rolling_female, color='red')
    plt.title(f'{token} in articles by men (blue) and women (red)')
    plt.show()

    return rolling_male, rolling_female, x
def analysis_sexuality_time_and_gender():
    """
    Run a terms divergence analysis between male- and female-authored journal
    articles using the default vocabulary.

    Prints 500 terms and 10 articles for each of the top 10 of them.

    :return: None
    """
    d = JournalsDataset()
    male_articles = d.copy().filter(author_gender='male')
    female_articles = d.copy().filter(author_gender='female')
    print(len(male_articles), len(female_articles), len(d))

    # NOTE(review): the sub-corpus labels mention 'early' / 'late' although the
    # corpora are only split by author gender - confirm the labels.
    div = DivergenceAnalysis(
        d, male_articles, female_articles,
        sub_corpus1_name='men early',
        sub_corpus2_name='women late',
        analysis_type='terms',
        sort_by='dunning',
        compare_to_overall_weights=False,
        use_default_vocabulary=True,
    )
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=500)
    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=10)
def analysis_gender_time():
    """
    Compare the terms of 1970-1989 and 1990-2009 within the top decile of
    topic 61.

    The journals dataset is limited to articles with a percentile score of at
    least 90 for topic 61 and then split into the two periods. Prints 20 terms
    and 5 articles for each of the top 10 of them.

    :return: None
    """
    d = JournalsDataset()
    d.topic_score_filter(topic_id=61, min_percentile_score=90)

    periods = [(1970, 1989), (1990, 2009)]
    early, late = (
        d.copy().filter(start_year=first_year, end_year=last_year)
        for first_year, last_year in periods
    )
    print(len(early), len(late), len(d))

    div = DivergenceAnalysis(
        d, early, late,
        sub_corpus1_name='1970-1989',
        sub_corpus2_name='1990-2009',
        analysis_type='terms',
        sort_by='dunning',
        compare_to_overall_weights=False,
        use_default_vocabulary=True,
    )
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=20)
    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=5)
def get_number_of_male_and_female_authored_articles_by_year(
        start_year, end_year, dataset_name) -> (list, list, list):
    """
    Return the yearly shares of male- and female-authored documents for the
    male/female subplot.

    Yearly counts are smoothed with a centered 5-year rolling mean
    (min_periods=1), cut to start_year..end_year and divided by their sum, so
    that the male and female values of a year add up to 1.

    :param start_year: first year to return
    :param end_year: last year to return (inclusive)
    :param dataset_name: 'journals' loads the journals dataset, any other value
        the dissertation dataset
    :return: (male_data, female_data, years) - two numpy arrays of shares and
        the list of years
    """
    d = JournalsDataset() if dataset_name == 'journals' else DissertationDataset()

    number_of_years = d.end_year - d.start_year + 1
    male_counts = [0] * number_of_years
    female_counts = [0] * number_of_years
    for gender, year in zip(d.df.m_author_genders, d.df.m_year):
        if gender == 'male':
            male_counts[year - d.start_year] += 1
        elif gender == 'female':
            female_counts[year - d.start_year] += 1

    # Positions of the requested years within the per-year lists
    first = start_year - d.start_year
    last = end_year + 1 - d.start_year

    def smoothed_selection(counts):
        rolling = pd.DataFrame(counts).rolling(
            center=True, window=5, min_periods=1).mean()[0].tolist()
        return np.array(rolling[first:last])

    male_data = smoothed_selection(male_counts)
    female_data = smoothed_selection(female_counts)

    totals = male_data + female_data
    male_data = male_data / totals
    female_data = female_data / totals

    years = list(range(start_year, end_year + 1))
    assert len(male_data) == len(female_data) == len(years)

    return male_data, female_data, years
def draw_all_years():
    """
    Draw one gender frequency scatterplot per decade and dataset.

    Decades run from the 1960s to the 2000s. Dissertation plots are skipped for
    decades starting before 1980.

    :return: None
    """
    for start_year in range(1960, 2010, 10):
        for dataset_name in ('dissertations', 'journals'):
            if dataset_name == 'journals':
                dataset = JournalsDataset()
            else:
                dataset = DissertationDataset()
                if start_year < 1980:
                    continue

            dataset.filter(start_year=start_year, end_year=start_year + 9)

            filename = f'topic_scatter_{dataset_name}_{start_year}-{start_year+9}.png'
            title = f'{dataset_name.capitalize()}, {start_year}s'
            draw_gender_frequency_scatterplot(
                dataset,
                figsize=12,
                show_labels=True,
                transparent_image=False,
                dynamic_y_coords=False,
                filename=filename,
                show_plot=True,
                title=title,
            )
def generate_lorenz_curves():
    """
    Show one Lorenz curve plot per decade (1960s-2000s) for the journals dataset.

    Each plot contains the Lorenz data of all articles of the decade (green x)
    and of the articles with a percentile score of at least 90 for topic 61
    (blue +), the line of equality, and both gini values rounded to three
    decimals in the title.

    :return: None
    """
    d = JournalsDataset()
    d_topic = d.copy().topic_score_filter(topic_id=61, min_percentile_score=90)

    def scatter_lorenz(ax, lorenz, marker, color):
        # x positions are spread evenly over [0, 1]
        positions = np.arange(lorenz.size) / (lorenz.size - 1)
        ax.scatter(positions, lorenz, marker=marker, color=color, s=100)

    for start_year in range(1960, 2010, 10):
        end_year = start_year + 9
        year_d = d.copy().filter(start_year=start_year, end_year=end_year)
        year_topic_d = d_topic.copy().filter(start_year=start_year,
                                             end_year=end_year)

        lorenz_all = get_lorenz_data(year_d)
        gini_all = np.round(gini(year_d), 3)
        lorenz_topic = get_lorenz_data(year_topic_d)
        gini_topic = np.round(gini(year_topic_d), 3)

        fig, ax = plt.subplots(figsize=[6, 6])
        scatter_lorenz(ax, lorenz_all, marker='x', color='darkgreen')
        scatter_lorenz(ax, lorenz_topic, marker='+', color='blue')

        # Line of equality
        ax.plot([0, 1], [0, 1], color='k')

        ax.set_title(
            f'Lorenz Curve for the Women and Gender Topic in the {start_year}s\n'
            f'Green: all articles. Blue: top decile for women and gender.\n'
            f'Gini coefficient. all articles = {gini_all}. top decile = {gini_topic}'
        )
        plt.show()
def get_default_vocabulary(self, no_terms=1000):
    """
    Load the default vocabulary with ``no_terms`` terms, i.e. a vocabulary
    generated from a journals dataset without any filters.

    The vocabulary is cached in data/dtms/vocabulary_<no_terms>.pickle. When
    that file is missing, the vocabulary is generated, pickled and returned.

    :param no_terms: int, number of terms (passed on as max_features)
    :return: the vocabulary
    """
    vocabulary_path = (Path(BASE_PATH) / 'data' / 'dtms'
                       / f'vocabulary_{no_terms}.pickle')

    if not vocabulary_path.exists():
        print(f"Generating new standard vocabulary with {no_terms} terms.")
        # Imported here rather than at module level
        from gender_history.datasets.dataset_journals import JournalsDataset
        journals = JournalsDataset()
        _, vocabulary = journals.get_vocabulary_and_document_term_matrix(
            max_features=no_terms,
        )
        with vocabulary_path.open('wb') as outfile:
            pickle.dump(vocabulary, outfile)
        return vocabulary

    with vocabulary_path.open('rb') as infile:
        return pickle.load(infile)
def plot_all_topics_and_general_approaches():
    """
    Create a topic percentile plot for every general approach and every topic
    column of the equal-samples journals dataset.

    Columns starting with 'gen_approach' and columns starting with 'topic.' are
    plotted; all other columns are only printed. Plots are not shown.

    :return: None
    """
    dataset = JournalsDataset(use_equal_samples_dataset=True)

    for column in dataset.df.columns:
        print(column)

        if column.startswith('gen_approach'):
            # column[13:] is the part after the 13-character 'gen_approach_' prefix
            selection_name = f'General Approach {column[13:]}'
            filename = f'{column.replace(" ", "_")}.png'
        elif column.startswith('topic.'):
            topic_no = int(column[6:])
            topic_name = dataset.topics[topic_no]['name']
            selection_name = f'Topic {topic_name}'
            filename = f'{topic_no}_{topic_name.replace(" ", "_")}.png'
        else:
            continue

        topic_percentile_plot(
            dataset,
            selection_column=column,
            selection_name=selection_name,
            filename=filename,
            show_plot=False,
        )
def get_1percent_ratios():
    """
    Print the female/male author ratio among the top 1% of articles for every
    topic and general approach column.

    For each column starting with 'topic' or 'gen_a', the articles at or above
    the column's 0.99 quantile are selected and the male- and female-authored
    ones counted. Finally the ten entries with the highest scaled ratio are
    printed.

    :return: None
    """
    dataset = JournalsDataset()
    df = dataset.df

    results = {}
    for column in df.columns:
        is_topic = column.startswith('topic')
        if not (is_topic or column.startswith('gen_a')):
            continue

        top1p = df[df[column] >= df[column].quantile(0.99)]
        male = len(top1p[top1p.m_author_genders == 'male'])
        female = len(top1p[top1p.m_author_genders == 'female'])

        name = dataset.topics[int(column[6:])]['name'] if is_topic else column

        ratio = female / male
        print(column, name, ratio, male, female)
        # NOTE(review): 7506 / 2016 presumably rescales by the overall number
        # of male / female articles - confirm where these constants come from.
        results[name] = ratio * 7506 / 2016

    top_ten = sorted(results.items(), key=lambda item: item[1],
                     reverse=True)[:10]
    for entry in top_ten:
        print(entry)
def analysis_military_history():
    """
    Compare female- and male-authored journal articles within topic 31
    (military history) by topics.

    Only articles with a percentile score of at least 90 for topic 31 are
    kept. Prints 10 topics and 5 articles for each of them.

    :return: None
    """
    for dataset_name in ('journals',):
        for analysis_type in ('topics',):
            print('\n\n\n', dataset_name, analysis_type)

            d = (JournalsDataset() if dataset_name == 'journals'
                 else DissertationDataset())

            # Only the topic analysis is compared to the overall weights
            compare_to_overall_weights = analysis_type == 'topics'

            # retain only the articles scoring in the top 10% for topic 31
            # (military history)
            d.topic_score_filter(31, min_percentile_score=90)

            # One sub-dataset per author gender
            women = d.copy().filter(author_gender='female')
            men = d.copy().filter(author_gender='male')

            div = DivergenceAnalysis(
                d, women, men,
                sub_corpus1_name='women',
                sub_corpus2_name='men',
                analysis_type=analysis_type,
                sort_by='dunning',
                compare_to_overall_weights=compare_to_overall_weights,
            )
            div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
            div.print_articles_for_top_topics(top_terms_or_topics=10,
                                              articles_per_term_or_topic=5)
def get_individual_topic_viz_data(terms, topic_name, d: JournalsDataset=None):
    """
    Export the smoothed yearly mean frequency of each term to
    visualizations/plotly_data/<topic_name>_topic_data.csv.

    Term frequencies are stored in the dataset, averaged per year over all
    articles and smoothed with a centered 5-year rolling mean (first 2 and last
    5 years dropped). The CSV has one 'all_<term>' column per term plus a
    'year' column. Only the series over all articles are exported, so no
    per-gender series are computed.

    :param terms: list of terms to export
    :param topic_name: str, used as prefix of the output filename
    :param d: dataset to analyze; a new JournalsDataset is loaded only when
        this is None
    :return: None
    """
    # Compare against None explicitly: datasets support len(), so a truthiness
    # test would silently swap an empty (fully filtered) dataset for the full
    # journals dataset.
    if d is None:
        d = JournalsDataset()
    d.get_vocabulary_and_document_term_matrix(vocabulary=terms,
                                              use_frequencies=True,
                                              store_in_df=True)

    data = defaultdict(list)
    for year in range(d.start_year, d.end_year + 1):
        all_year = d.copy().filter(start_year=year, end_year=year)
        print(year, len(all_year))
        for term in terms:
            data[f'all_{term}'].append(all_year.df[term].mean())

    avg_data = {}
    for term in terms:
        avg_data[f'all_{term}'] = pd.DataFrame(data[f'all_{term}']).rolling(
            center=True, window=5).mean()[0].tolist()[2:-5]
    avg_data['year'] = list(range(d.start_year, d.end_year + 1))[2:-5]

    df = pd.DataFrame.from_dict(avg_data, orient='index').transpose()
    df['year'] = df['year'].astype(int)
    df.to_csv(Path(BASE_PATH, 'visualizations', 'plotly_data',
                   f'{topic_name}_topic_data.csv'))
def analysis_term_gender():
    """
    Compare male- and female-authored journal articles that use the term
    'gender' at least 10 times.

    Runs a terms divergence analysis sorted by 'frequency_score' without the
    default vocabulary and prints 20 terms.

    :return: None
    """
    d = JournalsDataset()
    d.filter(term_filter={'term': 'gender', 'min_count': 10})

    male_articles = d.copy().filter(author_gender='male')
    female_articles = d.copy().filter(author_gender='female')
    print(len(male_articles), len(female_articles), len(d))

    div = DivergenceAnalysis(
        d, male_articles, female_articles,
        sub_corpus1_name='male',
        sub_corpus2_name='female',
        analysis_type='terms',
        sort_by='frequency_score',
        compare_to_overall_weights=False,
        use_default_vocabulary=False,
    )
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=20)
# # dataset.filter(start_year=2000) # # dataset = JournalsDataset() ''' # post 2000 1534 articles by men, 1040 mention womaen 68% 751 by women, 633 mention womaen 84% # before 1970 1633 by men, 526 mention womaen 32% 140 by women, 63 mention womaen 45% ''' dataset = JournalsDataset() # dataset.get_vocabulary_and_document_term_matrix(max_features=10000, use_frequencies=True, # store_in_df=True) # dataset.df['womaen'] = dataset.df['women'] + dataset.df['woman'] # embed() # topic_percentile_plot(dataset=dataset, selection_column="gender", # selection_name='term: gender', # filename='term_gender.png') topic_percentile_plot(dataset=dataset, selection_column="topic.71", selection_name='Women and Gender', filename='61_percentiles.png') # topic_percentile_plot(dataset=dataset, selection_column="gender", # selection_name='term: gender',
def draw_scatterplots_of_journals():
    """
    Create gender frequency scatterplots for every single journal, for all
    journals except History and Theory, and for JAH + AHR combined.

    :return: None
    """
    valid_journals = {
        'Comparative Studies in Society and History',
        'The Journal of Modern History',
        'The Journal of American History',
        'Journal of World History',
        'The Journal of Interdisciplinary History',
        'Journal of Social History',
        'The American Historical Review',
        'Reviews in American History',
        'History and Theory',
        'Ethnohistory',
    }

    def draw_for_journals(journals, filename):
        # Fresh dataset limited to the given journals, drawn with shared settings
        dataset = JournalsDataset()
        dataset.filter_by_journal(journals)
        draw_gender_frequency_scatterplot(
            dataset,
            figsize=36,
            show_labels=True,
            transparent_image=False,
            filename=filename,
            dynamic_y_coords=True,
        )

    for journal in valid_journals:
        draw_for_journals(
            [journal], f'single_journal{journal.replace(" ", "_")}.png')

    # all except history and theory
    valid_journals.remove('History and Theory')
    draw_for_journals(list(valid_journals),
                      'all_except_history_and_theory.png')

    # AHR and JAH
    draw_for_journals(
        ['The Journal of American History', 'The American Historical Review'],
        'ahr_and_jah.png')
def load_master_viz_data(mode, smoothing=5):
    """
    Load the master visualization data for all terms or all topics, building
    and caching it when no cached file exists.

    The result maps every term (mode 'terms') or every topic and general
    approach column (mode 'topics') to a dict consisting of:
    - year (list of years in the dataset)
    - freq_score (list of yearly frequency scores, i.e.
      female / (female + male), or 0.5 when both frequencies sum to 0)
    - freq, freq_male, freq_female (lists of yearly mean frequencies)
    - mean_freq_score, mean_freq, freq_score_range (floats)

    The data is cached in data/dtms/viz_data_<mode>.pickle. The cache file name
    does not include ``smoothing``, so an existing file is returned regardless
    of the value passed.

    :param mode: 'terms' or 'topics'
    :param smoothing: int, number of years before and after a year that are
        included in its time slice
    :return: dict
    :raises ValueError: if mode is neither 'terms' nor 'topics'
    """
    if mode not in {'terms', 'topics'}:
        raise ValueError('mode has to be either "terms" or "topics".')

    master_viz_path = Path(BASE_PATH, 'data', 'dtms', f'viz_data_{mode}.pickle')
    if master_viz_path.exists():
        with open(master_viz_path, 'rb') as infile:
            return pickle.load(infile)

    print(f"Creating new master viz dataset for {mode}.")
    dataset = JournalsDataset()

    if mode == 'terms':
        _, master_vocabulary = dataset.get_vocabulary_and_document_term_matrix(
            max_features=100000)
        # load text info and turn it into term frequencies
        dataset.get_vocabulary_and_document_term_matrix(
            vocabulary=master_vocabulary,
            use_frequencies=True,
            store_in_df=True)
    else:
        master_vocabulary = [f'topic.{i}' for i in range(1, 91)]
        master_vocabulary.extend(
            column for column in dataset.df.columns
            if column.startswith('gen_approach_'))

    data = {token: defaultdict(list) for token in master_vocabulary}

    df = dataset.df

    # One time slice per year, covering the year +/- `smoothing` years
    for year in range(dataset.start_year, dataset.end_year + 1):
        print(year)
        in_window = ((df.m_year >= year - smoothing)
                     & (df.m_year <= year + smoothing))
        time_slice = df[in_window]
        time_slice_female = time_slice[time_slice.m_author_genders == 'female']
        time_slice_male = time_slice[time_slice.m_author_genders == 'male']

        for token in master_vocabulary:
            freq_both = time_slice[token].mean()
            freq_female = time_slice_female[token].mean()
            freq_male = time_slice_male[token].mean()

            # if a term doesn't appear, it is neutral
            if (freq_male + freq_female) == 0:
                freq_score = 0.5
            else:
                freq_score = freq_female / (freq_female + freq_male)

            entry = data[token]
            entry['year'].append(year)
            entry['freq_score'].append(freq_score)
            entry['freq'].append(freq_both)
            entry['freq_male'].append(freq_male)
            entry['freq_female'].append(freq_female)

    for token in master_vocabulary:
        entry = data[token]
        scores = entry['freq_score']
        entry['mean_freq_score'] = np.mean(scores)
        entry['mean_freq'] = np.mean(entry['freq'])
        entry['freq_score_range'] = max(scores) - min(scores)

    with open(master_viz_path, 'wb') as outfile:
        pickle.dump(data, outfile)

    return data
center=True, window=5).mean()[0].tolist()[2:-5] rolling_mean_female = pd.DataFrame(female_data).rolling( center=True, window=5).mean()[0].tolist()[2:-5] ax.plot(x, rolling_mean_male, color='blue') ax.plot(x, rolling_mean_female, color='red') ax.set(ylim=(0, 1)) ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0)) plt.title(f'Percentage of articles using the word "{term}" at least once') plt.savefig( Path(BASE_PATH, 'visualizations', 'bechdel', f'bechdel_{term}.png')) plt.show() return rolling_mean_male, rolling_mean_female, x if __name__ == '__main__': dataset = JournalsDataset() dataset.get_vocabulary_and_document_term_matrix(max_features=10000, use_frequencies=True, store_in_df=True) plot_bechdel(term='she', dataset=dataset) plot_bechdel(term='women', dataset=dataset) plot_bechdel(term='gender', dataset=dataset) # plot_bechdel(term='sex', dataset=dataset) # plot_bechdel(term='sexuality', dataset=dataset)
def plot_topic_frequency_with_6_terms(topic_id, term_list, store_plot=True):
    """
    Plot the topic weight over time next to the frequency charts of 6 key terms.

    The figure uses a 2 x 6 grid: the topic weight plot takes the first 2 x 2
    cells, the terms fill the next three columns row by row, and the last,
    narrow column holds the colorbar. The term_list can be selected from term
    prob, frex, and divergence analysis.

    :param topic_id: int
    :param term_list: list of the terms to plot
    :param store_plot: bool, save the figure to
        visualizations/topic_frequency_plots when True
    :return: None
    """
    dataset = JournalsDataset()
    master_viz_data = load_master_viz_data(mode='terms')
    topic_name = dataset.topics[topic_id]["name"]

    fig = plt.figure(figsize=(5 * 12, 2 * 12))
    gs = gridspec.GridSpec(
        nrows=2,
        ncols=6,
        figure=fig,
        wspace=0.2,
        hspace=0.2,
        # the final 0.5 is the column the colorbar is drawn into
        width_ratios=[5, 5, 5, 5, 5, 0.5],
    )

    # The topic weight plot goes into the first 2x2 chart section
    ax_topic = fig.add_subplot(gs[0:2, 0:2])
    create_ngram_plot(subplot_ax=ax_topic,
                      term_or_topic_list=[f'topic.{topic_id}'],
                      plot_title="Overall Topic Weight",
                      scale_factor=2)
    # Own y axis label and a title with explicit font size and padding
    ax_topic.set_ylabel('Mean topic weight', fontsize=28)
    ax_topic.set_title(label='Overall Topic Weight', weight='bold',
                       fontsize=32, pad=30)

    # The terms fill columns 2-4, three per row
    for position, term in enumerate(term_list):
        row, offset = divmod(position, 3)
        col = offset + 2
        print(row, col, term)
        term_ax = fig.add_subplot(gs[row, col])
        create_ngram_plot(subplot_ax=term_ax,
                          term_or_topic_list=[term],
                          plot_title=term.capitalize(),
                          master_viz_data=master_viz_data)

    # Colorbar in the narrow last column
    lc = LineCollection([], cmap='coolwarm', norm=plt.Normalize(0.0, 1.0))
    cbar_ax = fig.add_subplot(gs[:, 5])
    cbar = fig.colorbar(lc, cax=cbar_ax, ticks=[0.025, 0.975], fraction=0.03)
    cbar.ax.set_yticklabels(['Only men \nuse a term',
                             'Only women \nuse a term'])
    cbar.ax.tick_params(labelsize=28)

    fig.suptitle(f'{topic_name}: Overall Weight and Key Terms',
                 fontsize=60, weight='bold')

    # y axis labels for two of the term plots, selected by their position in
    # the figure's axes list
    all_axes = fig.get_axes()
    for axes_index in (1, 4):
        all_axes[axes_index].set_ylabel('Mean term frequency', fontsize=14)

    if store_plot:
        filename = f'{topic_id}_{topic_name}.png'
        plt.savefig(Path(BASE_PATH, 'visualizations', 'topic_frequency_plots',
                         filename))
    plt.show()