# Standard-library and third-party imports used by the functions below.
# Project-specific helpers (load_corpus, try_to_load_as_pickled_object_or_None,
# save_as_pickled_object, get_train_test_index, get_consistency,
# convert_to_matlab, d_KL, ...) are assumed to come from the punctuation
# package imported further down in this file.
import random
import secrets

import numpy as np
import pandas as pd


def get_nn_indexes(size_experiments, df=None, col_class='author',
                   test_size=0.2, random_state=8, save=True,
                   path_index_nn='data/pickle/recognition/author',
                   mode_selection=1):
    # Load cached train/test indexes if they exist; otherwise build one
    # split per experiment size and (optionally) cache the result.
    df_index_nn = try_to_load_as_pickled_object_or_None(
        '{}/df_index_{}.p'.format(path_index_nn, col_class))
    if df_index_nn is None:
        if df is None:
            df = load_corpus()
        indexes = []
        for nb_classes in size_experiments:
            (nb_classes, index_train, index_test) = get_train_test_index(
                nb_classes, df, col_class=col_class, test_size=test_size,
                random_state=random_state, mode_selection=mode_selection)
            indexes.append((nb_classes, index_train, index_test))
        df_index_nn = pd.DataFrame(
            indexes, columns=['nb_classes', 'index_train', 'index_test'])
        if save:
            save_as_pickled_object(
                df_index_nn,
                '{}/df_index_{}.p'.format(path_index_nn, col_class))
    return df_index_nn
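# Usage sketch: build (or load cached) train/test splits for a few
# experiment sizes. The sizes below are illustrative placeholders, not
# values taken from the repo:
#
#     df_index_nn = get_nn_indexes([10, 50, 100], col_class='author')
#     df_index_nn.head()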
def get_temporal(df=None):
    # Add parsed birth/death years, their binned versions, and the middle
    # age of each author's life span to a copy of the corpus.
    if df is None:
        df = load_corpus()
    df_temporal = df.copy()
    df_temporal['author_birthdate'] = df_temporal['author_birthdate'].apply(int_or_nan)
    df_temporal['author_deathdate'] = df_temporal['author_deathdate'].apply(int_or_nan)
    df_temporal['author_birthdate_bin'] = df_temporal['author_birthdate'].apply(bin_date)
    df_temporal['author_deathdate_bin'] = df_temporal['author_deathdate'].apply(bin_date)
    df_temporal['author_middle_age'] = df_temporal[
        ['author_birthdate', 'author_deathdate']].apply(
        lambda row: middle_age(row['author_birthdate'],
                               row['author_deathdate']), axis=1)
    df_temporal['author_middle_age_bin'] = df_temporal['author_middle_age'].apply(bin_date)
    return df_temporal
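# Minimal sketches of the date helpers used above (bin_date, middle_age).
# These are assumptions about utilities defined elsewhere in the package;
# the real bin width and NaN handling may differ:
def bin_date_sketch(year, bin_size=50):
    # Round a year down to the start of its bin; propagate NaN.
    return year if pd.isna(year) else int(year) // bin_size * bin_size


def middle_age_sketch(birthdate, deathdate):
    # Midpoint of the author's life span; NaN if either date is missing.
    if pd.isna(birthdate) or pd.isna(deathdate):
        return np.nan
    return (birthdate + deathdate) / 2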
def compute_baseline(feature, df=None, distance=d_KL, nb_books=1000,
                     col_unique='book_id', col_class='author',
                     path_consistency='data/pickle/consistency/author',
                     reshuffle=False):
    # Baseline distance between books of *different* classes: sample
    # nb_books cross-class pairs and average the feature distance.
    if df is None:
        df = load_corpus()
    new_selection_df = try_to_load_as_pickled_object_or_None(
        '{}/df_consistency_baseline_{}_compared.p'.format(
            path_consistency, feature))
    if new_selection_df is None or reshuffle:
        new_selection_df = get_merged_data(
            feature, df=df, col_unique=col_unique, col_class=col_class,
            path_consistency=path_consistency)
        # Keep only the pairs whose classes differ.
        new_selection_df = new_selection_df[
            new_selection_df['{}_x'.format(col_class)]
            != new_selection_df['{}_y'.format(col_class)]]
        list_books = random.sample(list(new_selection_df.index), nb_books)
        new_selection_df = new_selection_df[
            new_selection_df.index.isin(list_books)]
        feature_x = feature + '_x'
        feature_y = feature + '_y'
        new_selection_df[feature + '_compare'] = new_selection_df.apply(
            lambda row: distance(row[feature_x], row[feature_y]), axis=1)
        # Tag reshuffled runs with a random suffix so they do not
        # overwrite the cached default sample.
        reshuffle_index = secrets.token_hex(4) if reshuffle else ''
        save_as_pickled_object(
            new_selection_df,
            '{}/df_consistency_baseline_{}_compared{}.p'.format(
                path_consistency, feature, reshuffle_index))
    baseline = new_selection_df[feature + '_compare'].mean()
    with open('{}/baseline_{}.txt'.format(path_consistency, col_class),
              'a') as log_file:
        print(str((feature, nb_books, baseline, col_class)), file=log_file)
    return baseline, new_selection_df
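# Usage sketch ('norm_freq_pun' is a hypothetical feature column; real
# column names come from punctuation.config.options):
#
#     baseline, df_pairs = compute_baseline('norm_freq_pun', nb_books=1000)
#     print(baseline)  # mean cross-author distance, i.e. the chance floor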
def get_distance_matrices(list_auth, feature, feature_name, df=None,
                          distance=d_KL, col_name='author',
                          path_distance_matrices='data/pickle/distance_matrices/author'):
    # Pairwise distance matrix between all books of the selected authors,
    # with rows/columns sorted by the author order given in list_auth.
    distance_matrix = try_to_load_as_pickled_object_or_None(
        '{}/dmat_{}_{}.p'.format(path_distance_matrices, len(list_auth),
                                 feature_name))
    list_auths = try_to_load_as_pickled_object_or_None(
        '{}/list_auth_{}_{}.p'.format(path_distance_matrices,
                                      len(list_auth), feature_name))
    if distance_matrix is None or list_auths is None:
        if df is None:
            df = load_corpus()
        df_auth = df[df[col_name].isin(list_auth)].copy()
        df_auth[col_name] = df_auth[col_name].astype('category')
        df_auth[col_name] = df_auth[col_name].cat.set_categories(list_auth)
        df_auth.sort_values([col_name], inplace=True)
        distance_matrix = np.zeros((len(df_auth), len(df_auth)), dtype='f')
        for i in range(len(df_auth)):
            for j in range(len(df_auth)):
                distance_matrix[i, j] = distance(df_auth[feature].iloc[i],
                                                 df_auth[feature].iloc[j])
        # Integer author label for every book, in matrix row order.
        list_auth = list(df_auth[col_name].drop_duplicates())
        list_auths = [list_auth.index(i) for i in list(df_auth[col_name])]
        save_as_pickled_object(
            distance_matrix,
            '{}/dmat_{}_{}.p'.format(path_distance_matrices,
                                     len(list_auth), feature_name))
        save_as_pickled_object(
            list_auths,
            '{}/list_auth_{}_{}.p'.format(path_distance_matrices,
                                          len(list_auth), feature_name))
        convert_to_matlab(
            distance_matrix, list_auths,
            '{}/matlab_{}_{}'.format(path_distance_matrices,
                                     len(list_auth), feature_name))
    return distance_matrix, list_auths
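# Usage sketch (the author names and feature column are placeholders):
#
#     dmat, labels = get_distance_matrices(
#         ['Austen, Jane', 'Dickens, Charles'],
#         'norm_freq_pun', 'norm_freq_pun')
#     # dmat[i, j] is the distance between books i and j; labels[i] is the
#     # integer author id of book i, in matrix row order.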
def return_consistency_data(feature, distance=d_KL,
                            path_consistency='data/pickle/consistency/author',
                            df=None, col='author', col_unique='book_id'):
    # Cached wrapper around get_consistency for one feature.
    df_consistency = try_to_load_as_pickled_object_or_None(
        '{}/df_consistency_{}.p'.format(path_consistency, feature))
    if df_consistency is None:
        if df is None:
            df = load_corpus()
        df_consistency = get_consistency(df, feature=feature,
                                         distance=distance, col=col,
                                         col_unique=col_unique)
        save_as_pickled_object(
            df_consistency,
            '{}/df_consistency_{}.p'.format(path_consistency, feature))
    return df_consistency
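# Usage sketch (placeholder feature name); repeated calls hit the pickle
# cache instead of recomputing:
#
#     df_consistency = return_consistency_data('norm_freq_pun')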
def get_merged_data(feature, df=None, col_unique='book_id',
                    col_class='author',
                    path_consistency='data/pickle/consistency'):
    # Cross join of the corpus with itself on a constant key, so every
    # book is paired with every book (including itself).
    new_selection_df = try_to_load_as_pickled_object_or_None(
        '{}/df_consistency_baseline_{}.p'.format(path_consistency, feature))
    if new_selection_df is None:
        if df is None:
            df = load_corpus()
        selection_df = df.copy()
        selection_df['to_merge'] = 1
        new_selection_df = pd.merge(
            selection_df[['to_merge', feature, col_unique, col_class]],
            selection_df[['to_merge', feature, col_unique, col_class]],
            on='to_merge')
        save_as_pickled_object(
            new_selection_df,
            '{}/df_consistency_baseline_{}.p'.format(path_consistency,
                                                     feature))
    return new_selection_df
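def cross_join_demo():
    # Illustration only, not part of the pipeline: the constant 'to_merge'
    # key in get_merged_data implements a cross join, pairing every row
    # with every row (including itself).
    toy = pd.DataFrame({'book_id': [1, 2], 'author': ['a', 'b']})
    toy['to_merge'] = 1
    # 4 rows: all ordered pairs, with columns suffixed _x/_y by the merge.
    return pd.merge(toy, toy, on='to_merge')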
from punctuation.recognition.recognition_algorithms import (
    launch_nearest_neighbour, launch_neural_net)
from punctuation.config import options
from punctuation.utils.utils import (load_corpus, int_or_nan)
from punctuation.time_series.time_functions import (
    get_temporal, plot_histogram_years, plot_freq_overtime,
    plot_col_overtime)

import pandas as pd
import numpy as np
import matplotlib.style
import matplotlib as mpl

mpl.style.use('seaborn-paper')

df = load_corpus()
df_temporal = get_temporal(df=df)
plot_histogram_years(df_temporal, show_middleyear=False, to_show=True,
                     print_legend=False)
plot_histogram_years(df_temporal, show_middleyear=True, to_show=True,
                     print_legend=False)

list_freq_pun_col = list(range(options.nb_signs))
freq_pun_col_1 = [1, 4, 5]
plot_trans_mat(
    np.reshape(np.array(df[options.norm_transition_mat_col].mean()),
               (10, 10)),
    punctuation_vector=options.punctuation_vector)
plot_scatter_freqs(df, title1=None, title2=None, freq1=None, freq2=None,
                   font_size=18)

full_run = False

### AUTHOR ANALYSIS
df = load_corpus()
plot_list_class(df, class_name='author')
plot_list_class(df, class_name='genre')
plot_list_class(df, class_name='author_birthdate')

# 4. corpus overall info
plot_features(df)
get_overall_hist(df, subfile='freq_pun',
                 punctuation_vector=options.punctuation_vector[:-1] + ['...'],
                 freq_pun_col=options.freq_pun_col)