Example #1
def get_nn_indexes(size_experiments,
                   df=None,
                   col_class='author',
                   test_size=0.2,
                   random_state=8,
                   save=True,
                   path_index_nn='data/pickle/recognition/author',
                   mode_selection=1):
    # Reuse cached train/test splits if a previous run pickled them.
    df_index_nn = try_to_load_as_pickled_object_or_None(
        '{}/df_index_{}.p'.format(path_index_nn, col_class))

    if df_index_nn is None:
        if df is None:
            df = load_corpus()
        indexes = []
        # Build one train/test split per experiment size (number of classes).
        for nb_classes in size_experiments:
            (nb_classes, index_train,
             index_test) = get_train_test_index(nb_classes,
                                                df,
                                                col_class=col_class,
                                                test_size=test_size,
                                                random_state=random_state,
                                                mode_selection=mode_selection)
            indexes.append((nb_classes, index_train, index_test))
        df_index_nn = pd.DataFrame(
            indexes, columns=['nb_classes', 'index_train', 'index_test'])
        if save:
            save_as_pickled_object(
                df_index_nn, '{}/df_index_{}.p'.format(path_index_nn,
                                                       col_class))
    return df_index_nn
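
All of these examples lean on two pickle helpers defined elsewhere in the project. A minimal sketch of what they could look like (the names match the calls above; the bodies are assumptions, not the project's code):

import os
import pickle

def try_to_load_as_pickled_object_or_None(path):
    # Return the unpickled object at path, or None if missing or unreadable.
    if not os.path.exists(path):
        return None
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except (pickle.UnpicklingError, EOFError, OSError):
        return None

def save_as_pickled_object(obj, path):
    # Pickle obj to path, creating parent directories as needed.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)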
Example #2
def get_temporal(df=None):
    if df is None:
        df = load_corpus()

    df_temporal = df.copy()
    # Coerce the date columns to integers (NaN when missing), then bin them.
    df_temporal['author_birthdate'] = df_temporal['author_birthdate'].apply(int_or_nan)
    df_temporal['author_deathdate'] = df_temporal['author_deathdate'].apply(int_or_nan)

    df_temporal['author_birthdate_bin'] = df_temporal['author_birthdate'].apply(bin_date)
    df_temporal['author_deathdate_bin'] = df_temporal['author_deathdate'].apply(bin_date)
    # Estimate each author's mid-life year from the birth and death dates;
    # label-based access replaces the deprecated positional row[0]/row[1].
    df_temporal['author_middle_age'] = df_temporal[['author_birthdate',
                                                    'author_deathdate']].apply(
        lambda row: middle_age(row['author_birthdate'],
                               row['author_deathdate']),
        axis=1)
    df_temporal['author_middle_age_bin'] = df_temporal['author_middle_age'].apply(bin_date)

    return df_temporal
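
get_temporal relies on three small helpers (int_or_nan is imported in Example #7 below; bin_date and middle_age are not shown anywhere in this collection). Plausible minimal versions, offered as assumptions rather than the project's actual code:

import numpy as np

def int_or_nan(value):
    # Parse a year-like value to int; NaN when unparseable.
    try:
        return int(value)
    except (TypeError, ValueError):
        return np.nan

def bin_date(year, bin_size=50):
    # Bucket a year into bin_size-year bins (e.g. 1837 -> 1800); assumed bin width.
    if np.isnan(year):
        return np.nan
    return int(year // bin_size * bin_size)

def middle_age(birth, death):
    # Midpoint of an author's life; NaN if either date is missing.
    if np.isnan(birth) or np.isnan(death):
        return np.nan
    return (birth + death) / 2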
Example #3
def compute_baseline(feature,
                     df=None,
                     distance=d_KL,
                     nb_books=1000,
                     col_unique='book_id',
                     col_class='author',
                     path_consistency='data/pickle/consistency/author',
                     reshuffle=False):

    if df is None:
        df = load_corpus()

    new_selection_df = try_to_load_as_pickled_object_or_None(
        '{}/df_consistency_baseline_{}_compared.p'.format(path_consistency,
                                                          feature))

    if new_selection_df is None or reshuffle:
        new_selection_df = get_merged_data(feature,
                                           df=df,
                                           col_unique=col_unique,
                                           col_class=col_class,
                                           path_consistency=path_consistency)

        # Keep only cross-class pairs: the baseline compares books whose
        # col_class values (e.g. authors) differ.
        new_selection_df = new_selection_df[
            new_selection_df['{}_x'.format(col_class)]
            != new_selection_df['{}_y'.format(col_class)]]

        # Sample nb_books random pairs and compute their pairwise distance.
        list_books = random.sample(list(new_selection_df.index), nb_books)
        new_selection_df = new_selection_df[new_selection_df.index.isin(
            list_books)]
        feature_x = feature + '_x'
        feature_y = feature + '_y'
        new_selection_df[feature + '_compare'] = new_selection_df.apply(
            lambda row: distance(row[feature_x], row[feature_y]), axis=1)
        reshuffle_index = secrets.token_hex(4) if reshuffle else ''
        save_as_pickled_object(
            new_selection_df,
            '{}/df_consistency_baseline_{}_compared{}.p'.format(
                path_consistency, feature, reshuffle_index))

    baseline = new_selection_df[feature + '_compare'].mean()
    # Append the result to a log file without leaking an open file handle.
    with open('{}/baseline_{}.txt'.format(path_consistency, col_class),
              'a') as log_file:
        print(str((feature, nb_books, baseline, col_class)), file=log_file)

    return baseline, new_selection_df
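
Every example here takes distance=d_KL as its default, but the function itself lives elsewhere in the punctuation package. A minimal sketch of a KL-style divergence between two frequency vectors; the symmetrisation and the eps smoothing are assumptions, not the project's definition:

import numpy as np

def d_KL(p, q, eps=1e-12):
    # Symmetrised Kullback-Leibler divergence between two frequency vectors,
    # smoothed with eps so zero entries do not produce infinities.
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    return float(np.sum(p * np.log(p / q)) + np.sum(q * np.log(q / p)))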
Example #4
def get_distance_matrices(
        list_auth,
        feature,
        feature_name,
        df=None,
        distance=d_KL,
        col_name='author',
        path_distance_matrices='data/pickle/distance_matrices/author'):
    # Fix the author count once so load and save use the same cache key
    # (the original recomputed len(list_auth) after reassigning it below).
    n_auth = len(list_auth)
    distance_matrix = try_to_load_as_pickled_object_or_None(
        '{}/dmat_{}_{}.p'.format(path_distance_matrices, n_auth,
                                 feature_name))
    list_auths = try_to_load_as_pickled_object_or_None(
        '{}/list_auth_{}_{}.p'.format(path_distance_matrices, n_auth,
                                      feature_name))

    if distance_matrix is None or list_auths is None:

        if df is None:
            df = load_corpus()
        # Copy the selection so the reordering below cannot mutate df.
        df_auth = df[df[col_name].isin(list_auth)].copy()
        # Sort rows in the order of list_auth via an ordered categorical;
        # cat.set_categories(..., inplace=True) is removed in recent pandas.
        df_auth[col_name] = df_auth[col_name].astype("category")
        df_auth[col_name] = df_auth[col_name].cat.set_categories(list_auth)
        df_auth.sort_values([col_name], inplace=True)

        # Dense pairwise distance matrix over all books of the selected authors.
        distance_matrix = np.zeros((len(df_auth), len(df_auth)), dtype='f')
        for i in range(len(df_auth)):
            for j in range(len(df_auth)):
                distance_matrix[i, j] = distance(df_auth[feature].iloc[i],
                                                 df_auth[feature].iloc[j])
        # Map every row to the index of its author in list_auth.
        list_auth = list(df_auth[col_name].drop_duplicates())
        list_auths = [list_auth.index(i) for i in list(df_auth[col_name])]
        save_as_pickled_object(
            distance_matrix, '{}/dmat_{}_{}.p'.format(path_distance_matrices,
                                                      n_auth, feature_name))
        save_as_pickled_object(
            list_auths, '{}/list_auth_{}_{}.p'.format(path_distance_matrices,
                                                      n_auth, feature_name))

    convert_to_matlab(
        distance_matrix,
        list_auths, '{}/matlab_{}_{}'.format(path_distance_matrices,
                                             n_auth, feature_name))
    return distance_matrix, list_auths
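
A hypothetical call, with placeholder author names and 'freq_pun' standing in for a column of per-book punctuation-frequency vectors:

# Hypothetical usage: the author names and the 'freq_pun' column are
# placeholders, not guarantees about the corpus.
authors = ['Dickens, Charles', 'Austen, Jane', 'Twain, Mark']
dmat, auth_index = get_distance_matrices(authors,
                                         feature='freq_pun',
                                         feature_name='freq_pun')
print(dmat.shape, len(auth_index))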
Example #5
def return_consistency_data(feature,
                            distance=d_KL,
                            path_consistency='data/pickle/consistency/author',
                            df=None,
                            col='author',
                            col_unique='book_id'):
    df_consistency = try_to_load_as_pickled_object_or_None(
        '{}/df_consistency_{}.p'.format(path_consistency, feature))
    if df_consistency is None:
        if df is None:
            df = load_corpus()
        df_consistency = get_consistency(df,
                                         feature=feature,
                                         distance=distance,
                                         col=col,
                                         col_unique=col_unique)
        save_as_pickled_object(
            df_consistency,
            '{}/df_consistency_{}.p'.format(path_consistency, feature))

    return df_consistency
Example #6
def get_merged_data(feature,
                    df=None,
                    col_unique='book_id',
                    col_class='author',
                    path_consistency='data/pickle/consistency'):

    new_selection_df = try_to_load_as_pickled_object_or_None(
        '{}/df_consistency_baseline_{}.p'.format(path_consistency, feature))
    if new_selection_df is None:
        if df is None:
            df = load_corpus()
        selection_df = df.copy()
        # Cross join the corpus with itself via a constant merge key,
        # producing every (book, book) pair with _x/_y suffixed columns.
        selection_df['to_merge'] = 1
        new_selection_df = pd.merge(
            selection_df[['to_merge', feature, col_unique, col_class]],
            selection_df[['to_merge', feature, col_unique, col_class]],
            on='to_merge')

        save_as_pickled_object(
            new_selection_df,
            '{}/df_consistency_baseline_{}.p'.format(path_consistency,
                                                     feature))
    return new_selection_df
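
On pandas 1.2+ the constant-key trick can be replaced by a native cross join. A sketch of the equivalent merge, written as a hypothetical drop-in for the body above:

import pandas as pd

def cross_pairs(selection_df, feature, col_unique='book_id', col_class='author'):
    # Equivalent of the 'to_merge' trick: every (row, row) pair with
    # _x/_y suffixed columns, using pandas' native cross join (>= 1.2).
    cols = [feature, col_unique, col_class]
    return pd.merge(selection_df[cols], selection_df[cols], how='cross')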
Example #7
from punctuation.recognition.recognition_algorithms import (
    launch_nearest_neighbour, launch_neural_net)

from punctuation.config import options
from punctuation.utils.utils import (load_corpus, int_or_nan)
from punctuation.time_series.time_functions import (get_temporal,
                                                    plot_histogram_years,
                                                    plot_freq_overtime,
                                                    plot_col_overtime)
import pandas as pd
import numpy as np
import matplotlib.style
import matplotlib as mpl
mpl.style.use('seaborn-paper')  # renamed 'seaborn-v0_8-paper' in matplotlib >= 3.6

df = load_corpus()
df_temporal = get_temporal(df=df)

plot_histogram_years(df_temporal,
                     show_middleyear=False,
                     to_show=True,
                     print_legend=False)

plot_histogram_years(df_temporal,
                     show_middleyear=True,
                     to_show=True,
                     print_legend=False)

list_freq_pun_col = list(range(options.nb_signs))

freq_pun_col_1 = [1, 4, 5]
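
The imported plot_freq_overtime is never called in this fragment; a plausible follow-up using the column lists just defined, hedged because the real signature lives in punctuation.time_series.time_functions:

# Hypothetical usage: the argument order is an assumption about the signature.
plot_freq_overtime(df_temporal, freq_pun_col_1)
plot_freq_overtime(df_temporal, list_freq_pun_col)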
Example #8
plot_trans_mat(np.reshape(np.array(df[options.norm_transition_mat_col].mean()),
                          (10, 10)),
               punctuation_vector=options.punctuation_vector)

plot_scatter_freqs(df,
                   title1=None,
                   title2=None,
                   freq1=None,
                   freq2=None,
                   font_size=18)


full_run = False

### AUTHOR ANALYSIS
df = load_corpus()

# sys.exit(2)  # leftover debugging exit (sys was never imported); disabled so the analysis below runs

plot_list_class(df, class_name='author')
plot_list_class(df, class_name='genre')
plot_list_class(df, class_name='author_birthdate')

#4. corpus overall info
plot_features(df)

get_overall_hist(df,
                 subfile='freq_pun',
                 punctuation_vector=options.punctuation_vector[:-1] + ['...'],
                 freq_pun_col=options.freq_pun_col)