Example #1
def __init__(self):
    # Load the labeled posts, build the combined text column, and derive
    # the aggregated "needs moderation" label.
    df = loading.load_extended_posts()
    df = feature_engineering.add_column_text(df)
    self.df = feature_engineering.add_column_label_needsmoderation(df)
    # Balancing configuration; set these before training.
    self.current_label = None
    self.balance_method = None
    self.sampling_strategy = 1
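
The `sampling_strategy = 1` default follows the convention described in the next example's docstring: the target proportion of positive to negative samples after balancing. A worked sketch of that arithmetic, with invented counts:

# Illustrative only; the counts are made up, not taken from the dataset.
n_neg = 900                       # negatives in a hypothetical split
sampling_strategy = 1             # 1.0 -> fully balanced classes
n_pos_target = int(sampling_strategy * n_neg)
print(n_pos_target)               # 900 positives after resampling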
Example #2
import pandas as pd

from utils import loading, feature_engineering


def get_augmented_val_X_y(X, y, label):
    '''Get a dataset with augmented texts for the minority positive label.
    Arguments: X, y - pandas Series containing the validation data that need to be augmented
               label - the label that needs to be augmented
    Return: augmented X, y'''

    label_range = [
        'label_sentimentnegative', 'label_inappropriate',
        'label_discriminating', 'label_offtopic', 'label_needsmoderation',
        'label_negative'
    ]

    if label in label_range:
        file_cached = "./cache/df_r3.csv"

        # First augmentation source: machine-translated ("back-translated")
        # validation texts produced offline.
        try:
            df_aug = pd.read_csv(f'./output/trans_val_{label}.csv')
            X_aug = df_aug['text']
            y_aug = df_aug[label]
            X, y = pd.concat((X, X_aug)), pd.concat((y, y_aug))

        except FileNotFoundError:
            # No back-translations cached for this label; continue with the
            # annotation-round-3 posts only.
            pass
        # Second augmentation source: posts from annotation round 3,
        # cached locally after the first load.
        try:
            df_r3 = pd.read_csv(file_cached)

        except FileNotFoundError:
            df_r3 = loading.load_extended_posts(label=label)
            df_r3 = feature_engineering.add_column_ann_round(df_r3)
            df_r3 = feature_engineering.add_column_text(df_r3)
            df_r3 = df_r3.query('ann_round==3').copy()
            df_r3.to_csv(file_cached)

        df_r3 = feature_engineering.add_column_label_needsmoderation(df_r3)
        # Draw one positive post per article so that single discussions are
        # not over-represented in the augmentation set.
        art_list = list(df_r3.id_article.unique())
        df_ann = pd.DataFrame(columns=df_r3.columns)

        for i in art_list:
            df_pos = df_r3.query(f'id_article=={i} and {label}==1')
            if not df_pos.empty:
                df_ann = pd.concat((df_ann, df_pos.sample(1, random_state=42)))

        return pd.concat((X, df_ann['text'])), pd.concat((y, df_ann[label]))

    else:
        print(
            f'Requested augmentation data for label {label} is not available. Returning the original X, y.'
        )
        return X, y
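
A hedged usage sketch, reusing the loading helpers that appear in the other examples in this file; `label_inappropriate` is just one of the labels in `label_range`:

# Build the validation Series and augment them for one label.
df_val = loading.load_extended_posts(split='val')
df_val = feature_engineering.add_column_text(df_val)
X_val, y_val = df_val['text'], df_val['label_inappropriate']
X_aug, y_aug = get_augmented_val_X_y(X_val, y_val, 'label_inappropriate')
print(len(X_val), '->', len(X_aug))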
Example #3
import pandas as pd

from sklearn.model_selection import train_test_split
from utils import loading, feature_engineering


def create_splits():
    """
    Create a train-val-test split of the labeled data and save it to csv.
    Annotation rounds 2 and 3 are split separately.
    The csv files are written to ./data and contain the id_post.
    Args: None
    Returns: None
    """
    RSEED = 42
    df_posts = loading.load_extended_posts()
    df_posts = feature_engineering.add_column_ann_round(df_posts)

    # The first round of EDA showed that only posts annotated in round 2
    # represent the population. Posts annotated in round 3 are only used for
    # the labels "possiblyFeedback" and "personalStories" (i.e. NaN in any of
    # the other labels) and only in the training set.
    df_ann2 = df_posts.query("ann_round == 2")
    # "label_offtopic != label_offtopic" is only True for NaN (NaN never
    # equals itself), so this selects posts without the round-2 labels.
    df_ann3_feedback_stories = df_posts.query(
        "ann_round == 3 and label_offtopic != label_offtopic")

    # Because the dataset is small (1,000 posts), we keep 100 observations
    # each for the test and validation splits.
    # We stratify by the labels that are least frequent in our 1,000 observations.
    features_strat = [
        'label_discriminating', 'label_possiblyfeedback',
        'label_personalstories', 'label_sentimentpositive'
    ]
    ann2_train, ann2_test = train_test_split(df_ann2,
                                             stratify=df_ann2[features_strat],
                                             random_state=RSEED,
                                             test_size=100)
    ann2_train, ann2_val = train_test_split(
        ann2_train,
        stratify=ann2_train[features_strat],
        random_state=RSEED,
        test_size=100)

    df_train = pd.concat([ann2_train, df_ann3_feedback_stories], axis=0)

    print(f"Number of posts in train-set: {df_train.shape[0]}")
    print(f"Number of posts in val-set: {ann2_val.shape[0]}")
    print(f"Number of posts in test-set: {ann2_test.shape[0]}")
    df_train.id_post.to_csv('./data/ann2_train.csv', header=False)
    ann2_test.id_post.to_csv('./data/ann2_test.csv', header=False)
    ann2_val.id_post.to_csv('./data/ann2_val.csv', header=False)
    print('Splits created.')
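
The `label_offtopic != label_offtopic` query above relies on NaN never comparing equal to itself; a quick demonstration on a toy frame:

import numpy as np
import pandas as pd

toy = pd.DataFrame({'label_offtopic': [0.0, np.nan, 1.0]})
# Only the NaN row survives, because NaN != NaN evaluates to True.
print(toy.query('label_offtopic != label_offtopic'))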
Example #4
        'Inappropriate', 
        'Discriminating', 
        'ArgumentsUsed',
        'PersonalStories', 
        'SentimentPositive',
        'PossiblyFeedback', 
    ]

# %%
# Map the raw grouped label names to their cleaned display names.
y_col_dict = {r: c for r, c in zip(y_col_grouped, y_col_grouped_clean)}

# %% [markdown]
# ## Scores Zero Shot

# %%
data = loading.load_extended_posts()
data = feature_engineering.add_column_ann_round(data)
# Build a single text column from headline and body and flatten newlines.
data.fillna(value={'headline': '', 'body': ''}, inplace=True)
data['text'] = data['headline'] + " " + data['body']
data['text'] = data.text.str.replace('\n', ' ').str.replace('\r', ' ')
data_1000 = data.query('ann_round==2').copy()

# %%
data_pred_1000 = pd.read_csv('../output/zero_shot_result_1000.csv', index_col=0)
data_merge_1000 = pd.merge(data_1000, data_pred_1000, how='left', on='id_post', suffixes=('_true', '_pred'))
scores_1000_05 = scoring.get_score_df(data_merge_1000)
scores_1000_best = scoring.get_score_df(data_merge_1000, best=True)

# %%
scores_zeroshot = scores_1000_05[['label', 'f1_pred']].query('label in @y_col').copy()
# Scalar assignment avoids the index-alignment NaNs that a fresh pd.Series can introduce.
scores_zeroshot['model'] = 'xlm-roberta-large-xnli'
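
# %% [markdown]
# A hedged sketch of scoring a single label directly on the merged frame.
# `scoring.get_score_df` is project code; this cell only illustrates the
# `_true`/`_pred` suffix convention from the merge above and assumes the
# prediction columns hold scores to be thresholded at 0.5 (an assumption).

# %%
from sklearn.metrics import f1_score

label = 'label_inappropriate'
mask = data_merge_1000[f'{label}_true'].notna()
y_true = data_merge_1000.loc[mask, f'{label}_true']
y_pred = (data_merge_1000.loc[mask, f'{label}_pred'] >= 0.5).astype(int)
print(f1_score(y_true, y_pred))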
Example #5
# ---

# %% [markdown]
# # Analyze comments vs article categories #2
# Issue link: https://github.com/dominikmn/one-million-posts/issues/2

# %%
from utils import loading, feature_engineering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# %%
df_posts = loading.load_extended_posts()

# %%
df_articles = loading.load_articles()

# %% [markdown]
# ## Data preparation

# %% [markdown]
# ### Encode post labels

# %%
cols_label = [c for c in df_posts.columns if c.startswith('label_')]

# %%
# Recode the numeric labels to 'yes'/'no' for more readable plots.
df_posts[cols_label] = df_posts[cols_label].replace({0.0: 'no', 1.0: 'yes'})
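
# %% [markdown]
# With the labels recoded to 'yes'/'no', a per-label count plot becomes a
# one-liner. A minimal sketch for one of the columns in `cols_label`:

# %%
sns.countplot(data=df_posts, x='label_discriminating')
plt.show()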
Example #6
        # Score the best model on the train and validation sets and log
        # metrics, confusion matrices, parameters, and the model artifact
        # before closing the tracking run (f1_score, precision_score and
        # recall_score come from sklearn.metrics).
        f1_val = f1_score(y_val, y_pred_val_best)
        precision_val = precision_score(y_val, y_pred_val_best)
        recall_val = recall_score(y_val, y_pred_val_best)
        f1_train = f1_score(y_train_best, y_pred_train_best)
        precision_train = precision_score(y_train_best, y_pred_train_best)
        recall_train = recall_score(y_train_best, y_pred_train_best)
        log_metrics(f1_val, recall_val, precision_val, f1_train, recall_train,
                    precision_train)
        scoring.log_cm(y_train_best, y_pred_train_best, y_val, y_pred_val_best)
        params['sampling_strategy'] = round(best_perc, 2)
        sv_model(params, best_model)
        endrun()


if __name__ == '__main__':
    df_train = loading.load_extended_posts(split='train')
    df_val = loading.load_extended_posts(split='val')

    # Build a single text column from headline and body and flatten newlines.
    df_train.fillna(value={'headline': '', 'body': ''}, inplace=True)
    df_train['text'] = df_train['headline'] + " " + df_train['body']
    df_train['text'] = df_train.text.str.replace('\n', ' ').str.replace('\r', ' ')
    df_val.fillna(value={'headline': '', 'body': ''}, inplace=True)
    df_val['text'] = df_val['headline'] + " " + df_val['body']
    df_val['text'] = df_val.text.str.replace('\n', ' ').str.replace('\r', ' ')

    y_cols = [
        'label_argumentsused', 'label_discriminating', 'label_inappropriate',
        'label_offtopic', 'label_personalstories', 'label_possiblyfeedback',
        'label_sentimentnegative', 'label_sentimentpositive'
    ]
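
A hedged sketch of the continuation this excerpt cuts off before: one run per label, with `train_and_log` standing in for the project's (unshown) training routine:

    # Hypothetical continuation; train_and_log is a stand-in, not project code.
    for label in y_cols:
        train_and_log(df_train, df_val, label)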
Example #7
    '''Get a complete dataset with translations for augmentation.
    Arguments: df - a pandas dataframe
               col - the name of the column containing the German texts
               lang_list - a list of languages for translation
               constructed_url - the url to connect to the api
               headers - the headers for the request to the api
    Return: df_temp - a dataset of (back-)translated texts used to augment the positive labels up to 50%'''
    # Translate the frame once per pivot language and stack the results.
    df_temp = pd.DataFrame(columns=df.columns)
    for lang in lang_list:
        df_trans = translate_azure(df, col, lang, constructed_url, headers)
        df_temp = pd.concat((df_temp, df_trans))
    return df_temp
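
A hedged usage sketch. The excerpt omits the `def` line, so `get_translation_set` below is a stand-in name, and the pivot languages are invented; `df`, `constructed_url`, and `headers` are assumed to come from the `__main__` block below, and the output path mirrors the one read back in Example #2:

# Hypothetical call; 'get_translation_set' stands in for the excerpted
# function and the language list is invented for illustration.
df_pos = df.query('label_inappropriate == 1')
df_aug = get_translation_set(df_pos, 'text', ['en', 'fr'], constructed_url, headers)
df_aug.to_csv('./output/trans_val_label_inappropriate.csv', index=False)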


if __name__ == '__main__':
    df = loading.load_extended_posts(split='train')
    df.fillna(value={'headline': '', 'body': ''}, inplace=True)
    df['text'] = df['headline'] + " " + df['body']
    df['text'] = df.text.str.replace('\n', ' ').str.replace('\r', ' ')
    constructed_url, headers = get_construction(subscription_key, endpoint,
                                                location)
    label_list = [
        'label_argumentsused', 'label_discriminating', 'label_inappropriate',
        'label_offtopic', 'label_personalstories', 'label_possiblyfeedback',
        'label_sentimentnegative', 'label_sentimentpositive'
    ]
    try:
        for label in label_list:
            print(f'started {label}')
            # get_mult/get_lang are project helpers: they presumably derive
            # the required augmentation multiplier for this label and the
            # matching list of pivot languages.
            mult = get_mult(df, label)
            lang_list = get_lang(mult)