Example #1
import pandas as pd

from utils import loading, feature_engineering


def get_augmented_val_X_y(X, y, label):
    '''Get a dataset with augmented texts for the minority positive label.
    Arguments: X, y - pandas Series containing the validation data that needs to be augmented
               label - label that needs to be augmented
    Return: augmented X, y'''

    label_range = [
        'label_sentimentnegative', 'label_inappropriate',
        'label_discriminating', 'label_offtopic', 'label_needsmoderation',
        'label_negative'
    ]

    if label in label_range:
        file_cached = "./cache/df_r3.csv"

        # Append pre-computed augmented texts for this label, if the file exists.
        try:
            df_aug = pd.read_csv(f'./output/trans_val_{label}.csv')
            X_aug = df_aug['text']
            y_aug = df_aug[label]
            X, y = pd.concat((X, X_aug)), pd.concat((y, y_aug))

        except FileNotFoundError:
            pass
        # Load annotation-round-3 posts from the cache, rebuilding it if missing.
        try:
            df_r3 = pd.read_csv(file_cached)

        except FileNotFoundError:
            df_r3 = loading.load_extended_posts(label=label)
            df_r3 = feature_engineering.add_column_ann_round(df_r3)
            df_r3 = feature_engineering.add_column_text(df_r3)
            df_r3 = df_r3.query('ann_round==3').copy()
            df_r3.to_csv(file_cached)

        df_r3 = feature_engineering.add_column_label_needsmoderation(df_r3)
        art_list = list(df_r3.id_article.unique())
        df_ann = pd.DataFrame(columns=df_r3.columns)

        # Sample one positive post per article so the augmentation stays
        # balanced across articles.
        for i in art_list:
            df_ann = pd.concat(
                (df_ann, df_r3.query(f'id_article=={i} and {label}==1').sample(
                    1, random_state=42)))
        return pd.concat((X, df_ann['text'])), pd.concat((y, df_ann[label]))

    else:
        print(
            f'Requested augmentation data for label {label} not available. Returning original X, y.'
        )
        return X, y
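
A minimal usage sketch; `X_val` and `y_val` are hypothetical names for validation text and label Series built elsewhere in the project:

# X_val / y_val are placeholder names for the validation text and label
# Series; 'label_discriminating' is one of the labels listed above.
X_val_aug, y_val_aug = get_augmented_val_X_y(X_val, y_val,
                                             'label_discriminating')
print(f'added {len(X_val_aug) - len(X_val)} augmented posts')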
Example #2
import pandas as pd
from sklearn.model_selection import train_test_split

from utils import loading, feature_engineering


def create_splits():
    """
    Create test-train-val split of labeled data and save it to csv.
    Annotation rounds 2 and 3 are split separately.
    The csv files are written to ./data and contain the id_post.
    Args: None
    Returns: None
    """
    RSEED = 42
    df_posts = loading.load_extended_posts()
    df_posts = feature_engineering.add_column_ann_round(df_posts)

    # The first round of EDA showed that only posts annotated in round 2 represent the population.
    # Posts annotated in round 3 will only be used for the labels "possiblyFeedback" and
    # "personalStories" (i.e. NaN in any of the other labels) and only in the training set.
    df_ann2 = df_posts.query("ann_round == 2")
    # NaN never compares equal to itself, so this query keeps exactly the rows
    # where label_offtopic is NaN.
    df_ann3_feedback_stories = df_posts.query(
        "ann_round == 3 and label_offtopic != label_offtopic")

    # Due to the small dataset (1,000 posts) we keep 100 observations each for the test and validation split.
    # We stratify by the labels that are least frequent in our 1,000 observations.
    features_strat = [
        'label_discriminating', 'label_possiblyfeedback',
        'label_personalstories', 'label_sentimentpositive'
    ]
    ann2_train, ann2_test = train_test_split(df_ann2,
                                             stratify=df_ann2[features_strat],
                                             random_state=RSEED,
                                             test_size=100)
    ann2_train, ann2_val = train_test_split(
        ann2_train,
        stratify=ann2_train[features_strat],
        random_state=RSEED,
        test_size=100)

    df_train = pd.concat([ann2_train, df_ann3_feedback_stories], axis=0)

    print(f"Number of posts in train-set: {df_train.shape[0]}")
    print(f"Number of posts in val-set: {ann2_val.shape[0]}")
    print(f"Number of posts in test-set: {ann2_test.shape[0]}")
    df_train.id_post.to_csv('./data/ann2_train.csv', header=False)
    ann2_test.id_post.to_csv('./data/ann2_test.csv', header=False)
    ann2_val.id_post.to_csv('./data/ann2_val.csv', header=False)
    print('Splits created.')
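
The `label_offtopic != label_offtopic` query above relies on NaN never comparing equal to itself; a toy sketch with made-up data:

import numpy as np
import pandas as pd

# Made-up frame: the comparison is True only where label_offtopic is NaN,
# so the query keeps exactly the unannotated rows.
toy = pd.DataFrame({'id_post': [1, 2, 3], 'label_offtopic': [0.0, 1.0, np.nan]})
print(toy.query('label_offtopic != label_offtopic'))  # only id_post 3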
Example #3
# # EDA: distribution of labels (issue #39)
# Issue link: https://github.com/dominikmn/one-million-posts/issues/39

# %%
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from utils import loading, feature_engineering

# %%
df_ann = loading.load_annotations()
df_ann = feature_engineering.add_column_ann_round(df_ann)
df_ann.head()

# %%
# Duplicate every annotation with ann_round set to "all" so that the
# aggregation below also yields a combined total across rounds.
df_ann_ext = df_ann.copy()
df_ann_ext.ann_round = "all"
df_ann.ann_round = df_ann.ann_round.astype(str)
df_ann_ext = pd.concat([df_ann_ext, df_ann], axis=0)
df_ann_ext.shape

# %%
# Annotation counts per value, annotation round, and category: unstack twice
# to get one column per (value, ann_round) combination.
df_ann_counts = df_ann_ext.groupby(
    ["category",
     "ann_round"])["value"].value_counts().unstack(level=2).unstack(level=1)
Example #4
        'Discriminating',
        'ArgumentsUsed',
        'PersonalStories',
        'SentimentPositive',
        'PossiblyFeedback',
    ]

# %%
# Map each raw label column name to its cleaned display name.
y_col_dict = {r: c for r, c in zip(y_col_grouped, y_col_grouped_clean)}

# %% [markdown]
# ## Scores Zero Shot

# %%
data = loading.load_extended_posts()
data = feature_engineering.add_column_ann_round(data)
# Build one text column from headline and body, stripping line breaks.
data.fillna(value={'headline': '', 'body': ''}, inplace=True)
data['text'] = data['headline'] + " " + data['body']
data['text'] = data.text.str.replace('\n', ' ').str.replace('\r', ' ')
data_1000 = data.query('ann_round==2').copy()

# %%
from utils import scoring  # assumption: scoring lives in utils, like loading above

data_pred_1000 = pd.read_csv('../output/zero_shot_result_1000.csv', index_col=0)
data_merge_1000 = pd.merge(data_1000, data_pred_1000, how='left', on='id_post',
                           suffixes=('_true', '_pred'))
scores_1000_05 = scoring.get_score_df(data_merge_1000)
scores_1000_best = scoring.get_score_df(data_merge_1000, best=True)

# %%
scores_zeroshot = scores_1000_05[['label', 'f1_pred']].query('label in @y_col').copy()
# Broadcast the model name as a scalar; assigning pd.Series([...] * 9) would
# align on the filtered (non-contiguous) index and produce NaNs.
scores_zeroshot['model'] = 'xlm-roberta-large-xnli'
scores_zeroshot.rename(columns={'f1_pred': 'f1_score'}, inplace=True)
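
# %% [markdown]
# A follow-up sketch (assumption: `scores_1000_best` exposes the same `label`
# and `f1_pred` columns as `scores_1000_05`) comparing the default 0.5
# threshold against the tuned one:

# %%
f1_compare = scores_1000_05[['label', 'f1_pred']].merge(
    scores_1000_best[['label', 'f1_pred']], on='label',
    suffixes=('_t05', '_best'))
f1_compare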