# Balance the training data by undersampling whichever class is the majority.
# A positive size_counter means "no" rows outnumber "yes" rows; a negative
# one means the opposite, hence abs() when sampling the "yes" class.
if size_counter > 0:
    sampled_no = df_class_no.sample(count_class_yes * size_counter)
    print("under sized NOs : ", sampled_no['q3_label'].count())
    train = pd.concat([sampled_no, df_class_yes], axis=0)
else:
    sampled_yes = df_class_yes.sample(count_class_no * abs(size_counter))
    print("under sized YESs : ", sampled_yes['q3_label'].count())
    train = pd.concat([sampled_yes, df_class_no], axis=0)

# Encode the q3 label into numeric model labels and keep only the two
# columns the classifier consumes, then preprocess each text row with AraBERT.
train['labels'] = encode(train["q3_label"])
train = train[['text', 'labels']]
arabert_prep = ArabertPreprocessor(model_name=MODEL_NAME)
train['text'] = train['text'].apply(arabert_prep.preprocess)
print("--> train preprocess tokenization done!")

# Load the dev split; rows with no q3 label cannot be scored, so drop them.
dev = pd.read_csv(os.path.join("examples", "arabic", "data", "covid19_disinfo_binary_arabic_dev_input.tsv"), sep='\t')

dev.dropna(subset=["q3_label"], inplace=True)
dev['labels'] = encode(dev["q3_label"])
dev = dev[['text', 'labels']]
# BUG FIX: the original line re-preprocessed `train` here, so the dev text
# was never preprocessed and the model would have seen raw dev text.
dev['text'] = dev['text'].apply(arabert_prep.preprocess)
print("--> dev preprocess tokenization done!")

test = pd.read_csv(os.path.join("examples", "arabic", "data", "covid19_disinfo_binary_arabic_test_input.tsv"), sep='\t')

# One prediction column per cross-validation fold, to be filled fold by fold.
dev_sentences = dev['text'].tolist()
dev_preds = np.zeros((len(dev_sentences), config["n_fold"]))
# Exemplo n.º 2
import torch
from sklearn.model_selection import train_test_split

from examples.nlp4if.arabic.arabert_preprocess import ArabertPreprocessor
from examples.nlp4if.arabic.bb_arabert_tokenized.arabic_bb_arabert_t_config import TEMP_DIRECTORY, MODEL_NAME, config, \
    MODEL_TYPE, SEED
from examples.nlp4if.common.converter import encode
from examples.nlp4if.common.evaluation import precision, recall, f1, confusion_matrix_values
from infominer.classification import ClassificationModel

if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY)

# Load the full training file for the q1 binary task, encode its labels,
# and keep only the columns the classifier consumes.
full = pd.read_csv(os.path.join("examples", "arabic", "data", "covid19_disinfo_binary_arabic_train.tsv"), sep='\t')
full['labels'] = encode(full["q1_label"])
full = full[['text', 'labels']]
arabert_prep = ArabertPreprocessor(model_name=MODEL_NAME)
# BUG FIX: preprocess() expects a single string, not a whole Series —
# apply it row-wise (as the companion scripts in this example set do).
full['text'] = full['text'].apply(arabert_prep.preprocess)

# Hold out a fixed 10% dev split (seeded) for fold-wise predictions.
train, dev = train_test_split(full, test_size=0.1, random_state=777)

dev_sentences = dev['text'].tolist()
dev_preds = np.zeros((len(dev_sentences), config["n_fold"]))

# Train n_fold models; each fold retrains from scratch on a fresh random
# 90/10 split of `full`, seeded with SEED * i for reproducibility.
for i in range(config["n_fold"]):
    # Wipe stale checkpoints so every fold starts from the base model.
    if os.path.exists(config['output_dir']) and os.path.isdir(config['output_dir']):
        shutil.rmtree(config['output_dir'])
    print("Started Fold {}".format(i))
    model = ClassificationModel(MODEL_TYPE, MODEL_NAME, args=config,
                                use_cuda=torch.cuda.is_available())
    # NOTE(review): this re-splits `full`, ignoring the train/dev split made
    # above — dev rows can leak into a fold's training data; confirm intended.
    train_df, eval_df = train_test_split(full, test_size=0.1, random_state=SEED * i)
    model.train_model(train_df, eval_df=eval_df, precision=precision, recall=recall, f1=f1)