# K-fold training driver: allocates per-dataset prediction matrices, then
# trains and reloads a model once per fold.
# NOTE(review): the statement nesting below was reconstructed from a
# whitespace-collapsed source line -- confirm it against the original file.

# One (n_rows, n_fold) zero matrix per dev/test set; the matrices are
# collected in dev_preds_list / test_preds_list in the same order as the
# input lists.
for dev, test in zip(dev_list, test_list):
    dev_preds = np.zeros((len(dev), config["n_fold"]))
    test_preds = np.zeros((len(test), config["n_fold"]))
    dev_preds_list.append(dev_preds)
    test_preds_list.append(test_preds)

for i in range(config["n_fold"]):
    # Remove the output directory left over from a previous fold/run so this
    # fold starts from a clean directory.
    if os.path.exists(config['output_dir']) and os.path.isdir(
            config['output_dir']):
        shutil.rmtree(config['output_dir'])
    print("Started Fold {}".format(i))
    # num_labels=1 -- presumably a single-output (regression) head; confirm
    # against the ClassificationModel implementation in use.
    model = ClassificationModel(MODEL_TYPE, MODEL_NAME, args=config, num_labels=1,
                                use_cuda=torch.cuda.is_available())
    # 80/20 train/eval split, re-seeded per fold with SEED * i (so fold 0
    # always uses random_state=0).
    train_df, eval_df = train_test_split(train, test_size=0.2, random_state=SEED * i)
    # mean_absolute_error is handed to train_model under the keyword "mae";
    # presumably it is reported as an evaluation metric -- TODO confirm.
    model.train_model(train_df, eval_df=eval_df, mae=mean_absolute_error)
    # Re-create the model from config["best_model_dir"] (presumably the best
    # checkpoint written during training) before predicting.
    model = ClassificationModel(MODEL_TYPE, config["best_model_dir"], args=config, num_labels=1,
                                use_cuda=torch.cuda.is_available())
    for dev_sentences, test_sentences, dev_preds, test_preds in zip(
            dev_sentences_list, test_sentences_list, dev_preds_list, test_preds_list):
        # NOTE(review): the body of this loop lies outside the visible excerpt.
print("--> dev preprocess tokenization done!")

# Load the test inputs (tab-separated) and run the same text preprocessing
# over the "text" column before extracting the sentences.
test_path = os.path.join("examples", "arabic", "data",
                         "covid19_disinfo_binary_arabic_test_input.tsv")
test = pd.read_csv(test_path, sep='\t')
test['text'] = test['text'].apply(arabert_prep.preprocess)

dev_sentences = dev['text'].tolist()
test_sentences = test['text'].tolist()

# One row per sentence, one column per fold; column i is filled by fold i.
dev_preds = np.zeros((len(dev_sentences), config["n_fold"]))
test_preds = np.zeros((len(test_sentences), config["n_fold"]))

for i in range(config["n_fold"]):
    # Start every fold from a clean output directory. isdir() is already
    # False for a path that does not exist, so no separate exists() check.
    if os.path.isdir(config['output_dir']):
        shutil.rmtree(config['output_dir'])

    print("Started Fold {}".format(i))

    model = ClassificationModel(
        MODEL_TYPE,
        MODEL_NAME,
        args=config,
        use_cuda=torch.cuda.is_available(),
    )

    # 90/10 train/eval split, re-seeded per fold with SEED * i.
    train_df, eval_df = train_test_split(
        train,
        test_size=0.1,
        random_state=SEED * i,
    )

    # Extra keyword arguments forwarded to train_model alongside eval_df.
    metric_kwargs = {"precision": precision, "recall": recall, "f1": f1}
    model.train_model(train_df, eval_df=eval_df, **metric_kwargs)

    # Re-create the model from config["best_model_dir"] before predicting.
    model = ClassificationModel(
        MODEL_TYPE,
        config["best_model_dir"],
        args=config,
        use_cuda=torch.cuda.is_available(),
    )

    predictions, raw_outputs = model.predict(dev_sentences)
    dev_preds[:, i] = predictions

    test_predictions, test_raw_outputs = model.predict(test_sentences)
    test_preds[:, i] = test_predictions

    print("Completed Fold {}".format(i))

# select majority class of each instance (row)
dev_predictions = []