def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv
    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=64,
                                            data_dir=Path("../data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant
        # and evaluates it from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us afterwards to use the
        # n_folds best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,                # where to save the best model
            patience=5                        # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels, all_preds, labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds, labels=label_list, average="macro")
    xval_f1_offense = f1_score(all_labels, all_preds, labels=label_list, pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels, all_preds, labels=label_list, pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
    logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
    logger.info(f"XVAL F1 OFFENSE: {xval_f1_offense}")
    logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
    logger.info(f"XVAL MCC: {xval_mcc}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST F1 MICRO: {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO: {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER: {result[0]['f1_other']}")
    logger.info(f"TEST MCC: {result[0]['mcc']}")
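
# Minimal entry point for running the cross-validation example above as a standalone script,
# mirroring the layout of the FARM example scripts this code is based on. It assumes the usual
# imports (json, logging, pathlib.Path, sklearn.metrics, and the farm.* modules used above)
# are present at the top of the file, outside of this excerpt.
if __name__ == "__main__":
    doc_classification_crossvalidation()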
# Here we load GermEval 2018 Data.
# The processor wants to know the possible labels ...
label_list = ["OTHER", "OFFENSE"]
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/germeval18",
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="coarse_label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
# and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# Load one silo for each fold in our cross-validation
silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

# the following steps should be run for each of the folds of the cross validation, so we put them
# into a function
def train_on_split(silo_to_use, n_fold, save_dir):
    logger.info(f"############ Crossvalidation: Fold {n_fold} ############")
    # Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
def doc_classification_crossvalidation():
    # the code for this function is partially taken from:
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_multilabel.py and
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_crossvalidation.py

    # for local logging:
    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="covid-document-classification",
                              run_name=RUNNAME)

    # model settings
    xval_folds = FOLDS
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    if RUNLOCAL:
        device = "cpu"
    n_epochs = NEPOCHS
    batch_size = BATCHSIZE
    evaluate_every = EVALEVERY
    lang_model = MODELTYPE
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    metric = "f1_macro"

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # The processor wants to know the possible labels ...
    label_list = LABELS
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=MAXLEN,
                                            data_dir=DATADIR,
                                            train_filename=TRAIN,
                                            test_filename=TEST,
                                            dev_split=0.1,
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="Categories",
                                            # confusing parameter name: it should be called multiCLASS,
                                            # not multiLABEL
                                            multilabel=True
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir, dev):
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = MultiLabelTextClassificationHead(
            # there is still an error with class weights ...
            # class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=dev)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=dev,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant
        # and evaluates it from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        # unfortunately, early stopping is still not working
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",
            save_dir=save_dir,  # where to save the best model
            patience=5          # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=dev,
                          evaluator_test=False,
                          # early_stopping=earlystopping,
                          )

        # train it
        trainer.train()
        trainer.model.save(save_dir)
        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_macro = -1
    save_dir = Path("saved_models/covid-classification-v1")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir, device)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device,
        )
        result = evaluator_test.eval(model, return_preds_and_labels=True)

        os.makedirs(os.path.dirname(BESTMODEL + "/classification_report.txt"), exist_ok=True)
        with open(BESTMODEL + "/classification_report.txt", "a+") as file:
            file.write("Evaluation on withheld split for numfold no. {} \n".format(num_fold))
            file.write(result[0]["report"])
            file.write("\n\n")

        evaluator_test.log_results(result, "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_macro = result[0]["f1_macro"]
        if f1_macro > bestf1_macro:
            bestf1_macro = f1_macro
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("../data/predictions/covid-classification-xval.results.json", "wt") as fp:
        json.dump(allresults, fp, cls=NumpyArrayEncoder)

    # calculate overall f1 score across all folds
    xval_f1_macro = f1_score(all_labels, all_preds, average="macro")
    ml_logger.log_metrics({"f1 macro across all folds": xval_f1_macro}, step=None)

    # test performance
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/covid-classification-v1-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    ml_logger.log_metrics({"f1 macro on final test set": result[0]["f1_macro"]}, step=None)

    with open(BESTMODEL + "/classification_report.txt", "a+") as file:
        file.write("Final result of the best model \n")
        file.write(result[0]["report"])
        file.write("\n\n")

    ml_logger.log_artifacts(BESTMODEL + "/")

    # save model for later use
    processor.save(BESTMODEL)
    model.save(BESTMODEL)
    return model
def test_data_silo_for_cross_val_nested():
    lang_model = "bert-base-german-cased"
    n_outer_splits = 3
    n_inner_splits = 3

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=64,
                                            data_dir=Path("data/germeval18"),
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=32)
    silos = DataSiloForCrossVal.make(
        data_silo,
        sets=['test', 'train'],
        n_splits=n_outer_splits,
        n_inner_splits=n_inner_splits,
    )

    # check number of silos
    assert len(silos) == (n_outer_splits * n_inner_splits)

    # because the outer cross validation creates the test set it must be the same
    # in silo 0 and silo 1
    data_loader_test_indices_0 = silos[0].get_data_loader('test').dataset.indices
    data_loader_test_indices_1 = silos[1].get_data_loader('test').dataset.indices
    assert data_loader_test_indices_0.size > 0
    assert data_loader_test_indices_1.size > 0
    assert data_loader_test_indices_0.ndim == 1
    assert data_loader_test_indices_1.ndim == 1
    assert np.array_equal(data_loader_test_indices_0, data_loader_test_indices_1)

    # because the inner cross validation creates the dev set it must be different
    # in silo 0 and silo 1
    data_loader_dev_indices_0 = silos[0].get_data_loader('dev').dataset.indices
    data_loader_dev_indices_1 = silos[1].get_data_loader('dev').dataset.indices
    assert data_loader_dev_indices_0.size > 0
    assert data_loader_dev_indices_1.size > 0
    assert data_loader_dev_indices_0.ndim == 1
    assert data_loader_dev_indices_1.ndim == 1
    assert not np.array_equal(data_loader_dev_indices_0, data_loader_dev_indices_1)

    # extract and test train sets of silo 0 and 1
    data_loader_train_indices_0 = silos[0].get_data_loader('train').dataset.indices
    data_loader_train_indices_1 = silos[1].get_data_loader('train').dataset.indices
    assert data_loader_train_indices_0.size > 0
    assert data_loader_train_indices_1.size > 0
    assert data_loader_train_indices_0.ndim == 1
    assert data_loader_train_indices_1.ndim == 1

    # size of dev + train + test must be the same on all folds
    assert (data_loader_train_indices_0.size +
            data_loader_dev_indices_0.size +
            data_loader_test_indices_0.size) == \
           (data_loader_train_indices_1.size +
            data_loader_dev_indices_1.size +
            data_loader_test_indices_1.size)
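
# A minimal sketch (not part of the test above) of how the nested silos could be consumed:
# the assertions suggest the list is ordered outer-fold-major, i.e. silos 0..n_inner_splits-1
# share one outer test split while their dev splits differ. The grouping below relies on that
# inferred ordering, which is an assumption drawn from the test, not from the FARM documentation.
def iter_nested_folds(silos, n_inner_splits):
    """Yield (outer_fold_index, list_of_inner_silos) pairs for a nested cross-validation loop."""
    for start in range(0, len(silos), n_inner_splits):
        yield start // n_inner_splits, silos[start:start + n_inner_splits]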
def question_answering_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="QA_X-Validation", run_name="Squad_Roberta_Base")

    ##########################
    ########## Settings
    ##########################
    save_per_fold_results = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = True
    n_epochs = 3
    batch_size = 24
    learning_rate = 3e-5
    data_dir = Path("../data/squad20")
    filename = "dev-v2.0.json"
    xval_folds = 4
    dev_split = 0.1
    evaluate_every = 50
    no_ans_boost = 0  # use large negative values to disable giving "no answer" option
    recall_at = 3     # recall at n is only useful for answers inside long documents
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=filename,
        dev_filename=None,
        dev_split=dev_split,
        test_filename=None,
        data_dir=data_dir,
        doc_stride=128,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")

        # fine-tune pre-trained question-answering model
        model = AdaptiveModel.convert_from_transformers(lang_model, device, "question_answering")
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
        # If positive, this will boost "No Answer" as prediction.
        # If negative, this will prevent the model from giving "No Answer" as prediction.
        model.prediction_heads[0].no_ans_boost = no_ans_boost
        # Number of predictions the model will make per Question.
        # The multiple predictions are used for evaluating top n recall.
        model.prediction_heads[0].n_best = recall_at

        # # or train question-answering models from scratch
        # # Create an AdaptiveModel
        # # a) which consists of a pretrained language model as a basis
        # language_model = LanguageModel.load(lang_model)
        # # b) and a prediction head on top that is suited for our task => Question-answering
        # prediction_head = QuestionAnsweringHead(no_ans_boost=no_ans_boost, n_best=recall_at)
        # model = AdaptiveModel(
        #     language_model=language_model,
        #     prediction_heads=[prediction_head],
        #     embeds_dropout_prob=0.1,
        #     lm_output_types=["per_token"],
        #     device=device,)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant
        # and evaluates it from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, then evaluate the model on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    all_results = []
    all_preds = []
    all_labels = []
    all_f1 = []
    all_em = []
    all_topnrecall = []
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test", logging=False,
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        all_results.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))
        all_f1.append(result[0]["f1"])
        all_em.append(result[0]["EM"])
        all_topnrecall.append(result[0]["top_n_recall"])

        # empty cache to avoid memory leak and cuda OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    if save_per_fold_results:
        def convert_numpy_dtype(obj):
            if type(obj).__module__ == "numpy":
                return obj.item()
            raise TypeError("Unknown type:", type(obj))

        with open("qa_xval.results.json", "wt") as fp:
            json.dump(all_results, fp, default=convert_numpy_dtype)

    # calculate overall metrics across all folds
    xval_score = squad(preds=all_preds, labels=all_labels)

    logger.info(f"Single EM-Scores: {all_em}")
    logger.info(f"Single F1-Scores: {all_f1}")
    logger.info(f"Single top_{recall_at}_recall Scores: {all_topnrecall}")
    logger.info(f"XVAL EM: {xval_score['EM']}")
    logger.info(f"XVAL f1: {xval_score['f1']}")
    logger.info(f"XVAL top_{recall_at}_recall: {xval_score['top_n_recall']}")
    ml_logger.log_metrics({"XVAL EM": xval_score["EM"]}, 0)
    ml_logger.log_metrics({"XVAL f1": xval_score["f1"]}, 0)
    ml_logger.log_metrics({f"XVAL top_{recall_at}_recall": xval_score["top_n_recall"]}, 0)
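
# Entry point for the QA cross-validation example, analogous to the document-classification
# scripts above. It assumes the required imports (torch, json, logging, pathlib.Path, the farm.*
# modules used above, and the squad metric function) are present at the top of the file,
# outside of this excerpt.
if __name__ == "__main__":
    question_answering_crossvalidation()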