# Imports assumed by the examples below; the module paths follow the usual FARM layout and may need
# adjusting for the FARM version you have installed. Module-level configuration constants used by
# the covid example (RUNNAME, FOLDS, DATADIR, BESTMODEL, ...) are defined elsewhere in that project
# and are not reproduced here.
import json
import logging
import os
from pathlib import Path

import numpy as np
import torch
from sklearn.metrics import f1_score, matthews_corrcoef

from farm.data_handler.data_silo import DataSilo, DataSiloForCrossVal
from farm.data_handler.processor import SquadProcessor, TextClassificationProcessor
from farm.eval import Evaluator
from farm.evaluation.metrics import register_metrics, simple_accuracy, squad
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.optimization import initialize_optimizer
from farm.modeling.prediction_head import MultiLabelTextClassificationHead, TextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import EarlyStopping, Trainer
from farm.utils import MLFlowLogger, initialize_device_settings, set_all_seeds


def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metric values. The function must be registered under a string name, and that string name
    # is then passed as the metric.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here the GermEval 2018 data is downloaded automatically if it is not yet available.
    # GermEval 2018 only ships a train.tsv and a test.tsv file - there is no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of training our model and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us to use the n best
        # per-fold models in an ensemble afterwards!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense",  # use the metric from our own metrics function instead of loss
            mode="max",
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # For each fold, run the whole training with early stopping to get a model, then evaluate it
    # on the test set of that fold.
    # We remember all results so we can compute overall metrics over the pooled predictions of all
    # folds and average per-fold metrics (see the sketch after this function).
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    # (numpy scalars inside the results are not JSON serializable, so convert them on the fly)
    def convert_numpy_dtype(obj):
        if type(obj).__module__ == "numpy":
            return obj.item()
        raise TypeError("Unknown type:", type(obj))

    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp, default=convert_numpy_dtype)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="micro")
    xval_f1_macro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="macro")
    xval_f1_offense = f1_score(all_labels,
                               all_preds,
                               labels=label_list,
                               pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    logger.info("XVAL F1 MICRO:   ", xval_f1_micro)
    logger.info("XVAL F1 MACRO:   ", xval_f1_macro)
    logger.info("XVAL F1 OFFENSE: ", xval_f1_offense)
    logger.info("XVAL F1 OTHER:   ", xval_f1_other)
    logger.info("XVAL MCC:        ", xval_mcc)

    # -----------------------------------------------------
    # Just for illustration, use the model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info(
        "###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info("TEST F1 MICRO:   ", result[0]["f1_micro"])
    logger.info("TEST F1 MACRO:   ", result[0]["f1_macro"])
    logger.info("TEST F1 OFFENSE: ", result[0]["f1_offense"])
    logger.info("TEST F1 OTHER:   ", result[0]["f1_other"])
    logger.info("TEST MCC:        ", result[0]["mcc"])
def doc_classification_crossvalidation():
    # the code for this function is partially taken from:
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_multilabel.py and
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_crossvalidation.py

    # for local logging:
    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="covid-document-classification",
                              run_name=RUNNAME)

    # model settings
    xval_folds = FOLDS
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    if RUNLOCAL:
        device = "cpu"
    n_epochs = NEPOCHS
    batch_size = BATCHSIZE
    evaluate_every = EVALEVERY
    lang_model = MODELTYPE
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    metric = "f1_macro"

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # The processor wants to know the possible labels ...
    label_list = LABELS
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=MAXLEN,
                                            data_dir=DATADIR,
                                            train_filename=TRAIN,
                                            test_filename=TEST,
                                            dev_split=0.1,
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="Categories",
                                            # confusing parameter name: it should be called multiCLASS
                                            # not multiLABEL
                                            multilabel=True
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir, dev):
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = MultiLabelTextClassificationHead(
            # there is still an error with class weights ...
            # class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=dev)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=dev,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs)

        # Feed everything to the Trainer, which takes care of training our model and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        # unfortunately, early stopping is still not working
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",
            save_dir=save_dir,  # where to save the best model
            patience=5 # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model, optimizer=optimizer,
                          data_silo=silo_to_use, epochs=n_epochs,
                          n_gpu=n_gpu, lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=dev, evaluator_test=False,
                          # early_stopping=earlystopping,  # disabled, see the note above
                          )
        # train it
        trainer.train()
        trainer.model.save(save_dir)
        return trainer.model

    # For each fold, run the whole training, then evaluate the model on the test set of that fold
    # (early stopping is disabled above, so the model from the last epoch is used).
    # Remember all results for overall metrics over all folds' predictions and for averaging.
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_macro = -1
    save_dir = Path("saved_models/covid-classification-v1")

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir, device)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device,
        )
        result = evaluator_test.eval(model, return_preds_and_labels=True)

        os.makedirs(os.path.dirname(BESTMODEL + "/classification_report.txt"), exist_ok=True)
        with open(BESTMODEL + "/classification_report.txt", "a+") as file:
            file.write("Evaluation on withheld split for numfold no. {} \n".format(num_fold))
            file.write(result[0]["report"])
            file.write("\n\n")

        evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_macro = result[0]["f1_macro"]
        if f1_macro > bestf1_macro:
            bestf1_macro = f1_macro
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("../data/predictions/covid-classification-xval.results.json", "wt") as fp:
        json.dump(allresults, fp, cls=NumpyArrayEncoder)

    # calculate overall f1 score across all folds
    xval_f1_macro = f1_score(all_labels, all_preds, average="macro")
    ml_logger.log_metrics({"f1 macro across all folds": xval_f1_macro}, step=None)

    # test performance
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/covid-classification-v1-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    ml_logger.log_metrics({"f1 macro on final test set": result[0]["f1_macro"]}, step=None)

    with open(BESTMODEL + "/classification_report.txt", "a+") as file:
        file.write("Final result of the best model \n")
        file.write(result[0]["report"])
        file.write("\n\n")

    ml_logger.log_artifacts(BESTMODEL + "/")

    # save model for later use
    processor.save(BESTMODEL)
    model.save(BESTMODEL)
    return model
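

# The covid example above relies on module-level configuration constants (RUNNAME, FOLDS, DATADIR,
# BESTMODEL, ...) and a NumpyArrayEncoder that are defined elsewhere in the original project and are
# not reproduced here. Purely as an illustration, such an encoder usually looks like the following
# sketch (an assumption, not the project's actual implementation):
class NumpyArrayEncoderSketch(json.JSONEncoder):
    """JSON encoder that converts numpy arrays and scalars into plain Python types."""

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # arrays become nested lists
        if isinstance(obj, np.generic):
            return obj.item()  # numpy scalars become Python scalars
        return super().default(obj)
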
def test_data_silo_for_cross_val_nested():
    lang_model = "bert-base-german-cased"
    n_outer_splits = 3
    n_inner_splits = 3

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=64,
                                            data_dir=Path("data/germeval18"),
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=32)

    silos = DataSiloForCrossVal.make(
        data_silo,
        sets=['test', 'train'],
        n_splits=n_outer_splits,
        n_inner_splits=n_inner_splits,
    )

    # check number of silos
    assert len(silos) == (n_outer_splits * n_inner_splits)

    # because the outer cross validation creates the test set it must be the same
    # in silo 0 and silo 1
    data_loader_test_indices_0 = silos[0].get_data_loader(
        'test').dataset.indices
    data_loader_test_indices_1 = silos[1].get_data_loader(
        'test').dataset.indices
    assert data_loader_test_indices_0.size > 0
    assert data_loader_test_indices_1.size > 0
    assert data_loader_test_indices_0.ndim == 1
    assert data_loader_test_indices_1.ndim == 1
    assert np.array_equal(data_loader_test_indices_0,
                          data_loader_test_indices_1)

    # because the inner cross validation creates the dev set it must be different
    # in silo 0 and silo 1
    data_loader_dev_indices_0 = silos[0].get_data_loader('dev').dataset.indices
    data_loader_dev_indices_1 = silos[1].get_data_loader('dev').dataset.indices
    assert data_loader_dev_indices_0.size > 0
    assert data_loader_dev_indices_1.size > 0
    assert data_loader_dev_indices_0.ndim == 1
    assert data_loader_dev_indices_1.ndim == 1
    assert not np.array_equal(data_loader_dev_indices_0,
                              data_loader_dev_indices_1)

    # extract and test train sets of silo 0 and 1
    data_loader_train_indices_0 = silos[0].get_data_loader(
        'train').dataset.indices
    data_loader_train_indices_1 = silos[1].get_data_loader(
        'train').dataset.indices
    assert data_loader_train_indices_0.size > 0
    assert data_loader_train_indices_1.size > 0
    assert data_loader_train_indices_0.ndim == 1
    assert data_loader_train_indices_1.ndim == 1

    # size of dev + train + test must be the same on all folds
    assert ((data_loader_train_indices_0.size +
             data_loader_dev_indices_0.size +
             data_loader_test_indices_0.size)
            == (data_loader_train_indices_1.size +
                data_loader_dev_indices_1.size +
                data_loader_test_indices_1.size))
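

# The assertions above imply that the nested silos are returned grouped by outer fold (consecutive
# silos share the same outer test set). A minimal sketch, assuming that ordering, for collecting the
# inner-fold silos of each outer fold, e.g. to tune on the inner folds and evaluate once on the
# shared outer test set:
def group_silos_by_outer_fold(silos, n_inner_splits):
    """Split the flat list of nested-CV silos into one list of inner-fold silos per outer fold."""
    return [silos[i:i + n_inner_splits] for i in range(0, len(silos), n_inner_splits)]
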
def question_answering_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    #ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    #ml_logger.init_experiment(experiment_name="QA_X-Validation", run_name="Squad_Roberta_Base")

    ##########################
    ########## Settings
    ##########################
    save_per_fold_results = True
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = True

    n_epochs = 3
    batch_size = 24
    learning_rate = 3e-5

    data_dir = Path("../data/squad20")
    filename = "dev-v2.0.json"
    xval_folds = 4
    dev_split = 0.1
    evaluate_every = 50
    no_ans_boost = 0  # use large negative values to disable giving "no answer" option
    recall_at = 3  # recall at n is only useful for answers inside long documents
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=filename,
        dev_filename=None,
        dev_split=dev_split,
        test_filename=None,
        data_dir=data_dir,
        doc_stride=128,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")

        # fine-tune pre-trained question-answering model
        model = AdaptiveModel.convert_from_transformers(
            lang_model, device, "question_answering")
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)
        # If positive, this boosts the "No Answer" prediction.
        # If negative, it discourages the model from predicting "No Answer".
        model.prediction_heads[0].no_ans_boost = no_ans_boost
        # Number of predictions the model will make per Question.
        # The multiple predictions are used for evaluating top n recall.
        model.prediction_heads[0].n_best = recall_at

        # # or train question-answering models from scratch
        # # Create an AdaptiveModel
        # # a) which consists of a pretrained language model as a basis
        # language_model = LanguageModel.load(lang_model)
        # # b) and a prediction head on top that is suited for our task => Question-answering
        # prediction_head = QuestionAnsweringHead(no_ans_boost=no_ans_boost, n_best=recall_at)
        # model = AdaptiveModel(
        #    language_model=language_model,
        #    prediction_heads=[prediction_head],
        #    embeds_dropout_prob=0.1,
        #    lm_output_types=["per_token"],
        #    device=device,)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of training our model and evaluates it from time to time
        # (no EarlyStopping is used in this example)

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, then evaluate the model on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    all_results = []
    all_preds = []
    all_labels = []
    all_f1 = []
    all_em = []
    all_topnrecall = []

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   logging=False,
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        all_results.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))
        all_f1.append(result[0]["f1"])
        all_em.append(result[0]["EM"])
        all_topnrecall.append(result[0]["top_n_recall"])

        # empty the CUDA cache to avoid memory leaks and CUDA OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    if save_per_fold_results:

        def convert_numpy_dtype(obj):
            if type(obj).__module__ == "numpy":
                return obj.item()

            raise TypeError("Unknown type:", type(obj))

        with open("qa_xval.results.json", "wt") as fp:
            json.dump(all_results, fp, default=convert_numpy_dtype)

    # calculate overall metrics across all folds
    xval_score = squad(preds=all_preds, labels=all_labels)

    logger.info(f"Single EM-Scores:   {all_em}")
    logger.info(f"Single F1-Scores:   {all_f1}")
    logger.info(f"Single top_{recall_at}_recall Scores:   {all_topnrecall}")
    logger.info(f"XVAL EM:   {xval_score['EM']}")
    logger.info(f"XVAL f1:   {xval_score['f1']}")
    logger.info(f"XVAL top_{recall_at}_recall:   {xval_score['top_n_recall']}")
    ml_logger.log_metrics({"XVAL EM": xval_score["EM"]}, 0)
    ml_logger.log_metrics({"XVAL f1": xval_score["f1"]}, 0)
    ml_logger.log_metrics(
        {f"XVAL top_{recall_at}_recall": xval_score["top_n_recall"]}, 0)