def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }
    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv
    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful
        # plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer
        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us afterwards to use the
        # n folds' best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=silo_to_use,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
            early_stopping=earlystopping,
            evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels, all_preds, labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds, labels=label_list, average="macro")
    xval_f1_offense = f1_score(all_labels, all_preds, labels=label_list, pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels, all_preds, labels=label_list, pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
    logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
    logger.info(f"XVAL F1 OFFENSE: {xval_f1_offense}")
    logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
    logger.info(f"XVAL MCC: {xval_mcc}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST F1 MICRO: {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO: {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER: {result[0]['f1_other']}")
    logger.info(f"TEST MCC: {result[0]['mcc']}")
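
# The cross-validation loop above pools the raw predictions and gold labels from
# every fold and then computes "global" metrics over the pooled lists. A minimal,
# self-contained sketch of that aggregation step (toy labels, plain scikit-learn,
# no FARM objects involved):
from sklearn.metrics import f1_score, matthews_corrcoef

fold_preds = [["OTHER", "OFFENSE", "OTHER"], ["OFFENSE", "OTHER", "OTHER"]]     # toy per-fold predictions
fold_labels = [["OTHER", "OFFENSE", "OFFENSE"], ["OFFENSE", "OTHER", "OTHER"]]  # toy per-fold gold labels

pooled_preds, pooled_labels = [], []
for preds, labels in zip(fold_preds, fold_labels):
    pooled_preds.extend(preds)
    pooled_labels.extend(labels)

print("xval f1_macro:", f1_score(pooled_labels, pooled_preds, average="macro"))
print("xval f1_offense:", f1_score(pooled_labels, pooled_preds, pos_label="OFFENSE"))
print("xval mcc:", matthews_corrcoef(pooled_labels, pooled_preds))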
def convert_to_onnx(self, output_path, opset_version=11, optimize_for=None):
    """
    Convert a PyTorch AdaptiveModel to ONNX.

    The conversion is trace-based by performing a forward pass on the model with an input batch.

    :param output_path: model dir to write the model and config files
    :type output_path: Path
    :param opset_version: ONNX opset version
    :type opset_version: int
    :param optimize_for: optimize the exported model for a target device. Available options
                         are "gpu_tensor_core" (GPUs with tensor core like V100 or T4),
                         "gpu_without_tensor_core" (most other GPUs), and "cpu".
    :type optimize_for: str
    :return:
    """
    if type(self.prediction_heads[0]) is not QuestionAnsweringHead:
        raise NotImplementedError

    tokenizer = Tokenizer.load(pretrained_model_name_or_path="deepset/bert-base-cased-squad2")

    label_list = ["start_token", "end_token"]
    metric = "squad"
    max_seq_len = 384
    batch_size = 1
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        label_list=label_list,
        metric=metric,
        train_filename="stub-file",  # the data is loaded from dicts instead of file.
        dev_filename=None,
        test_filename=None,
        data_dir="stub-dir",
    )

    data_silo = DataSilo(processor=processor, batch_size=1, distributed=False,
                         automatic_loading=False)
    sample_dict = [{
        "context":
            'The Normans were the people who in the 10th and 11th centuries gave their name to Normandy, '
            'a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders '
            'and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear '
            'fealty to King Charles III of West Francia.',
        "qas": [{
            "question": "In what country is Normandy located?",
            "id": "56ddde6b9a695914005b9628",
            "answers": [{"text": "France", "answer_start": 159}],
            "is_impossible": False,
        }],
    }]

    data_silo._load_data(train_dicts=sample_dict)
    data_loader = data_silo.get_data_loader("train")
    data = next(iter(data_loader))
    data = list(data.values())

    inputs = {
        'input_ids': data[0].to(self.device).reshape(batch_size, max_seq_len),
        'padding_mask': data[1].to(self.device).reshape(batch_size, max_seq_len),
        'segment_ids': data[2].to(self.device).reshape(batch_size, max_seq_len)
    }

    # The method argument passing in torch.onnx.export is different to AdaptiveModel's forward().
    # To resolve that, an ONNXWrapper instance is used.
    model = ONNXWrapper.load_from_adaptive_model(self)

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    with torch.no_grad():
        symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
        torch.onnx.export(
            model,
            args=tuple(inputs.values()),
            f=output_path / 'model.onnx',
            opset_version=opset_version,
            do_constant_folding=True,
            input_names=['input_ids', 'padding_mask', 'segment_ids'],
            output_names=['logits'],
            dynamic_axes={
                'input_ids': symbolic_names,
                'padding_mask': symbolic_names,
                'segment_ids': symbolic_names,
                'logits': symbolic_names,
            })

    if optimize_for:
        optimize_args = Namespace(
            disable_attention=False, disable_bias_gelu=False,
            disable_embed_layer_norm=False, opt_level=99,
            disable_skip_layer_norm=False, disable_bias_skip_layer_norm=False,
            hidden_size=768, verbose=False, input='onnx-export/model.onnx',
            model_type='bert', num_heads=12, output='onnx-export/model.onnx')

        if optimize_for == "gpu_tensor_core":
            optimize_args.float16 = True
            optimize_args.input_int32 = True
        elif optimize_for == "gpu_without_tensor_core":
            optimize_args.float16 = False
            optimize_args.input_int32 = True
        elif optimize_for == "cpu":
            optimize_args.float16 = False
            optimize_args.input_int32 = False
        else:
            raise NotImplementedError(
                f"ONNXRuntime model optimization is not available for {optimize_for}. Choose "
                f"one of 'gpu_tensor_core'(V100 or T4), 'gpu_without_tensor_core' or 'cpu'.")

        optimize_onnx_model(optimize_args)
    else:
        logger.info(
            "Exporting unoptimized ONNX model. To enable optimization, supply "
            "'optimize_for' parameter with the target device.")

    # PredictionHead contains functionalities like logits_to_preds() that would still be needed
    # for Inference with ONNX models. Only the config of the PredictionHead is stored.
    for i, ph in enumerate(self.prediction_heads):
        ph.save_config(output_path, i)
    processor.save(output_path)

    onnx_model_config = {
        "onnx_opset_version": opset_version,
        "language": self.get_language(),
    }
    with open(output_path / "model_config.json", "w") as f:
        json.dump(onnx_model_config, f)

    logger.info(f"Model exported at path {output_path}")
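
# A hedged sketch of consuming the exported file with onnxruntime (this is not part
# of convert_to_onnx itself). It assumes the input/output names chosen in the export
# above and a hypothetical "onnx-export" output directory; a real caller would build
# the feed dict from the saved processor instead of zeros.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("onnx-export/model.onnx")
seq_len = 384
feed = {
    "input_ids": np.zeros((1, seq_len), dtype=np.int64),
    "padding_mask": np.ones((1, seq_len), dtype=np.int64),
    "segment_ids": np.zeros((1, seq_len), dtype=np.int64),
}
logits = session.run(["logits"], feed)[0]
print(logits.shape)  # start/end logits; the exact shape depends on the exported head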
def train_evaluation_single(seed=42):
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 32 * 4  # 4x V100
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100
    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 77.714
    gold_tnrecall = 97.3721
    gold_elapsed = 1286.30
    np.testing.assert_allclose(
        f1_score, gold_f1, rtol=0.01,
        err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
    np.testing.assert_allclose(
        em_score, gold_EM, rtol=0.01,
        err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
    np.testing.assert_allclose(
        tnrecall, gold_tnrecall, rtol=0.01,
        err_msg=f"FARM Training changed for top n recall by: {tnrecall - gold_tnrecall}")
    np.testing.assert_allclose(
        elapsed, gold_elapsed, rtol=0.1,
        err_msg=f"FARM Training speed changed significantly: {elapsed - gold_elapsed}")
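
# The regression checks above rely on numpy's relative tolerance:
# assert_allclose(actual, desired, rtol=r) passes when
# |actual - desired| <= r * |desired| (plus a tiny absolute tolerance).
# A toy illustration of what rtol=0.01 tolerates:
import numpy as np

np.testing.assert_allclose(82.9, 82.155, rtol=0.01)      # ~0.9% off -> passes
try:
    np.testing.assert_allclose(84.0, 82.155, rtol=0.01)  # ~2.2% off -> raises
except AssertionError:
    print("outside the 1% tolerance band")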
def test_evaluation(): ########################## ########## Settings ########################## lang_model = "deepset/roberta-base-squad2" do_lower_case = False test_assertions = True data_dir = Path("testsave/data/squad20") evaluation_filename = "dev-v2.0.json" device, n_gpu = initialize_device_settings(use_cuda=True) # loading models and evals model = AdaptiveModel.convert_from_transformers( lang_model, device=device, task_type="question_answering") model.prediction_heads[0].no_ans_boost = 0 model.prediction_heads[0].n_best = 1 tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) processor = SquadProcessor( tokenizer=tokenizer, max_seq_len=256, label_list=["start_token", "end_token"], metric="squad", train_filename=None, dev_filename=None, dev_split=0, test_filename=evaluation_filename, data_dir=data_dir, doc_stride=128, ) starttime = time() data_silo = DataSilo(processor=processor, batch_size=50) model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"), tasks=data_silo.processor.tasks, device=device) # 1. Test FARM internal evaluation results = evaluator.eval(model) f1_score = results[0]["f1"] * 100 em_score = results[0]["EM"] * 100 tnrecall = results[0]["top_n_recall"] * 100 elapsed = time() - starttime print(results) print(elapsed) gold_EM = 77.7478 gold_f1 = 82.1557 gold_tnrecall = 84.0646 # top 1 recall gold_elapsed = 78 # 4x V100 if test_assertions: np.testing.assert_allclose( em_score, gold_EM, rtol=0.001, err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}") np.testing.assert_allclose( f1_score, gold_f1, rtol=0.001, err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}") np.testing.assert_allclose( tnrecall, gold_tnrecall, rtol=0.001, err_msg=f"FARM Eval changed for top 1 recall by: {em_score-gold_EM}" ) np.testing.assert_allclose( elapsed, gold_elapsed, rtol=0.1, err_msg= f"FARM Eval speed changed significantly: {elapsed - gold_elapsed}") # 2. Test FARM predictions with outside eval script starttime = time() model = Inferencer(model=model, processor=processor, task_type="question_answering", batch_size=50, gpu=device.type == "cuda") filename = data_dir / evaluation_filename result = model.inference_from_file(file=filename) elapsed = time() - starttime os.makedirs("../testsave", exist_ok=True) write_squad_predictions(predictions=result, predictions_filename=filename, out_filename="testsave/predictions.json") script_params = { "data_file": filename, "pred_file": "testsave/predictions.json", "na_prob_thresh": 1, "na_prob_file": False, "out_file": False } results_official = squad_evaluation.main(OPTS=DotMap(script_params)) f1_score = results_official["f1"] em_score = results_official["exact"] gold_EM = 78.489 gold_f1 = 81.7104 gold_elapsed = 74 # 4x V100 print(elapsed) if test_assertions: np.testing.assert_allclose( em_score, gold_EM, rtol=0.001, err_msg= f"Eval with official script changed for EM by: {em_score - gold_EM}" ) np.testing.assert_allclose( f1_score, gold_f1, rtol=0.001, err_msg= f"Eval with official script changed for f1 score by: {f1_score - gold_f1}" ) np.testing.assert_allclose( elapsed, gold_elapsed, rtol=0.1, err_msg= f"Inference speed changed significantly: {elapsed - gold_elapsed}")
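
# The "outside eval script" step above writes a predictions file and hands it to the
# official SQuAD v2 evaluation script. That script consumes a flat JSON mapping from
# question id to predicted answer string (an empty string meaning "no answer"). A
# minimal sketch of such a file -- the ids and answers here are made up for
# illustration, not FARM's actual output:
import json

predictions = {
    "56ddde6b9a695914005b9628": "France",
    "5ad39d53604f3c001a3fe8d1": "",  # predicted as unanswerable
}
with open("testsave/predictions.json", "w") as f:
    json.dump(predictions, f)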
def question_answering_confidence(): ########################## ########## Logging ########################## logger = logging.getLogger(__name__) logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO) # reduce verbosity from transformers library logging.getLogger('transformers').setLevel(logging.WARNING) ########################## ########## Settings ########################## set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) lang_model = "deepset/roberta-base-squad2" do_lower_case = False batch_size = 80 data_dir = Path("../data/squad20") # We use the same file for dev and test set only for demo purposes dev_filename = "dev-v2.0.json" test_filename = "dev-v2.0.json" accuracy_at = 3 # accuracy at n is useful for answers inside long documents # 1.Create a tokenizer tokenizer = Tokenizer.load( pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset processor = SquadProcessor( tokenizer=tokenizer, max_seq_len=384, label_list=["start_token", "end_token"], metric="squad", train_filename=None, dev_filename=dev_filename, test_filename=test_filename, data_dir=data_dir, doc_stride=192, ) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo( processor=processor, batch_size=batch_size) # 4. Load pre-trained question-answering model model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering") model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) # Number of predictions the model will make per Question. # The multiple predictions are used for evaluating top n recall. model.prediction_heads[0].n_best = accuracy_at # 5. The calibration of model confidence scores sets one parameter, which is called temperature and can be accessed through the prediction_head. # This temperature is applied to each logit in the forward pass, where each logit is divided by the temperature. # A softmax function is applied to the logits afterward to get confidence scores in the range [0,1]. # A temperature larger than 1 decreases the model’s confidence scores. logger.info(f"Parameter used for temperature scaling of model confidence scores: {model.prediction_heads[0].temperature_for_confidence}") # 6a. We can either manually set the temperature (default value is 1.0)... model.prediction_heads[0].temperature_for_confidence = torch.nn.Parameter((torch.ones(1) * 1.0).to(device=device)) # 6b. ...or we can run the evaluator on the dev set and use it to calibrate confidence scores with a technique called temperature scaling. # It will align the confidence scores with the model's accuracy based on the dev set data by tuning the temperature parameter. # During the calibration, this parameter is automatically set internally as an attribute of the prediction head. evaluator_dev = Evaluator( data_loader=data_silo.get_data_loader("dev"), tasks=data_silo.processor.tasks, device=device ) result_dev = evaluator_dev.eval(model, return_preds_and_labels=True, calibrate_conf_scores=True) # evaluator_dev.log_results(result_dev, "Dev", logging=False, steps=len(data_silo.get_data_loader("dev"))) # 7. 
Optionally, run the evaluator on the test set to see how well the confidence scores are aligned with the model's accuracy evaluator_test = Evaluator( data_loader=data_silo.get_data_loader("test"), tasks=data_silo.processor.tasks, device=device ) result_test = evaluator_test.eval(model, return_preds_and_labels=True)[0] logger.info("Grouping predictions by confidence score and calculating metrics for each bin.") em_per_bin, confidence_per_bin, count_per_bin = metrics_per_bin(result_test["preds"], result_test["labels"], num_bins=10) for bin_number in range(10): logger.info(f"Bin {bin_number} - exact match: {em_per_bin[bin_number]}, average confidence score: {confidence_per_bin[bin_number]}") # 8. Hooray! You have a model with calibrated confidence scores. # Store the model and the temperature parameter will be stored automatically as an attribute of the prediction head. save_dir = Path("../saved_models/qa-confidence-tutorial") model.save(save_dir) processor.save(save_dir) # 9. When making a prediction with the calibrated model, we could filter out predictions where the model is not confident enough # To this end, load the stored model, which will automatically load the stored temperature parameter. # The confidence scores are automatically adjusted based on this temperature parameter. # For each prediction, we can check the model's confidence and decide whether to output the prediction or not. inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True) logger.info(f"Loaded model with stored temperature: {inferencer.model.prediction_heads[0].temperature_for_confidence}") QA_input = [ { "questions": ["Who counted the game among the best ever made?"], "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created." }] result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)[0] if result.prediction[0].confidence > 0.9: print(result.prediction[0].answer) else: print("The confidence is not high enough to give an answer.")
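
# Temperature scaling in isolation: every logit is divided by the temperature before
# the softmax, so a temperature above 1 flattens the distribution and lowers the top
# confidence while leaving the predicted class unchanged. A toy sketch with made-up
# logits (plain PyTorch, independent of the prediction head above):
import torch

logits = torch.tensor([4.0, 1.0, 0.5])
for temperature in [1.0, 2.0, 4.0]:
    confidences = torch.softmax(logits / temperature, dim=0)
    print(f"T={temperature}: max confidence {confidences.max():.2f}")
# roughly 0.93 for T=1.0, 0.72 for T=2.0 and 0.53 for T=4.0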
def evaluate_question_answering():
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = True
    data_dir = Path("../data/squad20")
    evaluation_filename = "dev-v2.0.json"

    batch_size = 50
    no_ans_boost = 0
    recall_at = 3  # recall at n is only useful for answers inside long documents

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    # 3. Create a DataSilo that loads the dataset, provides a DataLoader for it and calculates a few descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an Evaluator
    evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )

    # 5. Load model
    model = AdaptiveModel.convert_from_transformers(lang_model, device=device,
                                                    task_type="question_answering")
    # use "load" if you want to use a local model that was trained with FARM
    # model = AdaptiveModel.load(lang_model, device=device)
    model.prediction_heads[0].no_ans_boost = no_ans_boost
    model.prediction_heads[0].n_best = recall_at
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    # 6. Run the Evaluator
    results = evaluator.eval(model)
    f1_score = results[0]["f1"]
    em_score = results[0]["EM"]
    tnrecall = results[0]["top_n_recall"]
    print("F1-Score:", f1_score)
    print("Exact Match Score:", em_score)
    print(f"top_{recall_at}_recall:", tnrecall)
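
# "Recall at n" as printed above asks whether a correct answer shows up among the
# model's n best predictions per question. A simplified string-match illustration of
# the idea (toy data; the metric above is computed internally by FARM's Evaluator on
# predicted spans):
def recall_at_n(top_n_preds, gold_answers, n=3):
    hits = sum(
        any(pred in gold for pred in preds[:n])
        for preds, gold in zip(top_n_preds, gold_answers)
    )
    return hits / len(gold_answers)

toy_preds = [["France", "Normandy", "Denmark"], ["Rollo", "Charles III", "Norway"]]
toy_gold = [{"France"}, {"King Charles III", "Charles III of West Francia"}]
print(recall_at_n(toy_preds, toy_gold, n=3))  # 0.5 -- only the first question is hit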
def doc_classification_holdout(): ########################## ########## Logging ########################## logger = logging.getLogger(__name__) logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO) # reduce verbosity from transformers library logging.getLogger('transformers').setLevel(logging.WARNING) # local logging into directory "logs" mlflogger = MLFlowLogger(tracking_uri="logs") mlflogger.init_experiment(experiment_name="Example-docclass-xval", run_name="testrun1") ########################## ########## Settings ########################## holdout_splits = 5 holdout_train_split = 0.8 holdout_stratification = True set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) n_epochs = 20 batch_size = 32 evaluate_every = 100 lang_model = "bert-base-german-cased" dev_split = 0.1 # For holdout the dev_stratification parameter must not be None: with None, the devset cannot be created # using the default method of only splitting by the available chunks as initial train set for each fold # is just a single chunk! dev_stratification = True do_lower_case = False use_amp = None # 1.Create a tokenizer tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) # The evaluation on the dev-set can be done with one of the predefined metrics or with a # metric defined as a function from (preds, labels) to a dict that contains all the actual # metrics values. The function must get registered under a string name and the string name must # be used. def mymetrics(preds, labels): acc = simple_accuracy(preds, labels).get("acc") f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER") f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE") f1macro = f1_score(y_true=labels, y_pred=preds, average="macro") f1micro = f1_score(y_true=labels, y_pred=preds, average="micro") mcc = matthews_corrcoef(labels, preds) return { "acc": acc, "f1_other": f1other, "f1_offense": f1offense, "f1_macro": f1macro, "f1_micro": f1micro, "mcc": mcc } register_metrics('mymetrics', mymetrics) metric = 'mymetrics' # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset # Here we load GermEval 2018 Data automaticaly if it is not available. # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv # The processor wants to know the possible labels ... label_list = ["OTHER", "OFFENSE"] processor = TextClassificationProcessor( tokenizer=tokenizer, max_seq_len=64, data_dir=Path("../data/germeval18"), label_list=label_list, metric=metric, dev_split=dev_split, dev_stratification=dev_stratification, label_column_name="coarse_label") # 3. 
Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size) # Load one silo for each fold in our cross-validation silos = DataSiloForHoldout.make(data_silo, sets=["train", "dev"], n_splits=holdout_splits, train_split=holdout_train_split, stratification=holdout_stratification) # the following steps should be run for each of the folds of the holdout evaluation, so we put them # into a function def train_on_split(silo_to_use, n_eval, save_dir): logger.info( f"############ Holdout: Evaluation {n_eval} of {holdout_splits} ############" ) logger.info( f"Fold training samples: {len(silo_to_use.data['train'])}") logger.info(f"Fold dev samples: {len(silo_to_use.data['dev'])}") logger.info( f"Fold testing samples: {len(silo_to_use.data['test'])}") logger.info( "Total number of samples: " f"{len(silo_to_use.data['train'])+len(silo_to_use.data['dev'])+len(silo_to_use.data['test'])}" ) # Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(lang_model) # b) and a prediction head on top that is suited for our task => Text classification prediction_head = TextClassificationHead( class_weights=data_silo.calculate_class_weights( task_name="text_classification"), num_labels=len(label_list)) model = AdaptiveModel(language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.2, lm_output_types=["per_sequence"], device=device) # Create an optimizer model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=0.5e-5, device=device, n_batches=len(silo_to_use.loaders["train"]), n_epochs=n_epochs, use_amp=use_amp) # Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time # Also create an EarlyStopping instance and pass it on to the trainer # An early stopping instance can be used to save the model that performs best on the dev set # according to some metric and stop training when no improvement is happening for some iterations. # NOTE: Using a different save directory for each fold, allows us afterwards to use the # nfolds best models in an ensemble! save_dir = Path(str(save_dir) + f"-{n_eval}") earlystopping = EarlyStopping( metric="f1_offense", mode= "max", # use the metric from our own metrics function instead of loss save_dir=save_dir, # where to save the best model patience= 5 # number of evaluations to wait for improvement before terminating the training ) trainer = Trainer(model=model, optimizer=optimizer, data_silo=silo_to_use, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device, early_stopping=earlystopping, evaluator_test=False) # train it trainer.train() return trainer.model # for each fold, run the whole training, earlystopping to get a model, then evaluate the model # on the test set of each fold # remember all individual evaluation results allresults = [] bestfold = None bestf1_offense = -1 save_dir = Path("saved_models/bert-german-doc-tutorial-es") for num_fold, silo in enumerate(silos): mlflow.start_run(run_name=f"split-{num_fold + 1}-of-{len(silos)}", nested=True) model = train_on_split(silo, num_fold, save_dir) # do eval on test set here (and not in Trainer), # so that we can easily store the actual preds and labels for a "global" eval across all folds. 
evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"), tasks=silo.processor.tasks, device=device) result = evaluator_test.eval(model, return_preds_and_labels=True) evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold) allresults.append(result) # keep track of best fold f1_offense = result[0]["f1_offense"] if f1_offense > bestf1_offense: bestf1_offense = f1_offense bestfold = num_fold mlflow.end_run() # emtpy cache to avoid memory leak and cuda OOM across multiple folds model.cpu() torch.cuda.empty_cache() # Save the per-fold results to json for a separate, more detailed analysis with open("doc_classification_holdout.results.json", "wt") as fp: json.dump(allresults, fp) # log the best fold metric and fold logger.info(f"Best fold f1_offense: {bestf1_offense} in fold {bestfold}") # calculate overall metrics across all folds: we only have one head so we do this only for the first head # information in each of the per-fold results # First create a dict where for each metric, we have a list of values from each fold eval_metric_lists_head0 = defaultdict(list) for results in allresults: head0results = results[0] for name in head0results.keys(): if name not in ["preds", "labels"] and not name.startswith("_") and \ isinstance(head0results[name], numbers.Number): eval_metric_lists_head0[name].append(head0results[name]) # Now calculate the mean and stdev for each metric, also copy over the task name eval_metric = {} eval_metric["task_name"] = allresults[0][0].get("task_name", "UNKNOWN TASKNAME") for name in eval_metric_lists_head0.keys(): values = eval_metric_lists_head0[name] vmean = statistics.mean(values) vstdev = statistics.stdev(values) eval_metric[name + "_mean"] = vmean eval_metric[name + "_stdev"] = vstdev logger.info( f"HOLDOUT Accuracy: mean {eval_metric['acc_mean']} stdev {eval_metric['acc_stdev']}" ) logger.info( f"HOLDOUT F1 MICRO: mean {eval_metric['f1_micro_mean']} stdev {eval_metric['f1_micro_stdev']}" ) logger.info( f"HOLDOUT F1 MACRO: mean {eval_metric['f1_macro_mean']} stdev {eval_metric['f1_macro_stdev']}" ) logger.info( f"HOLDOUT F1 OFFENSE: mean {eval_metric['f1_offense_mean']} stdev {eval_metric['f1_offense_stdev']}" ) logger.info( f"HOLDOUT F1 OTHER: mean {eval_metric['f1_other_mean']} stdev {eval_metric['f1_other_stdev']}" ) logger.info( f"HOLDOUT MCC: mean {eval_metric['mcc_mean']} stdev {eval_metric['mcc_stdev']}" ) # ----------------------------------------------------- # Just for illustration, use the best model from the best xval val for evaluation on # the original (still unseen) test set. logger.info( "###### Final Eval on hold out test set using best model #####") evaluator_origtest = Evaluator( data_loader=data_silo.get_data_loader("test"), tasks=data_silo.processor.tasks, device=device) # restore model from the best fold lm_name = model.language_model.name save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}") model = AdaptiveModel.load(save_dir, device, lm_name=lm_name) model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) result = evaluator_origtest.eval(model) logger.info(f"TEST Accuracy: {result[0]['acc']}") logger.info(f"TEST F1 MICRO: {result[0]['f1_micro']}") logger.info(f"TEST F1 MACRO: {result[0]['f1_macro']}") logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}") logger.info(f"TEST F1 OTHER: {result[0]['f1_other']}") logger.info(f"TEST MCC: {result[0]['mcc']}")
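
# Each holdout silo above corresponds to one repeated, stratified train/dev split of
# the same data. A minimal sketch of a single stratified split with scikit-learn (toy
# indices and labels; an illustration of the idea, not DataSiloForHoldout's internal
# implementation):
from sklearn.model_selection import train_test_split

examples = list(range(10))
labels = ["OTHER"] * 7 + ["OFFENSE"] * 3
train_idx, dev_idx = train_test_split(examples, train_size=0.8,
                                      stratify=labels, random_state=42)
print(sorted(train_idx), sorted(dev_idx))  # the class ratio is roughly preserved in both parts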
def dense_passage_retrieval(): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="FARM-dense_passage_retrieval", run_name="Run_dpr_enocder") ########################## ########## Settings ########################## set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) batch_size = 2 n_epochs = 3 evaluate_every = 1000 question_lang_model = "facebook/dpr-question_encoder-single-nq-base" passage_lang_model = "facebook/dpr-ctx_encoder-single-nq-base" do_lower_case = True use_fast = True embed_title = True num_hard_negatives = 1 similarity_function = "dot_product" train_filename = "nq-train.json" dev_filename = "nq-dev.json" max_samples = None #load a smaller dataset (e.g. for debugging) # 1.Create question and passage tokenizers query_tokenizer = Tokenizer.load( pretrained_model_name_or_path=question_lang_model, do_lower_case=do_lower_case, use_fast=use_fast) context_tokenizer = Tokenizer.load( pretrained_model_name_or_path=passage_lang_model, do_lower_case=do_lower_case, use_fast=use_fast) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset # data_dir "data/retriever" should contain DPR training and dev files downloaded from https://github.com/facebookresearch/DPR # i.e., nq-train.json, nq-dev.json or trivia-train.json, trivia-dev.json label_list = ["hard_negative", "positive"] metric = "text_similarity_metric" processor = TextSimilarityProcessor(tokenizer=query_tokenizer, passage_tokenizer=context_tokenizer, max_seq_len=256, label_list=label_list, metric=metric, data_dir="data/retriever", train_filename=train_filename, dev_filename=dev_filename, test_filename=dev_filename, embed_title=embed_title, num_hard_negatives=num_hard_negatives, max_samples=max_samples) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False) # 4. Create an AdaptiveModel+ # a) which consists of a pretrained language model as a basis question_language_model = LanguageModel.load( pretrained_model_name_or_path="bert-base-uncased", language_model_class="DPRQuestionEncoder") passage_language_model = LanguageModel.load( pretrained_model_name_or_path="bert-base-uncased", language_model_class="DPRContextEncoder") # b) and a prediction head on top that is suited for our task => Question Answering prediction_head = TextSimilarityHead( similarity_function=similarity_function) model = BiAdaptiveModel( language_model1=question_language_model, language_model2=passage_language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm1_output_types=["per_sequence"], lm2_output_types=["per_sequence"], device=device, ) # 5. Create an optimizer model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=1e-5, optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \ "eps": 1e-08}, schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100}, n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs, grad_acc_steps=1, device=device ) # 6. 
# Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/dpr-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Evaluate
    test_data_loader = data_silo.get_data_loader("test")
    if test_data_loader is not None:
        evaluator_test = Evaluator(data_loader=test_data_loader,
                                   tasks=data_silo.processor.tasks,
                                   device=device)
        model.connect_heads_with_processor(processor.tasks)
        test_result = evaluator_test.eval(model)
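
# The TextSimilarityHead trained above scores questions against passages; with
# similarity_function="dot_product" this boils down to a question-passage score matrix
# in which each question's matching passage acts as the positive and the other
# in-batch passages act as negatives. A toy sketch of that idea with random embeddings
# (plain PyTorch, not the head's actual code; hard negatives would simply add extra
# rows on the passage side):
import torch
import torch.nn.functional as F

batch_size, emb_dim = 4, 768
query_vectors = torch.randn(batch_size, emb_dim)
passage_vectors = torch.randn(batch_size, emb_dim)

scores = query_vectors @ passage_vectors.T  # [batch, batch] similarity matrix
targets = torch.arange(batch_size)          # positives sit on the diagonal
loss = F.cross_entropy(scores, targets)
print(scores.shape, loss.item())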
def doc_classification(task_config, model_name_or_path, cache_dir, run_name="0", lr=1e-05, warmup_steps=5000, balance_classes=True, embeds_dropout=0.1, epochs=200, # large because we use early stopping by default batch_size=20, grad_acc_steps=1, early_stopping_metric="roc_auc", early_stopping_mode="max", early_stopping_patience=10, model_class="Bert", tokenizer_class="BertTokenizer", do_lower_case=False, do_train=True, do_eval=True, do_hpo=False, print_preds=False, print_dev_preds=False, max_seq_len=512, seed=11, eval_every=500, use_amp=False, use_cuda=True, ): # Load task config task_config = yaml.safe_load(open(task_config)) data_dir = Path(task_config["data"]["data_dir"]) save_dir = utils.init_save_dir(task_config["output_dir"], task_config["experiment_name"], run_name, tune.session.get_trial_name() if do_hpo else None) # Create label list from args list or (for large label lists) create from file by splitting by space if isinstance(task_config["data"]["label_list"], list): label_list = task_config["data"]["label_list"] else: with open(data_dir / task_config["data"]["label_list"]) as code_file: label_list = code_file.read().split(" ") # Register Outcome Metrics register_task_metrics(label_list) # General Settings set_all_seeds(seed=seed) device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp) # 1.Create a tokenizer tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name_or_path, tokenizer_class=tokenizer_class, do_lower_case=do_lower_case) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset processor = TextClassificationProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len, data_dir=data_dir, label_list=label_list, metric=task_config["metric"], multilabel=task_config["multilabel"], train_filename=task_config["data"]["train_filename"], dev_filename=task_config["data"]["dev_filename"], dev_split=task_config["data"]["dev_split"] if "dev_split" in task_config[ "data"] else None, test_filename=task_config["data"]["test_filename"], delimiter=task_config["data"]["parsing"]["delimiter"], quote_char=task_config["data"]["parsing"]["quote_char"], label_column_name=task_config["data"]["parsing"]["label_column"] ) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a # few descriptive statistics of our datasets data_silo = DataSilo( processor=processor, caching=True, cache_path=Path(cache_dir), batch_size=batch_size) if do_train: # Setup MLFlow logger ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"]) ml_logger.init_experiment(experiment_name=task_config["experiment_name"], run_name=f'{task_config["experiment_name"]}_{run_name}') # 4. 
Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(model_name_or_path, language_model_class=model_class) # b) and a prediction head on top that is suited for our task # Define class weights if balance_classes: class_weights = data_silo.calculate_class_weights(task_name=task_config["task_type"]) else: class_weights = None # Create Multi- or Single-Label Classification Heads if task_config["multilabel"]: prediction_head = MultiLabelTextClassificationHead( class_weights=class_weights, num_labels=len(label_list)) else: prediction_head = ExtendedTextClassificationHead( class_weights=class_weights, num_labels=len(label_list)) model = ExtendedAdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=embeds_dropout, lm_output_types=[task_config["output_type"]], device=device) # 5. Create an optimizer schedule_opts = {"name": "LinearWarmup", "num_warmup_steps": warmup_steps} model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=lr, device=device, n_batches=len(data_silo.loaders["train"]), n_epochs=epochs, use_amp=use_amp, grad_acc_steps=grad_acc_steps, schedule_opts=schedule_opts) # 6. Create an early stopping instance early_stopping = None if early_stopping_mode != "none": early_stopping = EarlyStopping( mode=early_stopping_mode, min_delta=0.0001, save_dir=save_dir, metric=early_stopping_metric, patience=early_stopping_patience ) # 7. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it # from time to time trainer = ExtendedTrainer( model=model, optimizer=optimizer, data_silo=data_silo, epochs=epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=eval_every, early_stopping=early_stopping, device=device, grad_acc_steps=grad_acc_steps, evaluator_test=do_eval ) def score_callback(eval_score, train_loss): tune.report(roc_auc_dev=eval_score, train_loss=train_loss) # 8. Train the model trainer.train(score_callback=score_callback if do_hpo else None) # 9. Save model if not saved in early stopping model.save(save_dir / "final_model") processor.save(save_dir / "final_model") if do_eval: # Load newly trained model or existing model if do_train: model_dir = save_dir else: model_dir = Path(model_name_or_path) logger.info("###### Eval on TEST SET #####") evaluator_test = ExtendedEvaluator( data_loader=data_silo.get_data_loader("test"), tasks=data_silo.processor.tasks, device=device ) # Load trained model for evaluation model = ExtendedAdaptiveModel.load(model_dir, device) model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) # Evaluate results = evaluator_test.eval(model, return_preds_and_labels=True) # Log results utils.log_results(results, dataset_name="test", steps=len(evaluator_test.data_loader), save_path=model_dir / "eval_results.txt") if print_preds: # Print model test predictions utils.save_predictions(results, save_dir=model_dir, multilabel=task_config["multilabel"]) if print_dev_preds: # Evaluate on dev set, e.g. 
for threshold tuning evaluator_dev = Evaluator( data_loader=data_silo.get_data_loader("dev"), tasks=data_silo.processor.tasks, device=device ) dev_results = evaluator_dev.eval(model, return_preds_and_labels=True) utils.log_results(dev_results, dataset_name="dev", steps=len(evaluator_dev.data_loader), save_path=model_dir / "eval_dev_results.txt") # Print model dev predictions utils.save_predictions(dev_results, save_dir=model_dir, multilabel=task_config["multilabel"], dataset_name="dev")
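
# doc_classification() reads every data and experiment setting from a YAML task
# config. A hedged sketch of a minimal config covering the keys the function
# accesses -- the file names, labels, metric and column names below are placeholders,
# not a real dataset:
import yaml

example_task_config = {
    "experiment_name": "example-docclass",
    "output_dir": "saved_models",
    "log_dir": "logs",
    "metric": "roc_auc",
    "multilabel": False,
    "task_type": "text_classification",
    "output_type": "per_sequence",
    "data": {
        "data_dir": "data/example",
        "label_list": ["OTHER", "OFFENSE"],
        "train_filename": "train.tsv",
        "dev_filename": None,
        "dev_split": 0.1,
        "test_filename": "test.tsv",
        "parsing": {"delimiter": "\t", "quote_char": '"', "label_column": "label"},
    },
}
with open("example_task.yml", "w") as f:
    yaml.safe_dump(example_task_config, f)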
def doc_classification_crossvalidation(): # the code for this function is partially taken from: # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_multilabel.py and # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_crossvalidation.py # for local logging: ml_logger = MLFlowLogger(tracking_uri="") ml_logger.init_experiment(experiment_name="covid-document-classification", run_name=RUNNAME) # model settings xval_folds = FOLDS set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) if RUNLOCAL: device = "cpu" n_epochs = NEPOCHS batch_size = BATCHSIZE evaluate_every = EVALEVERY lang_model = MODELTYPE do_lower_case = False # 1.Create a tokenizer tokenizer = Tokenizer.load( pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) metric = "f1_macro" # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset # The processor wants to know the possible labels ... label_list = LABELS processor = TextClassificationProcessor(tokenizer=tokenizer, max_seq_len=MAXLEN, data_dir=DATADIR, train_filename=TRAIN, test_filename=TEST, dev_split=0.1, label_list=label_list, metric=metric, label_column_name="Categories", # confusing parameter name: it should be called multiCLASS # not multiLABEL multilabel=True ) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo( processor=processor, batch_size=batch_size) # Load one silo for each fold in our cross-validation silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds) # the following steps should be run for each of the folds of the cross validation, so we put them # into a function def train_on_split(silo_to_use, n_fold, save_dir, dev): # Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(lang_model) # b) and a prediction head on top that is suited for our task => Text classification prediction_head = MultiLabelTextClassificationHead( # there is still an error with class weights ... 
# class_weights=data_silo.calculate_class_weights(task_name="text_classification"), num_labels=len(label_list)) model = AdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.2, lm_output_types=["per_sequence"], device=dev) # Create an optimizer model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=0.5e-5, device=dev, n_batches=len(silo_to_use.loaders["train"]), n_epochs=n_epochs) # Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time # Also create an EarlyStopping instance and pass it on to the trainer save_dir = Path(str(save_dir) + f"-{n_fold}") # unfortunately, early stopping is still not working earlystopping = EarlyStopping( metric="f1_macro", mode="max", save_dir=save_dir, # where to save the best model patience=5 # number of evaluations to wait for improvement before terminating the training ) trainer = Trainer(model=model, optimizer=optimizer, data_silo=silo_to_use, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=dev, evaluator_test=False, #early_stopping=earlystopping) ) # train it trainer.train() trainer.model.save(save_dir) return trainer.model # for each fold, run the whole training, earlystopping to get a model, then evaluate the model # on the test set of each fold # Remember all the results for overall metrics over all predictions of all folds and for averaging allresults = [] all_preds = [] all_labels = [] bestfold = None bestf1_macro = -1 save_dir = Path("saved_models/covid-classification-v1") for num_fold, silo in enumerate(silos): model = train_on_split(silo, num_fold, save_dir, device) # do eval on test set here (and not in Trainer), # so that we can easily store the actual preds and labels for a "global" eval across all folds. evaluator_test = Evaluator( data_loader=silo.get_data_loader("test"), tasks=silo.processor.tasks, device=device, ) result = evaluator_test.eval(model, return_preds_and_labels=True) os.makedirs(os.path.dirname(BESTMODEL + "/classification_report.txt"), exist_ok=True) with open(BESTMODEL + "/classification_report.txt", "a+") as file: file.write("Evaluation on withheld split for numfold no. 
{} \n".format(num_fold)) file.write(result[0]["report"]) file.write("\n\n") file.close() evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold) allresults.append(result) all_preds.extend(result[0].get("preds")) all_labels.extend(result[0].get("labels")) # keep track of best fold f1_macro = result[0]["f1_macro"] if f1_macro > bestf1_macro: bestf1_macro = f1_macro bestfold = num_fold # Save the per-fold results to json for a separate, more detailed analysis with open("../data/predictions/covid-classification-xval.results.json", "wt") as fp: json.dump(allresults, fp, cls=NumpyArrayEncoder) # calculate overall f1 score across all folds xval_f1_macro = f1_score(all_labels, all_preds, average="macro") ml_logger.log_metrics({"f1 macro across all folds": xval_f1_macro}, step=None) # test performance evaluator_origtest = Evaluator( data_loader=data_silo.get_data_loader("test"), tasks=data_silo.processor.tasks, device=device ) # restore model from the best fold lm_name = model.language_model.name save_dir = Path(f"saved_models/covid-classification-v1-{bestfold}") model = AdaptiveModel.load(save_dir, device, lm_name=lm_name) model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) result = evaluator_origtest.eval(model) ml_logger.log_metrics({"f1 macro on final test set": result[0]["f1_macro"]}, step=None) with open(BESTMODEL + "/classification_report.txt", "a+") as file: file.write("Final result of the best model \n") file.write(result[0]["report"]) file.write("\n\n") file.close() ml_logger.log_artifacts(BESTMODEL + "/") # save model for later use processor.save(BESTMODEL) model.save(BESTMODEL) return model
def test_evaluation():
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False
    test_assertions = False
    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # load model, tokenizer and processor for evaluation
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1
    model.prediction_heads[0].n_best_per_sample = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=40 * n_gpu_factor)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
    model, _ = optimize_model(model=model,
                              device=device,
                              local_rank=-1,
                              optimizer=None,
                              distributed=False,
                              use_amp=None)
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 78.4721
    gold_f1 = 82.6671
    gold_tnacc = 84.3594  # top 1 recall
    gold_elapsed = 40  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score - gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score - gold_f1}")
        np.testing.assert_allclose(
            tnacc,
            gold_tnacc,
            rtol=0.001,
            err_msg=f"FARM Eval changed for top 1 accuracy by: {tnacc - gold_tnacc}")
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds")
    if not np.allclose(f1_score, gold_f1, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for f1 score by: {round(f1_score - gold_f1, 4)}")
    if not np.allclose(em_score, gold_EM, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(tnacc, gold_tnacc, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for top 1 accuracy by: {round(tnacc - gold_tnacc, 4)}")
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"FARM Eval speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds")

    benchmark_result = [{
        "run": "FARM internal evaluation",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": round(tnacc - gold_tnacc, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnacc,
        "elapsed_gold": gold_elapsed
    }]
    logger.info("\n\n" + pformat(benchmark_result[0]) + "\n")

    # 2. Test FARM predictions with the official (outside) eval script
    starttime = time()
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="question_answering",
                            batch_size=40 * n_gpu_factor,
                            gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = inferencer.inference_from_file(file=filename,
                                            return_json=False,
                                            multiprocessing_chunksize=80)
    results_squad = [x.to_squad_eval() for x in result]
    inferencer.close_multiprocessing_pool()

    elapsed = time() - starttime

    os.makedirs("testsave", exist_ok=True)
    write_squad_predictions(predictions=results_squad,
                            predictions_filename=filename,
                            out_filename="testsave/predictions.json")
    script_params = {
        "data_file": filename,
        "pred_file": "testsave/predictions.json",
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 79.878
    gold_f1 = 82.917
    gold_elapsed = 27  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"Eval with official script changed for EM by: {em_score - gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"Eval with official script changed for f1 score by: {f1_score - gold_f1}")
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds")
    if not np.allclose(f1_score, gold_f1, rtol=0.001):
        error_messages.append(
            f"Eval with official script changed for f1 score by: {round(f1_score - gold_f1, 4)}")
    if not np.allclose(em_score, gold_EM, rtol=0.001):
        error_messages.append(
            f"Eval with official script changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"Inference speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds")

    benchmark_result.append({
        "run": "outside eval script",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": "-",
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": "-",
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": "-",
        "elapsed_gold": gold_elapsed
    })
    logger.info("\n\n" + pformat(benchmark_result[1]) + "\n")

    return benchmark_result
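

# Hedged sketch (illustrative helper, not defined by the benchmark above): persisting the
# rows returned by test_evaluation() / train_evaluation_single() to CSV so score and speed
# drift can be tracked across runs. The output path is an assumption.
def save_benchmark_csv(rows, out_path="testsave/benchmark_results.csv"):
    import csv
    import os

    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
    with open(out_path, "w", newline="") as fp:
        writer = csv.DictWriter(fp, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)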
def train_evaluation_single(seed=42):
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    # GPU utilization on 4x V100
    # 40*4, 14.3/16GB on master, 12.6/16 on others
    batch_size = 40 * n_gpu_factor
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    test_assertions = False
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead(n_best=5, n_best_per_sample=1)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100
    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 78.6575  # 77.714
    gold_tnrecall = 97.3721
    gold_elapsed = 1135
    if test_assertions:
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.01,
            err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.01,
            err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
        np.testing.assert_allclose(
            tnacc,
            gold_tnrecall,
            rtol=0.01,
            err_msg=f"FARM Training changed for top 5 accuracy by: {tnacc - gold_tnrecall}")
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds")
    if not np.allclose(f1_score, gold_f1, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for f1 score by: {round(f1_score - gold_f1, 4)}")
    if not np.allclose(em_score, gold_EM, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(tnacc, gold_tnrecall, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for top 5 accuracy by: {round(tnacc - gold_tnrecall, 4)}")
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"FARM Training speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds")

    benchmark_result = [{
        "run": "train evaluation",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": round(tnacc - gold_tnrecall, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnrecall,
        "elapsed_gold": gold_elapsed
    }]
    logger.info("\n\n" + pformat(benchmark_result) + "\n")

    return benchmark_result
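

# Hedged sketch of a driver for the two benchmark functions above. It assumes
# `error_messages` is the module-level list those functions append to, and it simply
# fails at the end if any check drifted beyond its tolerance.
def run_all_benchmarks():
    rows = []
    rows.extend(test_evaluation())
    rows.extend(train_evaluation_single(seed=42))
    if error_messages:
        for msg in error_messages:
            logger.error(msg)
        raise AssertionError(f"{len(error_messages)} benchmark check(s) failed")
    return rows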
def finetune_token_regression(args):
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        datefmt="%d-%m-%y %H:%M:%S",
        level=logging.INFO
    )
    args.logger = logging.getLogger(__name__)
    if args.do_logfile:
        filehandler = logging.FileHandler(os.path.join(args.log_dir, f"{args.run_name}.log"))
        args.logger.addHandler(filehandler)
    args.logger.info(vars(args))

    # Setup MLFlow
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name=args.experiment_name, run_name=args.run_name)

    set_all_seeds(seed=args.seed)
    args.device, args.n_gpu = initialize_device_settings(use_cuda=True)

    # Create a tokenizer
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    tokenizer = CustomTokenizer.load(
        pretrained_model_name_or_path=args.model_name,
        do_lower_case=args.do_lower_case,
        tokenizer_class=tok_class
    )

    # Create a processor for the dataset
    # Only token-level regression is supported for now
    processor = TokenRegressionProcessor(
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_length,
        data_dir=args.data_dir,
        label_column_names=args.label_columns,
        label_names=args.label_columns,
        dev_split=args.dev_split,
    )

    # Create a DataSilo that loads several datasets (train/dev/test),
    # provides DataLoaders and calculates descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=args.batch_size)

    if args.folds > 1:
        evaluate_kfold(args, data_silo, processor)
    else:
        adapt_model = train_on_split(args, data_silo, processor)

        evaluator_test = MultitaskEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=args.device,
            report=False,
        )
        result = evaluator_test.eval(adapt_model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test", steps=len(data_silo.get_data_loader("test")))

        pred_tsv = pd.DataFrame()
        args.logger.info("Test results:")
        for res in result[1:]:
            args.logger.info(f"__{res['task_name']}__")
            metrics = token_level_regression_metrics(res.get("preds"), res.get("labels"))
            for metric in metrics.keys():
                args.logger.info(f"{metric}: {metrics[metric]}")
            if args.save_predictions:
                pred_tsv[f"{res['task_name']}_preds"] = res.get("preds")
                pred_tsv[f"{res['task_name']}_labels"] = res.get("labels")
        if args.save_predictions:
            save_tsv(pred_tsv, os.path.join(args.out_dir, f"{args.run_name}.tsv"))

        # Load the trained model and perform inference
        dicts = [
            {"text": "The intense interest aroused in the public has now somewhat subsided."},
            {"text": "The quick brown fox jumped over the lazy dog."},
        ]
        model = MultitaskInferencer.load(args.save_dir, gpu=True, level="token")
        result = model.inference_from_dicts(dicts=dicts)
        args.logger.info("Inference example:")
        args.logger.info(result)
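

# Hedged sketch of the argparse wiring that could supply the `args` namespace consumed by
# finetune_token_regression(). The attribute names mirror what the function accesses; every
# default value below is illustrative, not something the original script prescribes.
def parse_args_token_regression():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="bert-base-uncased")
    parser.add_argument("--model_class_name", default=None)
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--data_dir", default="data/token_regression")
    parser.add_argument("--label_columns", nargs="+", default=["score"])
    parser.add_argument("--max_seq_length", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--dev_split", type=float, default=0.1)
    parser.add_argument("--folds", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--experiment_name", default="token_regression")
    parser.add_argument("--run_name", default="run_1")
    parser.add_argument("--log_dir", default="logs")
    parser.add_argument("--out_dir", default="output")
    parser.add_argument("--save_dir", default="saved_models/token_regression")
    parser.add_argument("--do_logfile", action="store_true")
    parser.add_argument("--save_predictions", action="store_true")
    return parser.parse_args()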
def doc_classification_fasttext():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification_fasttext")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    # load fasttext from a local path:
    # fasttext_model = "../saved_models/fasttext-german-uncased"
    # or through s3
    fasttext_model = "fasttext-german-uncased"
    do_lower_case = True
    max_features = 10_000  # maximum number of unique words we will transform
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. To make fasttext work within FARM and with advanced aggregation strategies,
    #    we need a fixed vocabulary and the associated word embeddings
    ft_converter = Fasttext_converter(
        pretrained_model_name_or_path=fasttext_model,
        do_lower_case=do_lower_case,
        data_path=Path("../data/germeval18"),
        train_filename="train.tsv",
        output_path=Path("../saved_models/fasttext-german-uncased-converted"),
        language="German",
        max_features=max_features)
    # We convert the data to get a fixed-size vocabulary and the matching embeddings
    vocab_counts = ft_converter.convert_on_data()

    # 2. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=ft_converter.output_path,
        do_lower_case=do_lower_case)

    # 3. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    Here we load the GermEval 2018 data.
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=ft_converter.data_path,
        label_list=label_list,
        train_filename=ft_converter.train_filename,
        dev_split=0,
        test_filename="test.tsv",
        metric=metric,
        label_column_name="coarse_label")

    # 4. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size,
        max_processes=1)  # multiprocessing with the WordembeddingTokenizer is not optimal - so disable it

    # 5. Create an AdaptiveModel
    # a) which consists of the newly created embedding model as a basis.
    language_model = LanguageModel.load(ft_converter.output_path)
    # b) and a prediction head on top that is suited for our task => Text classification
    #    Since we do not have a powerful Transformer-based language model, we need a slightly
    #    deeper NN for the classification
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, len(label_list)],
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 6. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-3,
        device=device,
        # for streaming data silos, use len(data_silo.get_data_loader("train")) instead of len(data_silo.loaders["train"])
        n_batches=len(data_silo.get_data_loader("train")),
        n_epochs=n_epochs)

    # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    #    and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 8. Let it grow
    trainer.train()
    # Use the trained model for inference on a few example texts
    # (assumption: the Inferencer is built directly from the model and processor trained above)
    inferencer = Inferencer(
        model=model,
        processor=processor,
        batch_size=4,
        gpu=True,
        # TODO: how to mix for multihead?
        task_type="classification")
    basic_texts = [
        {"text": "Some text you want to classify"},
        {"text": "A second sample"},
    ]
    ret = inferencer.inference_from_dicts(basic_texts)
    logger.info(f"Result of inference: {ret}")

    logger.info("Evaluating on training set...")
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("train"),
                          tasks=processor.tasks,
                          device=device)
    result = evaluator.eval(inferencer.model, return_preds_and_labels=True)
    evaluator.log_results(result, "Train",
                          steps=len(data_silo.get_data_loader("train")))

    inferencer.close_multiprocessing_pool()
    logger.info("PROCESSING FINISHED")
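

# Hedged sketch (illustrative helper): pulling predicted labels out of the Inferencer output.
# It assumes FARM's usual text-classification result shape -- a list of dicts, each carrying a
# "predictions" list whose entries have "label" and "probability" -- adjust if your head differs.
def extract_labels(inference_result):
    labels = []
    for batch in inference_result:
        for pred in batch.get("predictions", []):
            labels.append((pred.get("label"), pred.get("probability")))
    return labels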