def __init__(
    self,
    tokenizer,
    max_seq_len,
    data_dir,
    label_list=None,
    metric=None,
    train_filename="train.tsv",
    dev_filename=None,
    test_filename="test.tsv",
    dev_split=0.1,
    delimiter="\t",
    quote_char=csv.QUOTE_NONE,
    skiprows=None,
    label_column_names=[],
    label_names=[],
    multilabel=False,
    header=0,
    proxies=None,
    max_samples=None,
    text_column_name="text",
    **kwargs,
):
    self.delimiter = delimiter
    self.quote_char = quote_char
    self.skiprows = skiprows
    self.header = header
    self.max_samples = max_samples
    self.text_column_name = text_column_name

    super(TextClassificationProcessor, self).__init__(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        dev_split=dev_split,
        data_dir=data_dir,
        tasks={},
        proxies=proxies,
    )

    if metric is None:
        metric = "classification_metrics"
        register_metrics(metric, classification_metrics)

    if multilabel:
        task_type = "multilabel_classification"
    else:
        task_type = "classification"

    data = read_tsv(os.path.join(data_dir, train_filename))
    if label_column_names and label_names:
        for col_name, l_name in zip(label_column_names, label_names):
            self.add_task(
                name=l_name,
                metric=metric,
                label_list=list(set(data[col_name])),
                label_column_name=col_name,
                task_type=task_type,
                label_name=l_name,
            )
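# A hedged usage sketch for the constructor above: with `label_column_names` and `label_names`
# supplied, one task is registered per label column, and each task's label list is derived from
# the training file. The model name, data directory and column names below are illustrative
# assumptions, not taken from the original code base:
tokenizer = Tokenizer.load(pretrained_model_name_or_path="bert-base-uncased", do_lower_case=True)
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir="data/my_dataset",                     # hypothetical data directory
    train_filename="train.tsv",
    label_column_names=["sentiment", "topic"],      # hypothetical label columns in train.tsv
    label_names=["sentiment_label", "topic_label"],
    multilabel=False,
    text_column_name="text")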
def register_task_metrics(label_list):
    register_metrics('binary_classification_metrics', binary_classification_metrics)
    register_metrics('multiclass_classification_metrics', multiclass_classification_metrics)
    register_multilabel_classification_metrics_3_digits_only(
        'binary_classification_metrics_3_digits_only', label_list)
    register_multilabel_classification_metrics_i2b2_only(
        'binary_classification_metrics_i2b2_only', label_list)
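# `register_metrics` is used throughout these snippets to make a metric callable addressable by a
# string name. The real implementation lives in farm.evaluation.metrics; the following is only a
# minimal sketch of the idea (a module-level registry dict), not FARM's actual code:
_metric_registry = {}

def register_metrics_sketch(name, implementation):
    # store the callable under its string name so processors can later refer to it via metric="..."
    _metric_registry[name] = implementation

def compute_metrics_sketch(name, preds, labels):
    # look up the registered callable and apply it to predictions and gold labels
    return _metric_registry[name](preds, labels)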
def register_multilabel_classification_metrics_i2b2_only(metric_name, label_list):
    def multilabel_classification_metrics_i2b2_only(preds, probs, labels, multilabel):
        mask = list(map(utils.is_i2b2_code, label_list))
        logger.info(f"Evaluate on {mask.count(True)} i2b2 codes.")
        return binary_classification_metrics(preds[:, mask],
                                             [prob[mask] for prob in probs],
                                             labels[:, mask],
                                             multilabel)

    register_metrics(metric_name, multilabel_classification_metrics_i2b2_only)
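# Once registered, the i2b2-only metric is addressed by its string name like any other metric.
# A hedged usage sketch; the label list below is hypothetical and only illustrates that
# utils.is_i2b2_code is expected to pick out the i2b2 concept labels from the full label list:
example_label_list = ["250.00", "401.9", "OBESITY", "CAD"]
register_multilabel_classification_metrics_i2b2_only(
    'binary_classification_metrics_i2b2_only', example_label_list)
# a processor task could then reference it, e.g.
# processor.add_task(name="codes", metric='binary_classification_metrics_i2b2_only', ...)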
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has a train.tsv and test.tsv dataset - no dev.tsv
    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name and that string name
    # must be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance.
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training.
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
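# The FARM tutorial scripts typically end with a module-level entry point; a minimal runner for
# the example above (assuming it lives in its own script) could look like this:
if __name__ == "__main__":
    doc_classification_with_earlystopping()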
                    sep='\t', index=False)
    eval_df.to_csv(os.path.join(multitransquest_config['cache_dir'], "eval.tsv"),
                   header=True,
                   sep='\t',
                   index=False)

    set_all_seeds(seed=SEED * i)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = multitransquest_config['num_train_epochs']
    batch_size = multitransquest_config['train_batch_size']
    evaluate_every = multitransquest_config['evaluate_during_training_steps']
    lang_model = MODEL_NAME

    register_metrics(name="pearson_correlation", implementation=pearson_corr)

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextPairRegressionProcessor(
        tokenizer=tokenizer,
        label_list=None,
        metric="pearson_correlation",
        max_seq_len=multitransquest_config['max_seq_length'],
        train_filename="train.tsv",
        dev_filename="eval.tsv",
        test_filename=None,
        data_dir=Path(multitransquest_config['cache_dir']),
        delimiter="\t")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
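    # The snippet above stops after building the DataSilo. A minimal sketch of the remaining model
    # and trainer setup, mirroring the classification examples in this file; the RegressionHead,
    # the "per_sequence_continuous" output type and the learning-rate config key are assumptions,
    # not taken from the original script:
    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead()
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=multitransquest_config.get('learning_rate', 2e-5),  # key assumed to exist
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)
    trainer.train()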
def perform_fine_tuning(current_info_need,
                        bert_model,
                        label_list,
                        num_epochs,
                        condition,
                        folds=10,
                        stratified=True,
                        learning_rate=2e-5,
                        batch_size=32,
                        embeds_dropout_prob=.1):

    ## Define evaluation metrics ##
    def evaluation_metrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="Other")
        f1infoneed = f1_score(y_true=labels, y_pred=preds, pos_label=current_info_need)
        recall_infoneed = recall_score(y_true=labels, y_pred=preds, pos_label=current_info_need)
        precision_infoneed = precision_score(y_true=labels, y_pred=preds, pos_label=current_info_need)
        recall_other = recall_score(y_true=labels, y_pred=preds, pos_label="Other")
        precision_other = precision_score(y_true=labels, y_pred=preds, pos_label="Other")
        recall_macro = recall_score(y_true=labels, y_pred=preds, average="macro")
        precision_macro = precision_score(y_true=labels, y_pred=preds, average="macro")
        recall_micro = recall_score(y_true=labels, y_pred=preds, average="micro")
        precision_micro = precision_score(y_true=labels, y_pred=preds, average="micro")
        recall_weighted = recall_score(y_true=labels, y_pred=preds, average="weighted")
        precision_weighted = precision_score(y_true=labels, y_pred=preds, average="weighted")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        f1weighted = f1_score(y_true=labels, y_pred=preds, average="weighted")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "info_need": current_info_need,
            "model": bert_model,
            "num_epochs": num_epochs,
            "condition": condition,
            "acc": acc,
            "f1_other": f1other,
            "f1_infoneed": f1infoneed,
            "precision_infoneed": precision_infoneed,
            "recall_infoneed": recall_infoneed,
            "recall_other": recall_other,
            "precision_other": precision_other,
            "recall_macro": recall_macro,
            "precision_macro": precision_macro,
            "recall_micro": recall_micro,
            "precision_micro": precision_micro,
            "recall_weighted": recall_weighted,
            "precision_weighted": precision_weighted,
            "f1_weighted": f1weighted,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics(
        f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs',
        evaluation_metrics)
    metric = f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs'

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    logger, ml_logger = init_logging()
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=bert_model,
                               do_lower_case=False)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        train_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_train.csv",
        test_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_test.csv",
        data_dir="data/",
        label_list=label_list,
        metric=metric,
        text_column_name="utterance",
        label_column_name=level,
        delimiter=";")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    silos = DataSiloForCrossVal.make(data_silo, n_splits=folds, sets=['train', 'test'])

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")

        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(bert_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=embeds_dropout_prob,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=num_epochs,
            use_amp=None)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful
        # plant and evaluates it from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us afterwards to use the
        # n best fold models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_infoneed",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=num_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=100,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()
        return trainer.model

    # for each fold, run the whole training, early stopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_info_need = -1

    language_model_name = bert_model
    if language_model_name.find("/") != -1:
        language_model_name = language_model_name.replace("/", "_")
    save_dir = Path(
        f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}"
    )
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_info_need = result[0]["f1_infoneed"]
        if f1_info_need > bestf1_info_need:
            bestf1_info_need = f1_info_need
            bestfold = num_fold

        # empty cache to avoid memory leaks and CUDA OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    with open(
            f"classification_results/test/{current_info_need}-{language_model_name}-{condition}-{num_epochs}_epochs-{folds}-fold-cv.results.json",
            "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_other = f1_score(all_labels, all_preds, labels=label_list, pos_label="Other")
    xval_f1_info_need = f1_score(all_labels, all_preds, labels=label_list, pos_label=current_info_need)
    xval_f1_micro = f1_score(all_labels, all_preds, labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds, labels=label_list, average="macro")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    xval_overall_results = {
        "xval_f1_other": xval_f1_other,
        "xval_f1_infoneed": xval_f1_info_need,
        "xval_f1_micro": xval_f1_micro,
        "xval_f1_macro": xval_f1_macro,
        "xval_f1_mcc": xval_mcc
    }

    logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
    logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
    logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
    logger.info(f"XVAL F1 {current_info_need} {condition} {num_epochs} epochs: {xval_f1_info_need}")
    logger.info(f"XVAL MCC: {xval_mcc}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)

    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(
        f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}-{bestfold}"
    )
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info("TEST F1 MICRO: {}".format(result[0]["f1_micro"]))
    logger.info("TEST F1 MACRO: {}".format(result[0]["f1_macro"]))
    logger.info("TEST F1 OTHER: {}".format(result[0]["f1_other"]))
    logger.info("TEST F1 {0}: {1}".format(current_info_need, result[0]["f1_infoneed"]))
    logger.info("TEST MCC: {}".format(result[0]["mcc"]))

    test_set_results = {
        "test_f1_other": result[0]["f1_other"],
        "test_f1_infoneed": result[0]["f1_infoneed"],
        "test_f1_micro": result[0]["f1_micro"],
        "test_f1_macro": result[0]["f1_macro"],
        "test_f1_mcc": result[0]["mcc"]
    }
def doc_classification(task,
                       model_type,
                       n_epochs,
                       batch_size,
                       embeds_dropout,
                       evaluate_every,
                       use_cuda,
                       max_seq_len,
                       learning_rate,
                       do_lower_case,
                       register_model,
                       save_model=True,
                       early_stopping=False):

    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'classification':
        raise Exception('NOT A CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir', header=None)[0].to_list()

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except Exception:
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name and that string name
    # must be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log
        try:
            aml_run.log('acc', acc.get('acc'))
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except Exception:
            pass
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=dt_task.data_dir,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'))

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    ## Pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    ## Prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=embeds_dropout,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        learning_rate=learning_rate,
        use_amp=use_amp)
    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if early_stopping:
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
            # metric="loss", mode="min",    # use loss from the dev evaluator of the trainer
            save_dir=save_dir,  # where to save the best model
            patience=2  # number of evaluations to wait for improvement before terminating the training
        )
    else:
        earlystopping = None

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance.
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training.
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

    if register_model:
        dt_task.upload('model_dir', destination='model')
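# A hedged example of how this training entry point might be invoked; the task name and the
# hyperparameter values below are illustrative only and not taken from the original project:
if __name__ == "__main__":
    doc_classification(task="demo_task",   # hypothetical task name registered in cu.tasks
                       model_type="bert",
                       n_epochs=3,
                       batch_size=32,
                       embeds_dropout=0.1,
                       evaluate_every=100,
                       use_cuda=True,
                       max_seq_len=128,
                       learning_rate=3e-5,
                       do_lower_case=False,
                       register_model=False,
                       save_model=True,
                       early_stopping=True)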
def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name and that string name
    # must be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has a train.tsv and test.tsv dataset - no dev.tsv
    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")

        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful
        # plant and evaluates it from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us afterwards to use the
        # n best fold models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()
        return trainer.model

    # for each fold, run the whole training, early stopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels, all_preds, labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds, labels=label_list, average="macro")
    xval_f1_offense = f1_score(all_labels, all_preds, labels=label_list, pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels, all_preds, labels=label_list, pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
    logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
    logger.info(f"XVAL F1 OFFENSE: {xval_f1_offense}")
    logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
    logger.info(f"XVAL MCC: {xval_mcc}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)

    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST F1 MICRO: {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO: {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER: {result[0]['f1_other']}")
    logger.info(f"TEST MCC: {result[0]['mcc']}")
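# As with the other FARM tutorial scripts, a module-level entry point is assumed so the
# cross-validation run can be started directly:
if __name__ == "__main__":
    doc_classification_crossvalidation()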
def __init__(
    self,
    tokenizer,
    max_seq_len,
    data_dir,
    metric=None,
    train_filename="train.tsv",
    dev_filename=None,
    test_filename="test.tsv",
    dev_split=0.1,
    delimiter="\t",
    label_column_names=[],
    label_names=[],
    scaler_mean=None,
    scaler_scale=None,
    proxies=None,
    **kwargs,
):
    """
    :param tokenizer: Used to split a sentence (str) into tokens.
    :param max_seq_len: Samples are truncated after this many tokens.
    :type max_seq_len: int
    :param data_dir: The directory in which the train and dev files can be found.
    :type data_dir: str
    :param metric: name of the metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
                   Alternatively you can also supply a custom function that takes preds and labels as args
                   and returns a numerical value. To use multiple metrics, supply them as a list,
                   e.g. ["acc", my_custom_metric_fn].
    :type metric: str, function, or list
    :param train_filename: The name of the file containing training data.
    :type train_filename: str
    :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0,
                         the dev set will be a slice of the train set.
    :type dev_filename: str or None
    :param test_filename: The name of the file containing test data.
    :type test_filename: str
    :param dev_split: The proportion of the train set that will be sliced off as dev set.
                      Only works if dev_filename is set to None.
    :type dev_split: float
    :param delimiter: Separator used in the input tsv / csv file. The German version of CoNLL03 uses a
                      whitespace, GermEval 2014 is tab separated \t.
    :type delimiter: str
    :param label_column_names: names of the columns in the input csv/tsv that shall be used as training labels
    :type label_column_names: list
    :param label_names: names for the internal label variables in FARM (only needed to adjust in rare cases)
    :type label_names: list
    :param scaler_mean: Value to subtract from the label for normalization
    :type scaler_mean: float
    :param scaler_scale: Value to divide the label by for normalization
    :type scaler_scale: float
    :param proxies: proxy configuration to allow downloads of remote datasets.
                    Format as in the "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
    :type proxies: dict
    :param kwargs: placeholder for passing generic parameters
    :type kwargs: object
    """
    # Custom processor attributes
    self.delimiter = delimiter

    super(TokenRegressionProcessor, self).__init__(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        dev_split=dev_split,
        data_dir=data_dir,
        tasks={},
        proxies=proxies,
    )

    if metric is None:
        metric = "token_level_regression_metrics"
        register_metrics(metric, token_level_regression_metrics)

    if label_column_names and label_names:
        for col_name, l_name in zip(label_column_names, label_names):
            self.add_task(
                name=l_name,
                metric=metric,
                label_list=[scaler_mean, scaler_scale],
                label_column_name=col_name,
                task_type="token_regression",
                label_name=l_name,
            )
    else:
        logger.info(
            "Initialized processor without tasks. Supply `label_names` and `label_column_names` to the "
            "constructor to use the default task, or add a custom task later via processor.add_task()"
        )
def __init__(
    self,
    tokenizer,
    max_seq_len,
    data_dir,
    metric=None,
    train_filename="train.tsv",
    dev_filename=None,
    test_filename="test.tsv",
    dev_split=0.1,
    delimiter="\t",
    quote_char=csv.QUOTE_NONE,
    skiprows=None,
    label_column_names=[],
    label_names=[],
    scaler_mean=None,
    scaler_scale=None,
    proxies=None,
    start_feat_col=None,
    text_column_name="text",
    **kwargs,
):
    """
    :param tokenizer: Used to split a sentence (str) into tokens.
    :param max_seq_len: Samples are truncated after this many tokens.
    :type max_seq_len: int
    :param data_dir: The directory in which the train and dev files can be found.
    :type data_dir: str
    :param metric: name of the metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
                   Alternatively you can also supply a custom function that takes preds and labels as args
                   and returns a numerical value. To use multiple metrics, supply them as a list,
                   e.g. ["acc", my_custom_metric_fn].
    :type metric: str, function, or list
    :param train_filename: The name of the file containing training data.
    :type train_filename: str
    :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0,
                         the dev set will be a slice of the train set.
    :type dev_filename: str or None
    :param test_filename: The name of the file containing test data.
    :type test_filename: str
    :param dev_split: The proportion of the train set that will be sliced off as dev set.
                      Only works if dev_filename is set to None.
    :type dev_split: float
    :param delimiter: Separator used in the input tsv / csv file
    :type delimiter: str
    :param quote_char: Character used for quoting strings in the input tsv / csv file
    :type quote_char: str
    :param skiprows: number of rows to skip in the tsvs (e.g. for multirow headers)
    :type skiprows: int
    :param label_column_names: names of the columns in the input csv/tsv that shall be used as training labels
    :type label_column_names: list
    :param label_names: names for the internal label variables in FARM (only needed to adjust in rare cases)
    :type label_names: list
    :param scaler_mean: Value to subtract from the label for normalization
    :type scaler_mean: float
    :param scaler_scale: Value to divide the label by for normalization
    :type scaler_scale: float
    :param proxies: proxy configuration to allow downloads of remote datasets.
                    Format as in the "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
    :type proxies: dict
    :param text_column_name: name of the column in the input csv/tsv that shall be used as training text
    :type text_column_name: str
    :param kwargs: placeholder for passing generic parameters
    :type kwargs: object
    """
    # Custom processor attributes
    self.delimiter = delimiter
    self.quote_char = quote_char
    self.skiprows = skiprows
    self.text_column_name = text_column_name
    self.features = start_feat_col
    self.feat_size = None

    super(CustomRegressionProcessor, self).__init__(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        dev_split=dev_split,
        data_dir=data_dir,
        tasks={},
        proxies=proxies,
    )

    if metric is None:
        metric = "regression_metrics"
        register_metrics(metric, regression_metrics)

    if label_column_names and label_names:
        for col_name, l_name in zip(label_column_names, label_names):
            self.add_task(
                name=l_name,
                metric=metric,
                label_list=[scaler_mean, scaler_scale],
                label_column_name=col_name,
                task_type="regression",
                label_name=l_name,
            )
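# A hedged usage sketch for the constructor above; the tokenizer, file names and column names are
# illustrative assumptions, not taken from the original code base:
tokenizer = Tokenizer.load(pretrained_model_name_or_path="bert-base-uncased", do_lower_case=True)
processor = CustomRegressionProcessor(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir="data/my_regression_task",   # hypothetical data directory
    train_filename="train.tsv",
    test_filename="test.tsv",
    label_column_names=["score"],          # hypothetical label column
    label_names=["regression_label"],
    scaler_mean=3.0,                       # example normalization constants
    scaler_scale=1.5,
    text_column_name="text")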
def doc_classification_holdout():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # local logging into directory "logs"
    mlflogger = MLFlowLogger(tracking_uri="logs")
    mlflogger.init_experiment(experiment_name="Example-docclass-xval",
                              run_name="testrun1")

    ##########################
    ########## Settings
    ##########################
    holdout_splits = 5
    holdout_train_split = 0.8
    holdout_stratification = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    dev_split = 0.1
    # For holdout, the dev_stratification parameter must not be None: with None, the dev set cannot be
    # created using the default method of splitting only by the available chunks, as the initial train
    # set for each fold is just a single chunk!
    dev_stratification = True
    do_lower_case = False
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name and that string name
    # must be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has a train.tsv and test.tsv dataset - no dev.tsv
    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        dev_split=dev_split,
        dev_stratification=dev_stratification,
        label_column_name="coarse_label")
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each split of our holdout evaluation
    silos = DataSiloForHoldout.make(data_silo,
                                    sets=["train", "dev"],
                                    n_splits=holdout_splits,
                                    train_split=holdout_train_split,
                                    stratification=holdout_stratification)

    # the following steps should be run for each of the splits of the holdout evaluation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_eval, save_dir):
        logger.info(f"############ Holdout: Evaluation {n_eval} of {holdout_splits} ############")
        logger.info(f"Fold training samples: {len(silo_to_use.data['train'])}")
        logger.info(f"Fold dev samples: {len(silo_to_use.data['dev'])}")
        logger.info(f"Fold testing samples: {len(silo_to_use.data['test'])}")
        logger.info(
            "Total number of samples: "
            f"{len(silo_to_use.data['train']) + len(silo_to_use.data['dev']) + len(silo_to_use.data['test'])}"
        )

        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful
        # plant and evaluates it from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each split allows us afterwards to use the
        # n best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_eval}")
        earlystopping = EarlyStopping(
            metric="f1_offense",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()
        return trainer.model

    # for each split, run the whole training, early stopping to get a model, then evaluate the model
    # on the test set of each split
    # remember all individual evaluation results
    allresults = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        mlflow.start_run(run_name=f"split-{num_fold + 1}-of-{len(silos)}", nested=True)
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

        mlflow.end_run()
        # empty cache to avoid memory leaks and CUDA OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_holdout.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # log the best fold metric and fold
    logger.info(f"Best fold f1_offense: {bestf1_offense} in fold {bestfold}")

    # calculate overall metrics across all folds: we only have one head, so we do this only for the
    # first-head information in each of the per-fold results.
    # First create a dict where for each metric, we have a list of values from each fold
    eval_metric_lists_head0 = defaultdict(list)
    for results in allresults:
        head0results = results[0]
        for name in head0results.keys():
            if name not in ["preds", "labels"] and not name.startswith("_") and \
                    isinstance(head0results[name], numbers.Number):
                eval_metric_lists_head0[name].append(head0results[name])

    # Now calculate the mean and stdev for each metric, also copy over the task name
    eval_metric = {}
    eval_metric["task_name"] = allresults[0][0].get("task_name", "UNKNOWN TASKNAME")
    for name in eval_metric_lists_head0.keys():
        values = eval_metric_lists_head0[name]
        vmean = statistics.mean(values)
        vstdev = statistics.stdev(values)
        eval_metric[name + "_mean"] = vmean
        eval_metric[name + "_stdev"] = vstdev

    logger.info(f"HOLDOUT Accuracy: mean {eval_metric['acc_mean']} stdev {eval_metric['acc_stdev']}")
    logger.info(f"HOLDOUT F1 MICRO: mean {eval_metric['f1_micro_mean']} stdev {eval_metric['f1_micro_stdev']}")
    logger.info(f"HOLDOUT F1 MACRO: mean {eval_metric['f1_macro_mean']} stdev {eval_metric['f1_macro_stdev']}")
    logger.info(f"HOLDOUT F1 OFFENSE: mean {eval_metric['f1_offense_mean']} stdev {eval_metric['f1_offense_stdev']}")
    logger.info(f"HOLDOUT F1 OTHER: mean {eval_metric['f1_other_mean']} stdev {eval_metric['f1_other_stdev']}")
    logger.info(f"HOLDOUT MCC: mean {eval_metric['mcc_mean']} stdev {eval_metric['mcc_stdev']}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best holdout split for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)

    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST Accuracy: {result[0]['acc']}")
    logger.info(f"TEST F1 MICRO: {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO: {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER: {result[0]['f1_other']}")
    logger.info(f"TEST MCC: {result[0]['mcc']}")
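# As with the other tutorial scripts, a module-level entry point is assumed so the holdout
# evaluation can be started directly:
if __name__ == "__main__":
    doc_classification_holdout()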
TRIGGER_LABELS = ["X", "0", "1"]
LABEL_LIST = ["not sw", "sw"]

processor = MTLProcessor(data_dir=".",
                         tokenizer=tokenizer,
                         max_seq_len=128,
                         train_filename=TRAIN_FILE,
                         test_filename=TEST_FILE,
                         delimiter=",")

from farm.evaluation.metrics import register_metrics

register_metrics('f1_weighted', custom_f1_score)
metric = 'f1_weighted'

processor.add_task(name="document_level_task",
                   label_list=LABEL_LIST,
                   metric="acc",
                   text_column_name="text",
                   label_column_name="label",
                   task_type="classification")
processor.add_task(name="token_level_task",
                   label_list=TRIGGER_LABELS,
                   metric=metric,
                   text_column_name="text",
                   label_column_name="tokens",
                   task_type="ner")

data_silo = DataSilo(processor=processor, batch_size=BATCH_SIZE)

language_model = LanguageModel.load(LANG_MODEL)
document_level_task_head = TextClassificationHead(num_labels=len(LABEL_LIST),
                                                  task_name="document_level_task")
token_level_task_head = TokenClassificationHead(num_labels=len(TRIGGER_LABELS),
                                                task_name="token_level_task")
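# The multitask snippet above stops after defining the two prediction heads. A hedged sketch of
# how they could be combined into one AdaptiveModel and trained, following the same pattern as the
# single-task examples in this file (device setup, dropout, learning rate and epoch count are
# illustrative assumptions):
device, n_gpu = initialize_device_settings(use_cuda=True)
model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[document_level_task_head, token_level_task_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence", "per_token"],
                      device=device)
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=3)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  data_silo=data_silo,
                  epochs=3,
                  n_gpu=n_gpu,
                  lr_schedule=lr_schedule,
                  evaluate_every=100,
                  device=device)
trainer.train()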