def train_on_split(silo_to_use, n_fold, save_dir):
    logger.info(f"############ Crossvalidation: Fold {n_fold} ############")

    # Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(silo_to_use.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    # and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    # NOTE: Using a different save directory for each fold allows us to use the
    # n_folds best models in an ensemble afterwards!
    save_dir += f"-{n_fold}"
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        save_dir=save_dir,  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(optimizer=optimizer,
                      data_silo=silo_to_use,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping,
                      evaluator_test=False)

    # train it
    model = trainer.train(model)
    return model
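# Usage sketch (assumption, modelled on FARM's cross-validation example): train_on_split() above
# is meant to be called once per fold on silos produced by DataSiloForCrossVal. The names
# data_silo, xval_folds and save_dir are assumed to exist in the surrounding script.
import torch
from farm.data_handler.data_silo import DataSiloForCrossVal

silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)
for num_fold, silo in enumerate(silos):
    model = train_on_split(silo, num_fold, save_dir)
    # free GPU memory between folds
    del model
    torch.cuda.empty_cache()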
def train_on_split(silo_to_use, n_fold, save_dir, dev):
    # Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = MultiLabelTextClassificationHead(
        # there is still an error with class weights ...
        # class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.2,
        lm_output_types=["per_sequence"],
        device=dev)

    # Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=dev,
        n_batches=len(silo_to_use.loaders["train"]),
        n_epochs=n_epochs)

    # Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    # and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    save_dir = Path(str(save_dir) + f"-{n_fold}")
    # unfortunately, early stopping is still not working
    earlystopping = EarlyStopping(
        metric="f1_macro", mode="max",
        save_dir=save_dir,  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=silo_to_use,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=dev,
                      evaluator_test=False,
                      # early_stopping=earlystopping,
                      )

    # train it
    trainer.train()
    trainer.model.save(save_dir)
    return trainer.model
def doc_classification(task_config, model_name_or_path, cache_dir, data_dir, save_dir, model_dir,
                       run_name="0", lr=1e-05, warmup_steps=5000, balance_classes=True,
                       embeds_dropout=0.1,
                       epochs=200,  # large because we use early stopping by default
                       batch_size=20, grad_acc_steps=1,
                       early_stopping_metric="roc_auc", early_stopping_mode="max",
                       early_stopping_patience=10,
                       model_class="Bert", tokenizer_class="BertTokenizer", do_lower_case=False,
                       do_train=True, do_eval=True, do_hpo=False,
                       print_preds=False, print_dev_preds=False,
                       max_seq_len=512, seed=11, eval_every=500,
                       use_amp=False, use_cuda=True):
    # Load task config
    task_config = yaml.safe_load(open(task_config))

    data_dir = data_dir
    save_dir = save_dir
    model_dir = model_dir

    # Create label list from args list or (for large label lists) create from file by splitting by space
    if isinstance(task_config["data"]["label_list"], list):
        label_list = task_config["data"]["label_list"]
    else:
        with open(data_dir / 'labels' / task_config["data"]["label_list"]) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name_or_path,
                               tokenizer_class=tokenizer_class,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=task_config["data"]["train_filename"],
        dev_filename=task_config["data"]["dev_filename"],
        dev_split=task_config["data"]["dev_split"] if "dev_split" in task_config["data"] else None,
        test_filename=task_config["data"]["test_filename"],
        delimiter=task_config["data"]["parsing"]["delimiter"],
        quote_char=task_config["data"]["parsing"]["quote_char"],
        label_column_name=task_config["data"]["parsing"]["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         caching=True,
                         cache_path=Path(cache_dir),
                         batch_size=batch_size)

    if do_train:
        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(experiment_name=task_config["experiment_name"],
                                  run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path, language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task
        # Define class weights
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create Multi- or Single-Label Classification Heads
        if task_config["multilabel"]:
            prediction_head = MultiLabelTextClassificationHead(class_weights=class_weights,
                                                               num_labels=len(label_list))
        else:
            prediction_head = ExtendedTextClassificationHead(class_weights=class_weights,
                                                             num_labels=len(label_list))

        model = ExtendedAdaptiveModel(language_model=language_model,
                                      prediction_heads=[prediction_head],
                                      embeds_dropout_prob=embeds_dropout,
                                      lm_output_types=[task_config["output_type"]],
                                      device=device)

        # 5. Create an optimizer
        schedule_opts = {"name": "LinearWarmup", "num_warmup_steps": warmup_steps}
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=model_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
        # and evaluates it from time to time
        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(model_dir + "/final_model")
        processor.save(model_dir + "/final_model")

    if do_eval:
        # Load newly trained model or existing model
        if do_train:
            model_dir = model_dir
        else:
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")
        evaluator_test = ExtendedEvaluator(data_loader=data_silo.get_data_loader("test"),
                                           tasks=data_silo.processor.tasks,
                                           device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir + "/eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results, save_dir=model_dir, multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                                      tasks=data_silo.processor.tasks,
                                      device=device)
            dev_results = evaluator_dev.eval(model, return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=model_dir + "/eval_dev_results.txt")
            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
def train_on_split(args, silo, processor, fold=None):
    if args.folds > 1:
        args.logger.info(f"############ Crossvalidation: Fold {fold} ############")

    language_model = CustomLanguageModel.load(args.model_name,
                                              language_model_class=args.model_class_name)
    if args.prediction_layer > 0:
        language_model.enable_hidden_states_output()

    prediction_heads, out_types = load_prediction_heads(args, silo)

    # Sum all by default
    loss_fct = None if args.task_weights is None else compute_weighted_loss(args.task_weights,
                                                                             args.label_columns)

    # Create an AdaptiveModel = LM + prediction head(s)
    model = CustomAdaptiveModel(
        language_model=language_model,
        prediction_heads=prediction_heads,
        embeds_dropout_prob=args.embed_dropout_prob,
        lm_output_types=out_types,
        device=args.device,
        loss_aggregation_fn=loss_fct,
        head_feats=args.do_feat_embeds,
        freeze_model=args.freeze_model,
        custom_pooling_strategy=args.pooling_strategy,
        prediction_layer=args.prediction_layer,
    )

    # Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=args.learning_rate,
        n_batches=len(silo.loaders["train"]),
        n_epochs=args.num_train_epochs,
        device=args.device,
        grad_acc_steps=args.grad_acc_steps,
    )

    # Setup early stopping
    earlystopping = None
    fold_save_dir = f"{args.save_dir}_{fold}" if args.folds > 1 else args.save_dir
    if args.patience is not None:
        earlystopping = EarlyStopping(metric="loss",
                                      mode="min",
                                      save_dir=fold_save_dir,
                                      patience=args.patience)

    # Feed everything to the trainer
    trainer = MultitaskTrainer(
        model=model,
        optimizer=optimizer,
        data_silo=silo,
        epochs=args.num_train_epochs,
        n_gpu=args.n_gpu,
        device=args.device,
        lr_schedule=lr_schedule,
        evaluate_every=args.evaluate_every,
        early_stopping=earlystopping,
        evaluator_test=False,
        eval_report=False,
    )

    # Let it grow
    trainer.train()

    if args.patience is None:
        # Store the model, only if it wasn't already saved by early stopping
        model.save(fold_save_dir)
    processor.save(fold_save_dir)
    return trainer.model
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv datasets - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    # and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has already been stored in the directory
    # defined with the EarlyStopping instance.
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training.
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
def doc_classification():
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    tokenizer = AutoTokenizer.from_pretrained(lang_model, strip_accents=False)
    # tokenizer = Tokenizer.load(
    #     pretrained_model_name_or_path=lang_model,
    #     do_lower_case=do_lower_case)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("./data/germeval18"),
        label_list=label_list,
        metric=metric,
        dev_filename="test.tsv",  # we want to evaluate against test
        label_column_name="coarse_label",
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    earlystopping = EarlyStopping(
        metric=metric, mode="max",
        # save_dir=Path("./saved_models"),
        patience=3
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        early_stopping=earlystopping,
        device=device)

    trainer.train()

    return earlystopping.best_so_far
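# Note: the snippet above reads several module-level settings that are not defined inside the
# function. A minimal sketch of what they could look like; the concrete values are assumptions
# for illustration, not part of the original snippet:
use_amp = None
lang_model = "bert-base-german-cased"  # assumed German BERT checkpoint
do_lower_case = False
label_list = ["OTHER", "OFFENSE"]      # GermEval 2018 coarse labels
metric = "f1_macro"                    # must match the metric name EarlyStopping watches
batch_size = 32
n_epochs = 4
evaluate_every = 100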
def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, evaluate_every,
                       use_cuda, max_seq_len, learning_rate, do_lower_case, register_model,
                       save_model=True, early_stopping=False):
    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'classification':
        raise Exception('NOT A CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir', header=None)[0].to_list()

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except:
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log
        try:
            aml_run.log('acc', acc.get('acc'))
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except:
            pass
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=dt_task.data_dir,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'))

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    ## Pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    ## Prediction head on top that is suited for our task => text classification
    prediction_head = TextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=embeds_dropout,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        learning_rate=learning_rate,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    # and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if early_stopping:
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
            # metric="loss", mode="min",    # use loss from the dev evaluator of the trainer
            save_dir=save_dir,  # where to save the best model
            patience=2  # number of evaluations to wait for improvement before terminating the training
        )
    else:
        earlystopping = None

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has already been stored in the directory
    # defined with the EarlyStopping instance.
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training.
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

    if register_model:
        dt_task.upload('model_dir', destination='model')
def main(args):
    print(f"[INFO] PyTorch Version: {torch.__version__}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("[INFO] Devices available: {}".format(device))

    checkpoint_path = Path(args.ckpt_path) / args.run_name

    ml_logger = MLFlowLogger(tracking_uri=args.tracking_uri)
    ml_logger.init_experiment(experiment_name=args.experiment_name, run_name=args.run_name)

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        do_lower_case=False)

    # Processor
    if args.task_name == "text_classification":
        processor = TextClassificationProcessor(tokenizer=tokenizer,
                                                train_filename=args.train_filename,
                                                dev_filename=None,
                                                test_filename=args.test_filename,
                                                header=0,
                                                max_seq_len=args.max_seq_len,
                                                data_dir=args.data_dir,
                                                label_list=args.label_list,
                                                metric=args.metric,
                                                label_column_name=args.label_column_name,
                                                text_column_name=args.text_column_name)
    elif args.task_name == "question_answering":
        processor = SquadProcessor(tokenizer=tokenizer,
                                   train_filename=args.train_filename,
                                   dev_filename=args.test_filename,
                                   test_filename=args.test_filename,
                                   max_seq_len=args.max_seq_len,
                                   data_dir=args.data_dir,
                                   label_list=args.label_list,
                                   metric=args.metric,
                                   max_query_length=64,
                                   doc_stride=128,
                                   max_answers=1)
    else:
        raise ValueError("task name error")
    processor.save(checkpoint_path)

    # DataSilo
    data_silo = DataSilo(processor=processor,
                         batch_size=args.batch_size,
                         eval_batch_size=args.eval_batch_size,
                         caching=True,
                         cache_path=checkpoint_path)

    # LanguageModel: Build pretrained language model
    language_model = LanguageModel.load(args.pretrained_model_name_or_path, language="korean")

    # PredictionHead: Build predictor layer
    if args.task_name == "text_classification":
        # If you do classification on imbalanced classes, consider using class weights.
        # They change the loss function to down-weight frequent classes.
        prediction_head = TextClassificationHead(
            num_labels=len(args.label_list),
            class_weights=data_silo.calculate_class_weights(task_name=args.task_name))
    elif args.task_name == "question_answering":
        prediction_head = QuestionAnsweringHead(
            layer_dims=[768, 2],
            task_name=args.task_name,
        )
    else:
        raise ValueError("task name error")

    # AdaptiveModel: Combine all
    if args.task_name == "text_classification":
        lm_output_types = ["per_sequence"]
    elif args.task_name == "question_answering":
        lm_output_types = ["per_token"]
    else:
        raise ValueError("task name error")

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=args.embeds_dropout_prob,
                          lm_output_types=lm_output_types,
                          device=device)

    # Initialize Optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        device=device,
        learning_rate=args.learning_rate,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=args.n_epochs)

    # EarlyStopping
    earlymetric = "f1" if args.task_name == "question_answering" else "acc"
    mode = "max" if args.task_name in ["text_classification", "question_answering"] else "min"
    earlystop = EarlyStopping(save_dir=checkpoint_path,
                              metric=earlymetric,
                              mode=mode,
                              patience=5)

    # Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        lr_schedule=lr_schedule,
        data_silo=data_silo,
        early_stopping=earlystop,
        evaluate_every=args.evaluate_every,
        checkpoints_to_keep=args.checkpoints_to_keep,
        checkpoint_root_dir=checkpoint_path,
        checkpoint_every=args.checkpoint_every,
        epochs=args.n_epochs,
        n_gpu=args.n_gpu,
        device=device,
    )

    # now train!
    model = trainer.train()
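# Sketch of a possible command-line interface for main() above. The option names mirror the
# args.* attributes the function reads; every default value below is an assumption added for
# illustration, not taken from the original snippet.
import argparse
from pathlib import Path

def build_parser():
    p = argparse.ArgumentParser()
    p.add_argument("--task_name", choices=["text_classification", "question_answering"], default="text_classification")
    p.add_argument("--pretrained_model_name_or_path", default="bert-base-multilingual-cased")
    p.add_argument("--data_dir", type=Path, default=Path("data"))
    p.add_argument("--train_filename", default="train.tsv")
    p.add_argument("--test_filename", default="test.tsv")
    p.add_argument("--label_list", nargs="+", default=["0", "1"])
    p.add_argument("--label_column_name", default="label")
    p.add_argument("--text_column_name", default="text")
    p.add_argument("--metric", default="acc")
    p.add_argument("--max_seq_len", type=int, default=128)
    p.add_argument("--batch_size", type=int, default=32)
    p.add_argument("--eval_batch_size", type=int, default=64)
    p.add_argument("--embeds_dropout_prob", type=float, default=0.1)
    p.add_argument("--learning_rate", type=float, default=3e-5)
    p.add_argument("--n_epochs", type=int, default=3)
    p.add_argument("--n_gpu", type=int, default=1)
    p.add_argument("--evaluate_every", type=int, default=500)
    p.add_argument("--checkpoint_every", type=int, default=1000)
    p.add_argument("--checkpoints_to_keep", type=int, default=3)
    p.add_argument("--ckpt_path", default="checkpoints")
    p.add_argument("--run_name", default="run_0")
    p.add_argument("--experiment_name", default="farm_experiment")
    p.add_argument("--tracking_uri", default="logs")
    return p

if __name__ == "__main__":
    main(build_parser().parse_args())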
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # We do not have a sample dataset for regression yet, add your own dataset to run the example
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="acc",
        label_column_name="label",
        max_seq_len=64,
        train_filename=training_filename,
        dev_filename=test_filename,
        test_filename=test_filename,
        data_dir=Path("../data"),
        tasks={"text_classification"},
        delimiter="\t")
        # train_filename=training_filename,
        # test_filename=test_filename,
        # dev_filename=test_filename,
        # dev_split=0.5,
        # data_dir=Path("../data/asnq_binary"),

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    # Old version before StreamingDataSilo:
    # data_silo = DataSilo(
    #     processor=processor,
    #     batch_size=batch_size, max_processes=4)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-6,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    now = datetime.now()  # current date and time

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        # metric="f1_weighted", mode="max",  # use f1_macro from the dev evaluator of the trainer
        metric="loss", mode="min",  # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/earlystopping/" + now.strftime("%m%d%Y%H%M%S")),  # where to save the best model
        patience=8  # number of evaluations to wait for improvement before terminating the training
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
    # and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)
                      # model=model,
                      # optimizer=optimizer,
                      # data_silo=data_silo,
                      # epochs=n_epochs,
                      # n_gpu=n_gpu,
                      # lr_schedule=lr_schedule,
                      # evaluate_every=evaluate_every,
                      # device=device)

    # 7. Let it grow
    # comment this out if going to use a stored model
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    # When a new model is being trained and needs to be saved
    save_dir = Path("saved_models/text_pair_classification_model" + now.strftime("%m%d%Y%H%M%S"))
    model.save(save_dir)
    processor.save(save_dir)
    # When only a model needs to be loaded, change the details to load the needed model
    # save_dir = Path("saved_models/text_pair_classification_model" + "01272021103548")

    # 9. Load it & harvest your fruits (Inference)
    # Add your own text adapted to the dataset you provide
    basic_texts = [
        {
            "text": "<claim-text>The method of claim 10, wherein the indium metal layer is 10 nm to 100 µm thick.</claim-text>",
            "text_b": "<p id=" "p0001" " num=" "0001" ">The present invention is directed to metal plating compositions and methods. More specifically, the present invention is directed to metal plating compositions and methods which provide improved leveling and throwing power.</p <p id=" "p0039" " num=" "0039" ">One or more conventional surfactants may be used. Typically, surfactants include, but are not limited to, nonionic surfactants such as alkyl phenoxy polyethoxyethanols. Other suitable surfactants containing multiple oxyethylene groups also may be used. Such surfactants include compounds of polyoxyethylene polymers having from as many as 20 to 150 repeating units. Such compounds also may perform as suppressors. Also included in the class of polymers are both block and random copolymers of polyoxyethylene (EO) and polyoxypropylene (PO). Surfactants may be added in conventional amounts, such as from 0.05 g/L to 20 g/L or such as from 0.5 g/L to 5 g/L.</p <p id=" "p0040" " num=" "0040" ">Conventional levelers include, but are not limited to, one or more of alkylated polyalkyleneimines and organic sulfo sulfonates. Examples of such compounds include, 4-mercaptopyridine, 2-mercaptothiazoline, ethylene thiourea, thiourea, 1-(2-hydroxyethyl)-2-imidazolidinethion (HIT) and alkylated polyalkyleneimines. Such levelers are included in conventional amounts. Typically, such levelers are included in amounts of 1ppb to 1 g/L, or such as from 10ppb to 500ppm.</p <p id=" "p0042" " num=" "0042" ">Alkali metal salts which may be included in the plating compositions include, but are not limited to, sodium and potassium salts of halogens, such as chloride, fluoride and bromide. Typically chloride is used. Such alkali metal salts are used in conventional amounts.</p <p id=" "p0053" " num=" "0053" ">The metal plating compositions may be used to plate a metal or metal alloy on a substrate by any method known in the art and literature. Typically, the metal or metal alloy is electroplated using conventional electroplating processes with conventional apparatus. A soluble or insoluble anode may be used with the electroplating compositions.</p <p id=" "p0022" " num=" "0022" ">One or more sources of metal ions are included in metal plating compositions to plate metals. The one or more sources of metal ions provide metal ions which include, but are not limited to, copper, tin, nickel, gold, silver, palladium, platinum and indium. Alloys include, but are not limited to, binary and ternary alloys of the foregoing metals. Typically, metals chosen from copper, tin, nickel, gold, silver or indium are plated with the metal plating compositions. More typically, metals chosen from copper, tin, silver or indium are plated. Most typically, copper is plated.</p <p id=" "p0030" " num=" "0030" ">Indium salts which may be used include, but are not limited to, one or more of indium salts of alkane sulfonic acids and aromatic sulfonic acids, such as methanesulfonic acid, ethanesulfonic acid, butane sulfonic acid, benzenesulfonic acid and toluenesulfonic acid, salts of sulfamic acid, sulfate salts, chloride and bromide salts of indium, nitrate salts, hydroxide salts, indium oxides, fluoroborate salts, indium salts of carboxylic acids, such as citric acid, acetoacetic acid, glyoxylic acid, pyruvic acid, glycolic acid, malonic acid, hydroxamic acid, iminodiacetic acid, salicylic acid, glyceric acid, succinic acid, malic acid, tartaric acid, hydroxybutyric acid, indium salts of amino acids, such as arginine, aspartic acid, asparagine, glutamic acid, glycine, glutamine, leucine, lysine, threonine, isoleucine, and valine.</p"
        },
        {
            "text": "<claim-text>A toner comprising: <claim-text>toner base particles; and</claim-text> <claim-text>an external additive,</claim-text> <claim-text>the toner base particles each comprising a binder resin and a colorant,</claim-text> <claim-text>wherein the external additive comprises coalesced particles,</claim-text> <claim-text>wherein the coalesced particles are each a non-spherical secondary particle in which primary particles are coalesced together, and</claim-text> <claim-text>wherein an index of a particle size distribution of the coalesced particles is expressed by the following Formula (1): <maths id=" "math0004" " num=" "(formula (1)" "><math display=" "block" "><mfrac><msub><mi>Db</mi><mn>50</mn></msub><msub><mi>Db</mi><mn>10</mn></msub></mfrac><mo>≦</mo><mn>1.20</mn></math><img id=" "ib0008" " file=" "imgb0008.tif" " wi=" "93" " he=" "21" " img-content=" "math" " img-format=" "tif" "/></maths><br/> where, in a distribution diagram in which particle diameters in nm of the coalesced particles are on a horizontal axis and cumulative percentages in % by number of the coalesced particles are on a vertical axis and in which the coalesced particles are accumulated from the coalesced particles having smaller particle diameters to the coalesced particles having larger particle diameters, Db<sub>50</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 50% by number, and Db<sub>10</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 10% by number.</claim-text></claim-text>",
            "text_b": "<p id=" "p0177" " num=" "0177" ">For a similar reason, it is preferred that the electroconductive fine powder has a volume-average particle size of 0.5 - 5 µm, more preferably 0.8 - 5 µm, further preferably 1.1 - 5 µm and has a particle size distribution such that particles of 0.5 µm or smaller occupy at most 70 % by volume and particles of 5.0 µm or larger occupy at most 5 % by number.</p <p id=" "p0189" " num=" "0189" ">The volume-average particle size and particle size distribution of the electroconductive fine powder described herein are based on values measured in the following manner. A laser diffraction-type particle size distribution measurement apparatus (" "Model LS-230" ", available from Coulter Electronics Inc.) is equipped with a liquid module, and the measurement is performed in a particle size range of 0.04 - 2000 µm to obtain a volume-basis particle size distribution. For the measurement, a minor amount of surfactant is added to 10 cc of pure water and 10 mg of a sample electroconductive fine powder is added thereto, followed by 10 min. of dispersion by means of an ultrasonic disperser (ultrasonic homogenizer) to obtain a sample dispersion liquid, which is subjected to a single time of measurement for 90 sec.</p <p id=" "p0191" " num=" "0191" ">In the case where the electroconductive fine powder is composed of agglomerate particles, the particle size of the electroconductive fine powder is determined as the particle size of the agglomerate. The electroconductive fine powder in the form of agglomerated secondary particles can be used as well as that in the form of primary particles. Regardless of its agglomerated form, the electroconductive fine powder can exhibit its desired function of charging promotion by presence in the form of the agglomerate in the charging section at the contact position<!-- EPO <DP n=" "85" "> --> between the charging member and the image-bearing member or in a region in proximity thereto.</p"
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()
# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=0.5e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    use_amp=use_amp)

# 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
# and evaluates it from time to time.
# Also create an EarlyStopping instance and pass it on to the trainer.
# An early stopping instance can be used to save the model that performs best on the dev set
# according to some metric and stop training when no improvement is happening for some iterations.
earlystopping = EarlyStopping(
    metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
    # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
    # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
    save_dir="saved_models/bert-german-doc-tutorial-es",  # where to save the best model
    patience=5  # number of evaluations to wait for improvement before terminating the training
)

trainer = Trainer(
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    evaluate_every=evaluate_every,
    device=device,
    early_stopping=earlystopping)

# 7. Let it grow
def run_experiment(args):
    logger.info("\n***********************************************"
                f"\n************* Experiment: {args.task.name} ************"
                "\n************************************************")
    ml_logger = MlLogger(tracking_uri=args.logging.mlflow_url)
    ml_logger.init_experiment(
        experiment_name=args.logging.mlflow_experiment,
        run_name=args.logging.mlflow_run_name,
        nested=args.logging.mlflow_nested,
    )

    validate_args(args)
    distributed = bool(args.general.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(
        use_cuda=args.general.cuda,
        local_rank=args.general.local_rank,
        use_amp=args.general.use_amp,
    )

    args.parameter.batch_size = int(args.parameter.batch_size //
                                    args.parameter.gradient_accumulation_steps)
    set_all_seeds(args.general.seed)

    # Prepare Data
    tokenizer = Tokenizer.load(args.parameter.model, do_lower_case=args.parameter.lower_case)
    processor = Processor.load(
        tokenizer=tokenizer,
        max_seq_len=args.parameter.max_seq_len,
        data_dir=Path(args.general.data_dir),
        **args.task.toDict(),  # args is of type DotMap and needs conversion to std python dicts
    )
    data_silo = DataSilo(
        processor=processor,
        batch_size=args.parameter.batch_size,
        distributed=distributed,
    )

    class_weights = None
    if args.parameter.balance_classes:
        task_names = list(processor.tasks.keys())
        if len(task_names) > 1:
            raise NotImplementedError(
                f"Balancing classes is currently not supported for multitask experiments. Got tasks: {task_names} "
            )
        class_weights = data_silo.calculate_class_weights(task_name=task_names[0])

    model = get_adaptive_model(
        lm_output_type=args.parameter.lm_output_type,
        prediction_heads=args.parameter.prediction_head,
        layer_dims=args.parameter.layer_dims,
        model=args.parameter.model,
        device=device,
        class_weights=class_weights,
        embeds_dropout_prob=args.parameter.embeds_dropout_prob,
    )

    # Init optimizer
    optimizer_opts = args.optimizer.optimizer_opts.toDict() if args.optimizer.optimizer_opts else None
    schedule_opts = args.optimizer.schedule_opts.toDict() if args.optimizer.schedule_opts else None
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=args.optimizer.learning_rate,
        schedule_opts=schedule_opts,
        optimizer_opts=optimizer_opts,
        use_amp=args.general.use_amp,
        n_batches=len(data_silo.loaders["train"]),
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        n_epochs=args.parameter.epochs,
        device=device)

    model_name = f"{model.language_model.name}-{model.language_model.language}-{args.task.name}"

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if "early_stopping" in args:
        early_stopping = EarlyStopping(
            metric=args.task.metric,
            mode=args.early_stopping.mode,
            save_dir=Path(f"{args.general.output_dir}/{model_name}_early_stopping"),  # where to save the best model
            patience=args.early_stopping.patience  # number of evaluations to wait for improvement before terminating the training
        )
    else:
        early_stopping = None

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.parameter.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        use_amp=args.general.use_amp,
        local_rank=args.general.local_rank,
        lr_schedule=lr_schedule,
        evaluate_every=args.logging.eval_every,
        device=device,
        early_stopping=early_stopping)

    model = trainer.train()

    processor.save(Path(f"{args.general.output_dir}/{model_name}"))
    model.save(Path(f"{args.general.output_dir}/{model_name}"))
    ml_logger.end_run()
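# Sketch of the nested configuration run_experiment() expects: args is a DotMap whose field names
# are taken from the attribute accesses above. Every concrete value here is an assumption added
# for illustration; the real experiment configs may contain additional fields (e.g. everything
# consumed via args.task.toDict() by the Processor).
from dotmap import DotMap

args = DotMap({
    "general": {"cuda": True, "use_amp": None, "local_rank": -1, "seed": 42,
                "data_dir": "data", "output_dir": "saved_models"},
    "logging": {"mlflow_url": "logs", "mlflow_experiment": "my_experiment",
                "mlflow_run_name": "run_0", "mlflow_nested": False, "eval_every": 100},
    "task": {"name": "germeval18_coarse", "metric": "f1_macro"},
    "parameter": {"model": "bert-base-german-cased", "lower_case": False, "max_seq_len": 128,
                  "batch_size": 32, "gradient_accumulation_steps": 1, "balance_classes": True,
                  "lm_output_type": "per_sequence", "prediction_head": "TextClassificationHead",
                  "layer_dims": [768, 2], "embeds_dropout_prob": 0.1, "epochs": 3},
    "optimizer": {"learning_rate": 3e-5, "optimizer_opts": None, "schedule_opts": None},
    "early_stopping": {"mode": "max", "patience": 5},
})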
def outcome_pretraining(task_config, model_name, cache_dir, run_name="0", lr=1e-05,
                        warmup_steps=5000, embeds_dropout=0.1,
                        epochs=200,  # large because we use early stopping by default
                        batch_size=20, grad_acc_steps=1,
                        early_stopping_metric="loss", early_stopping_mode="min",
                        early_stopping_patience=10,
                        model_class="Bert", tokenizer_class="BertTokenizer", do_lower_case=True,
                        do_train=True, do_eval=True, do_hpo=False,
                        max_seq_len=512, seed=11, eval_every=500,
                        use_amp=False, use_cuda=True):
    # Load task config
    task_config = yaml.safe_load(open(task_config))
    data_dir = Path(task_config["data"]["data_dir"])

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name,
                               tokenizer_class=tokenizer_class,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = OutcomePretrainingProcessor(tokenizer=tokenizer,
                                            max_seq_len=max_seq_len,
                                            data_dir=data_dir,
                                            train_filename=task_config["data"]["train_filename"],
                                            dev_filename=task_config["data"]["dev_filename"],
                                            seed=seed,
                                            max_size_admission=50,
                                            max_size_discharge=50,
                                            cache_dir=cache_dir)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = OutcomePretrainingDataSilo(processor=processor,
                                           caching=True,
                                           cache_dir=cache_dir,
                                           batch_size=batch_size,
                                           max_multiprocessing_chunksize=200)

    if do_train:
        # Set save dir for experiment output
        save_dir = Path(task_config["output_dir"]) / f'{task_config["experiment_name"]}_{run_name}'

        # Use HPO config args if config is passed
        if do_hpo:
            save_dir = save_dir / tune.session.get_trial_name()
        else:
            exp_name = f"exp_{random.randint(100000, 999999)}"
            save_dir = save_dir / exp_name

        # Create save dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(experiment_name=task_config["experiment_name"],
                                  run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name, language_model_class=model_class)

        # b) and a NextSentenceHead prediction head, or a TextClassificationHead if it's not a Bert model
        if model_class == "Bert":
            next_sentence_head = NextSentenceHead.load(model_class)
        else:
            next_sentence_head = TextClassificationHead(num_labels=2)

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[next_sentence_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=["per_sequence"],
            device=device,
        )

        # 5. Create an optimizer
        schedule_opts = {"name": "LinearWarmup", "num_warmup_steps": warmup_steps}
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=save_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant
        # and evaluates it from time to time
        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(save_dir / "final_model")
        processor.save(save_dir / "final_model")

    if do_eval:
        # Load newly trained model or existing model
        if do_train:
            model_dir = save_dir
        else:
            model_dir = Path(model_name)

        logger.info("###### Eval on TEST SET #####")
        evaluator_test = Evaluator(data_loader=data_silo.get_data_loader("test"),
                                   tasks=data_silo.processor.tasks,
                                   device=device)

        # Load trained model for evaluation
        model = AdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")