def test_doc_classification():
    # caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "roberta-base"

    tokenizer = RobertaTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = Roberta.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts=None)

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class_roberta"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."}
    ]

    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
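
# A minimal sketch of the kind of train-sample.tsv the processor above expects. It assumes the
# default tab-separated layout with a header row and a text column named "text" (the label column
# is set explicitly via label_column_name="coarse_label"); the rows below are illustrative only,
# not the actual sample data shipped with the test.
from pathlib import Path

sample_rows = [
    "text\tcoarse_label",
    "Martin Müller spielt Handball in Berlin.\tOTHER",
    "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei.\tOFFENSE",
]
Path("samples/doc_class").mkdir(parents=True, exist_ok=True)
Path("samples/doc_class/train-sample.tsv").write_text("\n".join(sample_rows) + "\n", encoding="utf-8")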

def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout,
                       evaluate_every, use_cuda, max_seq_len, learning_rate,
                       do_lower_case, register_model, save_model=True,
                       early_stopping=False):
    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'multi_classification':
        raise Exception('NOT A MULTI CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir', header=None)[0].to_list()

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except Exception:
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict containing the desired metric
    # values. The function must be registered under a string name, and that name is then passed
    # to the processor as the metric.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log
        try:
            aml_run.log('acc', acc.get('acc'))
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except Exception:
            pass
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}
    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=dt_task.data_dir,
        label_list=label_list,
        label_column_name="label",
        metric=metric,
        quote_char='"',
        multilabel=True,
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'),
        dev_split=0.3)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = MultiLabelTextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]))
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=embeds_dropout,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)
    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    # Optionally, an EarlyStopping instance can be created and passed on to the trainer
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has already been stored in the directory
    # defined by the EarlyStopping instance. The model available at this point is the one from the
    # last training epoch carried out before early stopping terminated the training.
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

    if register_model:
        dt_task.upload('model_dir', destination='model')
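
# A minimal usage sketch for doc_classification(); every argument value below is a hypothetical
# example (the real task ids and hyperparameters come from the project's own configuration):
doc_classification(task='1',               # hypothetical task id
                   model_type='roberta',   # resolved to a concrete FARM model via he.get_farm_model
                   n_epochs=3,
                   batch_size=32,
                   embeds_dropout=0.1,
                   evaluate_every=100,
                   use_cuda=True,
                   max_seq_len=128,
                   learning_rate=3e-5,
                   do_lower_case=False,
                   register_model=False,
                   save_model=True)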

def doc_classification_multilabel_roberta():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 500
    lang_model = "roberta-base"
    do_lower_case = False  # RoBERTa is a cased model

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    # Here we load the Toxic Comments data.
    label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    metric = "acc"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("../data/toxic-comments"),
                                            label_list=label_list,
                                            label_column_name="label",
                                            metric=metric,
                                            quote_char='"',
                                            multilabel=True,
                                            train_filename=Path("train.tsv"),
                                            dev_filename=Path("val.tsv"),
                                            test_filename=None,
                                            dev_split=0,
                                            max_samples=1000)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful
    # plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-multi-doc-roberta")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "You f*****g bastards"},
        {"text": "What a lovely world"},
    ]
    model = Inferencer.load(save_dir)
    result = model.run_inference(dicts=basic_texts)
    print(result)
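
# Minimal runner sketch for the example above; it assumes the Toxic Comments TSV files (train.tsv,
# val.tsv) are already present under ../data/toxic-comments, as configured in the processor, and
# that logging plus the public MLflow tracking URI are acceptable defaults for a local run.
if __name__ == "__main__":
    doc_classification_multilabel_roberta()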
                                            label_list=label_list,
                                            label_column_name="label",
                                            metric=metric,
                                            quote_char='"',
                                            multilabel=True,
                                            train_filename="train.tsv",
                                            dev_filename="val.tsv",
                                            test_filename=None,
                                            dev_split=0)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = MultiLabelTextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
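
# Note: the fragment above (truncated here) uses an older FARM API in which initialize_optimizer
# returned (optimizer, warmup_linear) and the prediction head was sized via layer_dims=[768, num_labels].
# The other examples in this file use the newer call, which returns (model, optimizer, lr_schedule)
# and sizes the head via num_labels=... . A sketch of the newer call, mirroring the arguments used
# earlier in this file (not a drop-in completion of the truncated call above):
#
#     model, optimizer, lr_schedule = initialize_optimizer(
#         model=model,
#         learning_rate=3e-5,
#         device=device,
#         n_batches=len(data_silo.loaders["train"]),
#         n_epochs=n_epochs)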