def convert_from_transformers(cls, model_name_or_path, device, task_type):
    """
    Load a (downstream) model from huggingface's transformers format.

    Use cases:
        - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
        - compare models without switching frameworks
        - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                               Exemplary public names:
                               - distilbert-base-uncased-distilled-squad
                               - deepset/bert-large-uncased-whole-word-masking-squad2
                               See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param task_type: One of :
                      - 'question_answering'
                      - 'text_classification'
                      - 'embeddings'
                      More tasks coming soon ...
    :return: AdaptiveModel
    """
    language_model = LanguageModel.load(model_name_or_path)
    # TODO Infer type of head automatically from config
    # Registry mapping each supported task onto its prediction-head class
    # (None = no head) and the LM output granularity that head consumes.
    head_registry = {
        "question_answering": (QuestionAnsweringHead, "per_token"),
        "text_classification": (TextClassificationHead, "per_sequence"),
        "ner": (TokenClassificationHead, "per_token"),
        "embeddings": (None, ["per_token", "per_sequence"]),
    }
    if task_type not in head_registry:
        raise NotImplementedError(
            f"Huggingface's transformer models of type {task_type} are not supported yet"
        )
    head_cls, output_types = head_registry[task_type]
    prediction_heads = [] if head_cls is None else [head_cls.load(model_name_or_path)]
    return cls(language_model=language_model,
               prediction_heads=prediction_heads,
               embeds_dropout_prob=0.1,
               lm_output_types=output_types,
               device=device)
def convert_from_transformers(cls, model_name_or_path, device, task_type, processor=None):
    """
    Load a (downstream) model from huggingface's transformers format.

    Use cases:
        - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
        - compare models without switching frameworks
        - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                               Exemplary public names:
                               - distilbert-base-uncased-distilled-squad
                               - deepset/bert-large-uncased-whole-word-masking-squad2
                               See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param task_type: One of :
                      - 'question_answering'
                      - 'text_classification'
                      - 'embeddings'
                      More tasks coming soon ...
    :param processor: populates prediction head with information coming from tasks
    :type processor: Processor
    :return: AdaptiveModel
    :raises NotImplementedError: for unsupported task types, and for text
                                 classification with (XLM-)Roberta models.
    """
    lm = LanguageModel.load(model_name_or_path)
    # TODO Infer type of head automatically from config
    if task_type == "question_answering":
        ph = QuestionAnsweringHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm,
                             prediction_heads=[ph],
                             embeds_dropout_prob=0.1,
                             lm_output_types="per_token",
                             device=device)
    elif task_type == "text_classification":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationhead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error("Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment.")
            raise NotImplementedError
        ph = TextClassificationHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm,
                             prediction_heads=[ph],
                             embeds_dropout_prob=0.1,
                             lm_output_types="per_sequence",
                             device=device)
    elif task_type == "ner":
        ph = TokenClassificationHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm,
                             prediction_heads=[ph],
                             embeds_dropout_prob=0.1,
                             lm_output_types="per_token",
                             device=device)
    elif task_type == "embeddings":
        # No prediction head: expose both per-token and per-sequence LM outputs.
        adaptive_model = cls(language_model=lm,
                             prediction_heads=[],
                             embeds_dropout_prob=0.1,
                             lm_output_types=["per_token", "per_sequence"],
                             device=device)
    else:
        raise NotImplementedError(f"Huggingface's transformer models of type {task_type} are not supported yet")
    if processor:
        # Attach task metadata (label lists, metrics, ...) from the processor to the heads.
        adaptive_model.connect_heads_with_processor(processor.tasks)
    return adaptive_model
def test_multiple_prediction_heads():
    """A FARM model carrying two heads must convert into one transformers model per head."""
    base_model_name = "bert-base-german-cased"
    base_lm = LanguageModel.load(base_model_name)
    doc_head = TextClassificationHead(num_labels=3,
                                      label_list=["negative", "neutral", "positive"])
    token_head = TokenClassificationHead(num_labels=3,
                                         label_list=["PER", "LOC", "ORG"])
    farm_model = AdaptiveModel(language_model=base_lm,
                               prediction_heads=[doc_head, token_head],
                               embeds_dropout_prob=0.1,
                               lm_output_types="per_token",
                               device="cpu")
    converted = Converter.convert_to_transformers(farm_model)
    # The converter must yield the matching transformers class for each head, in order.
    assert isinstance(converted[0], BertForSequenceClassification)
    assert isinstance(converted[1], BertForTokenClassification)
    # Drop the heavyweight model objects explicitly.
    del base_lm
    del converted
    del farm_model
processor = NERProcessor(tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de", metric="seq_f1", label_list=ner_labels) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size) # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(lang_model) # b) and a prediction head on top that is suited for our task => NER prediction_head = TokenClassificationHead( task_name="ner", layer_dims=[768, len(processor.tasks["ner"]["label_list"])]) model = AdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_token"], device=device, ) # 5. Create an optimizer optimizer, warmup_linear = initialize_optimizer( model=model, learning_rate=2e-5, warmup_proportion=0.1,
def ner():
    """Tutorial entry point: fine-tune a German BERT for CoNLL03-de NER, save it, run a demo inference."""
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    # Track the run on deepset's public MLflow server.
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_ner")
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir=Path("../data/conll03-de"),
                             delimiter=" ",
                             metric="seq_f1",
                             label_list=ner_labels)
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    # 5. Create an optimizer (also returns the model, possibly wrapped for the device)
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant
    #    and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow
    trainer.train()
    # 8. Hooray! You have a model. Store it:
    save_dir = "saved_models/bert-german-ner-tutorial"
    model.save(save_dir)
    processor.save(save_dir)
    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def test_ner(caplog):
    """End-to-end NER test on tiny sample data using the legacy FARM API.

    NOTE(review): uses the old interfaces (``Bert.load``, ``GermEval14Processor``,
    ``Inferencer(save_dir)``, ``run_inference``) and pins an exact probability,
    which is brittle across library versions — confirm against the pinned FARM release.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    # Tiny hyperparameters keep the test fast on CPU.
    n_epochs = 1
    batch_size = 8
    evaluate_every = 50
    lang_model = "bert-base-german-cased"
    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)
    processor = GermEval14Processor(tokenizer=tokenizer,
                                    max_seq_len=64,
                                    data_dir="samples/ner",
                                    train_file="train-sample.txt",
                                    dev_file="dev-sample.txt",
                                    test_file=None)
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = Bert.load(lang_model)
    # Head dims: BERT-base hidden size (768) -> number of NER labels from the processor.
    prediction_head = TokenClassificationHead(
        layer_dims=[768, len(processor.label_list)])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_examples=data_silo.n_samples("train"),
        batch_size=batch_size,
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    save_dir = "testsave/ner"
    model = trainer.train(model)
    model.save(save_dir)
    processor.save(save_dir)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
    ]
    # Reload via the inference API and check one prediction.
    model = Inferencer(save_dir)
    result = model.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "Tagesspiegel,"
    assert abs(result[0]["predictions"][0]["probability"] - 0.213869) <= 0.0001
def test_ner(caplog):
    """Train a tiny NER model for one epoch, persist it, reload it and check one prediction."""
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    # Minimal hyperparameters: the point is the round-trip, not model quality.
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"

    bert_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    label_set = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                 "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    ner_processor = NERProcessor(
        tokenizer=bert_tokenizer,
        max_seq_len=8,
        data_dir="samples/ner",
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=label_set,
        metric="seq_f1"
    )
    silo = DataSilo(processor=ner_processor, batch_size=batch_size)

    base_lm = LanguageModel.load(lang_model)
    token_head = TokenClassificationHead(layer_dims=[768, len(label_set)])
    adaptive = AdaptiveModel(
        language_model=base_lm,
        prediction_heads=[token_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    optimizer, warmup_linear = initialize_optimizer(
        model=adaptive,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(silo.loaders["train"]),
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    save_dir = "testsave/ner"
    trained = trainer.train(adaptive)
    trained.save(save_dir)
    ner_processor.save(save_dir)

    # Reload through the inference API and verify a single prediction.
    sample_dicts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
    ]
    inferencer = Inferencer.load(save_dir)
    result = inferencer.inference_from_dicts(dicts=sample_dicts)
    assert result[0]["predictions"][0]["context"] == "sagte"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def execML(self, job):
    """Execute a machine-learning job: either run inference ('analyse') or fine-tune a NER model ('train').

    :param job: job descriptor; reads job.task, job.model, job.data_sample /
                job.data_source and job.id. Side effects: downloads artifacts into
                tmp/, updates the job status, persists results/models.
    :return: dict with keys 'status', 'code', 'msg' (always success; errors propagate as exceptions)
    """
    start_time = time.time()
    if job.task == 'analyse':
        basic_texts = []
        # Will download and store dataset...
        sample = self.downloadAndConvertText(job, job.data_sample)
        # One inference dict per line of the sample text.
        for text in sample.encode('utf-8').splitlines():
            basic_texts.append({'text': text.decode('utf-8')})
        # Will download and store model...
        self.downloadAndStoreZIPModel(job, job.model)
        self.updateJobStatus(job, 'analysing')
        save_dir = 'tmp/' + job.model['id']
        model = Inferencer.load(save_dir)
        result = model.inference_from_dicts(dicts=basic_texts)
        self.persistResult(job, result)
        # Release the inferencer's worker pool before finishing the job.
        model.close_multiprocessing_pool()
        self.updateJobStatus(job, 'completed')
    elif job.task == 'train':
        self.updateJobStatus(job, 'training')
        # Will download and store dataset...
        self.downloadAndStoreZIPDataset(job, job.data_source)
        # Will download and store model...
        self.downloadAndStoreZIPModel(job, job.model)
        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=True)
        n_epochs = 4
        evaluate_every = 400
        do_lower_case = False
        batch_size = 32
        lang_model = os.path.join(Path.cwd(), 'tmp', job.model['id'])
        ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                      "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
        # 1. Create a tokenizer (forced to BertTokenizer for the downloaded model)
        tokenizer = Tokenizer.load(
            pretrained_model_name_or_path=lang_model,
            do_lower_case=do_lower_case,
            tokenizer_class='BertTokenizer'
        )
        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        processor = NERProcessor(tokenizer=tokenizer,
                                 max_seq_len=128,
                                 data_dir=str(os.path.join(Path.cwd(), 'tmp', job.data_source['id'])),
                                 delimiter=' ',
                                 metric='seq_f1',
                                 label_list=ner_labels)
        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
        #    for them and calculates a few descriptive statistics of our datasets.
        #    max_processes=1 avoids multiprocessing inside the job runner.
        data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
        # 4. Create an AdaptiveModel
        # 4.1 which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # 4.2 and a prediction head on top that is suited for our task => NER
        prediction_head = TokenClassificationHead(num_labels=len(ner_labels))
        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=['per_token'],
            device=device,
        )
        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=1e-5,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device,
        )
        # 6. Feed everything to the Trainer, which keeps care of growing our model into
        #    powerful plant and evaluates it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )
        # 7. Let it grow
        trainer.train()
        # 8. Hooray! You have a model. Store it under a fresh id:
        newModelId = str(uuid.uuid4())
        save_dir = 'tmp/' + newModelId
        model.save(save_dir)
        processor.save(save_dir)
        model.close_multiprocessing_pool()
        self.persistZIPModel(newModelId, job)
        self.updateJobStatus(job, 'completed')
    elapsed_time = time.time() - start_time
    print('Execution time max: ', elapsed_time, 'for job.id:', job.id, flush=True)
    return {'status': True, 'code': 'ok', 'msg': 'success'}
def test_ner(caplog):
    """Train a tiny distilbert NER model, then run inference with a public hub model.

    The training part exercises the save path; the inference assertions run against
    ``dbmdz/bert-base-cased-finetuned-conll03-english`` loaded from the model hub,
    not against the freshly trained model.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    # NOTE: the optimizer/schedule are sized for 1 epoch even though the Trainer
    # below runs n_epochs=3 — presumably intentional for this smoke test; verify.
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'LinearWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)
    basic_texts = [
        {"text": "Paris is a town in France."},
    ]
    model = Inferencer.load(
        model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english",
        num_processes=0,
        task_type="ner")
    # Labels aren't correctly inserted from transformers: they come through as
    # LABEL_1 ... LABEL_N. For the inference result to contain predictions we
    # need them in IOB NER format, so patch the last label to "B-LOC".
    model.processor.tasks["ner"]["label_list"][-1] = "B-LOC"
    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "Paris"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
# Multitask setup: one shared language model with a document-level classification
# head and a token-level (NER-style) head, trained jointly.
from farm.evaluation.metrics import register_metrics

# Register a custom weighted-F1 metric under the name used by the token-level task.
register_metrics('f1_weighted', custom_f1_score)
metric = 'f1_weighted'
# Two tasks reading from the same text column but different label columns.
processor.add_task(name="document_level_task",
                   label_list=LABEL_LIST,
                   metric="acc",
                   text_column_name="text",
                   label_column_name="label",
                   task_type="classification")
processor.add_task(name="token_level_task",
                   label_list=TRIGGER_LABELS,
                   metric=metric,
                   text_column_name="text",
                   label_column_name="tokens",
                   task_type="ner")
data_silo = DataSilo(processor=processor,
                     batch_size=BATCH_SIZE)
language_model = LanguageModel.load(LANG_MODEL)
# One prediction head per task; task_name ties each head to its processor task.
document_level_task_head = TextClassificationHead(num_labels=len(LABEL_LIST),
                                                  task_name="document_level_task")
token_level_task_head = TokenClassificationHead(num_labels=len(TRIGGER_LABELS),
                                                task_name="token_level_task")
# lm_output_types is aligned with the heads: per_sequence for the document head,
# per_token for the token head. Losses are combined by my_loss_agg.
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[document_level_task_head, token_level_task_head],
    embeds_dropout_prob=EMBEDS_DROPOUT_PROB,
    lm_output_types=["per_sequence", "per_token"],
    device=DEVICE,
    loss_aggregation_fn=my_loss_agg)
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    device=DEVICE,
    learning_rate=LEARNING_RATE,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=N_EPOCHS)
def ner(self, task, model_type, n_epochs, batch_size, evaluate_every, use_cude):
    """
    Fine-tune a FARM NER model for a registered task and store it in the task's model dir.

    :param task: task id; must be registered with type 'ner' in cu.tasks
    :param model_type: key used to resolve the pretrained language model via he.get_farm_model
    :param n_epochs: number of training epochs
    :param batch_size: training batch size
    :param evaluate_every: evaluate on the dev set every N training batches
    :param use_cude: whether to train on GPU (parameter name kept as-is for
                     backward compatibility with existing callers)
    :raises Exception: if the referenced task is not a NER task
    """
    aml_run = he.get_context()
    # Check task
    if cu.tasks.get(str(task)).get('type') != 'ner':
        raise Exception('NOT A NER TASK')
    language = cu.params.get('language')
    # Data
    dt_task = dt.Data(task=task)
    set_all_seeds(seed=42)
    # BUGFIX: use_cude was previously ignored and use_cuda hard-coded to True.
    device, n_gpu = initialize_device_settings(use_cuda=use_cude)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    # ner_labels = dt_task.load('fn_label', header=None)[0].to_list()
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    # AML log — best effort only: never fail training because telemetry is unavailable.
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('lang_model', lang_model)
        # BUGFIX: this previously referenced an undefined name `label_list`; the
        # NameError was silently swallowed by the bare except, so the label list
        # was never logged.
        aml_run.log_list('label_list', ner_labels)
    except Exception:
        pass
    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir=dt_task.data_dir,
                             metric="seq_f1",
                             label_list=ner_labels)
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
    #    for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    # 6. Feed everything to the Trainer, which keeps care of growing our model into
    #    powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow
    trainer.train()
    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
def convert_from_transformers(model_name_or_path, device, revision=None, task_type=None, processor=None, **kwargs):
    """
    Load a (downstream) model from huggingface's transformers format.

    Use cases:
        - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
        - compare models without switching frameworks
        - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                               Exemplary public names:
                               - distilbert-base-uncased-distilled-squad
                               - deepset/bert-large-uncased-whole-word-masking-squad2
                               See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :type revision: str
    :param task_type: One of :
                      - 'lm'
                      - 'question_answering'
                      - 'regression'
                      - 'text_classification'
                      - 'ner'
                      - 'embeddings'
                      If None, the task type is inferred from the model config's architecture.
    :param processor: populates prediction head with information coming from tasks
    :type processor: Processor
    :return: AdaptiveModel
    :raises NotImplementedError: if the task type is unsupported or could not be inferred,
                                 and for regression/text classification with (XLM-)Roberta models.
    """
    lm = LanguageModel.load(model_name_or_path, revision=revision, **kwargs)
    if task_type is None:
        # Infer task type from config
        architecture = lm.model.config.architectures[0]
        if "MaskedLM" in architecture:
            task_type = "lm"
        elif "QuestionAnswering" in architecture:
            task_type = "question_answering"
        elif "SequenceClassification" in architecture:
            # A single output neuron is treated as regression, otherwise classification.
            if lm.model.config.num_labels == 1:
                task_type = "regression"
            else:
                task_type = "text_classification"
        elif "TokenClassification" in architecture:
            task_type = "ner"
        else:
            logger.error("Could not infer task type from model config. Please provide task type manually. "
                         "('lm', 'question_answering', 'regression', 'text_classification', 'ner' or 'embeddings')")

    if task_type == "lm":
        ph = BertLMHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_token",
                                          device=device)
    elif task_type == "question_answering":
        ph = QuestionAnsweringHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_token",
                                          device=device)
    elif task_type == "regression":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error("Conversion for Regression with Roberta or XLMRoberta not possible at the moment.")
            raise NotImplementedError
        # NOTE(review): unlike the other heads, no revision is forwarded here — confirm
        # whether RegressionHead.load should also receive revision=revision.
        ph = RegressionHead.load(model_name_or_path, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence",
                                          device=device)
    elif task_type == "text_classification":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error("Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment.")
            raise NotImplementedError
        ph = TextClassificationHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence",
                                          device=device)
    elif task_type == "ner":
        ph = TokenClassificationHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_token",
                                          device=device)
    elif task_type == "embeddings":
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types=["per_token", "per_sequence"],
                                          device=device)
    else:
        # BUGFIX: previously an unsupported (or uninferable) task_type fell through every
        # branch and raised UnboundLocalError on `adaptive_model`. Fail explicitly instead.
        raise NotImplementedError(
            f"Huggingface's transformer models of type {task_type} are not supported yet"
        )

    if processor:
        adaptive_model.connect_heads_with_processor(processor.tasks)
    return adaptive_model
def ner(task: str, lm: str) -> None:
    """Fine-tune a (historical) German BERT for speech-representation NER and save it.

    :param task: which label subset to train on: 'direct', 'indirect', 'reported',
                 or anything else (e.g. 'all') for the combined label set
    :param lm: language-model selector: 'bert-hgcrw', 'lmgot01', 'lmgot02',
               or anything else for "bert-base-german-cased"

    NOTE(review): relies on module-level names `use_cuda`, `MODEL_DIR` and
    `DATA_DIR` being defined elsewhere in the file — confirm they exist.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=use_cuda)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
    n_epochs = 10
    batch_size = 32
    evaluate_every = 1000
    # Resolve the language model and derive a matching model output directory suffix.
    model_dir = MODEL_DIR
    if lm == 'bert-hgcrw':
        lang_model = "redewiedergabe/bert-base-historical-german-rw-cased"
        model_dir += '_bert-hgcrw'
    elif lm == 'lmgot01':
        lang_model = Path("/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_01")
        model_dir += '_lmgot01'
    elif lm == 'lmgot02':
        lang_model = Path("/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_02")
        model_dir += '_lmgot02'
    else:
        lang_model = "bert-base-german-cased"
    if task != 'all':
        model_dir += '_' + task
    do_lower_case = False
    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    if task == 'direct':
        ner_labels = ["[PAD]", "X", "O", "B-DIR", "I-DIR"]
    elif task == 'indirect':
        ner_labels = ["[PAD]", "X", "O", "B-IND", "I-IND"]
    elif task == 'reported':
        ner_labels = ["[PAD]", "X", "O", "B-REP", "I-REP"]
    else:
        # Combined label set covering all three speech-representation types.
        ner_labels = ["[PAD]", "X", "O", "B-DIR", "I-DIR", "B-IND", "I-IND", "B-REP", "I-REP"]
    data_dir = DATA_DIR
    if task != 'all':
        data_dir += task + '/'
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=64,
                             data_dir=Path(data_dir),
                             delimiter="\t",
                             metric="seq_f1",
                             label_list=ner_labels)
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
    #    for them and calculates a few descriptive statistics of our datasets
    data_loader_worker = 1
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=data_loader_worker)
    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    # 6. Feed everything to the Trainer, which keeps care of growing our model into
    #    powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow
    trainer.train()
    # 8. Hooray! You have a model. Store it:
    save_dir = model_dir
    model.save(save_dir)
    processor.save(save_dir)
def test_ner_amp(caplog):
    """Train a tiny NER model on GPU with automatic mixed precision (when apex/AMP is available),
    then reload it and verify a single prediction.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    # Minimal hyperparameters: this checks the AMP training path, not model quality.
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    # Use AMP opt level O1 only if the apex AMP backend is importable.
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)
    basic_texts = [
        {"text": "1980 kam der Crown von Toyota"},
    ]
    # Reload on GPU via the inference API and check one prediction.
    model = Inferencer.load(save_dir, gpu=True)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    assert result[0]["predictions"][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def test_ner(caplog):
    """Train a tiny distilbert NER model, reload it, and check that inference with
    and without the REST API schema yields identical results."""
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    # Minimal hyperparameters: the point is the save/load/inference round-trip.
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    bert_tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                                    do_lower_case=False)
    label_set = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                 "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    ner_processor = NERProcessor(tokenizer=bert_tokenizer,
                                 max_seq_len=8,
                                 data_dir=Path("samples/ner"),
                                 train_filename="train-sample.txt",
                                 dev_filename="dev-sample.txt",
                                 test_filename=None,
                                 delimiter=" ",
                                 label_list=label_set,
                                 metric="seq_f1")
    silo = DataSilo(processor=ner_processor, batch_size=batch_size, max_processes=1)

    base_lm = LanguageModel.load(lang_model)
    token_head = TokenClassificationHead(num_labels=13)
    adaptive = AdaptiveModel(
        language_model=base_lm,
        prediction_heads=[token_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    adaptive, optimizer, lr_schedule = initialize_optimizer(
        model=adaptive,
        learning_rate=2e-5,
        n_batches=len(silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'LinearWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=adaptive,
        optimizer=optimizer,
        data_silo=silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    save_dir = Path("testsave/ner")
    trained = trainer.train()
    trained.save(save_dir)
    ner_processor.save(save_dir)

    sample_dicts = [
        {"text": "Albrecht Lehman ist eine Person"},
    ]
    inferencer = Inferencer.load(save_dir)
    plain_result = inferencer.inference_from_dicts(dicts=sample_dicts, max_processes=1)
    # The REST API schema flag must not change the prediction payload.
    schema_result = inferencer.inference_from_dicts(dicts=sample_dicts, rest_api_schema=True)
    assert plain_result == schema_result
tokenizer = BertTokenizer.from_pretrained( pretrained_model_name_or_path=lang_model, do_lower_case=False) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset processor = NERProcessor(tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de") # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size) # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = Bert.load(lang_model) # b) and a prediction head on top that is suited for our task => NER prediction_head = TokenClassificationHead( layer_dims=[768, len(processor.label_list)]) model = AdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_token"], device=device, ) # 5. Create an optimizer optimizer, warmup_linear = initialize_optimizer( model=model, learning_rate=2e-5, warmup_proportion=0.1, n_batches=len(data_silo.loaders["train"]),
def ner():
    """Fine-tune an English BERT ("bert-base-cased") for NER on data in DATA_DIR and save to MODEL_DIR.

    NOTE(review): relies on module-level names `DATA_DIR` and `MODEL_DIR` being
    defined elsewhere in the file — confirm they exist.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=True)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-cased"
    do_lower_case = False
    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path(DATA_DIR),
        delimiter=" ",
        metric="seq_f1",
        label_list=ner_labels
    )
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
    #    for them and calculates a few descriptive statistics of our datasets
    data_loader_worker = 15
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=data_loader_worker)
    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    # 6. Feed everything to the Trainer, which keeps care of growing our model into
    #    powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow
    trainer.train()
    # 8. Hooray! You have a model. Store it:
    save_dir = MODEL_DIR
    model.save(save_dir)
    processor.save(save_dir)