def import_downstream_models():
    """Convert a SQuAD-finetuned BERT checkpoint into a saved FARM AdaptiveModel.

    The language model and the question answering head are both loaded from
    the same pretrained model name, combined into an AdaptiveModel and saved.
    A SquadProcessor built on the matching tokenizer is then saved into the
    same directory so the saved model can be used in inference mode.
    """
    hf_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
    output_dir = "saved_models/FARM-bert-large-uncased-whole-word-masking-finetuned-squad"

    # The second return value (GPU count) is not needed in this function.
    device, _ = initialize_device_settings(use_cuda=True)

    # Language model first, then the QA head, both from the same checkpoint.
    adaptive_model = AdaptiveModel(
        language_model=Bert.load(hf_model_name),
        prediction_heads=[QuestionAnsweringHead.load(hf_model_name)],
        embeds_dropout_prob=0.1,
        lm_output_types="per_token",
        device=device,
    )
    adaptive_model.save(output_dir)

    # Store the processor next to the model so it is available for inference.
    # TODO load HF's tokenizer_config.json and adjust settings
    squad_processor = SquadProcessor(
        tokenizer=BertTokenizer.from_pretrained(pretrained_model_name_or_path=hf_model_name),
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        data_dir="../data/squad20",
    )
    squad_processor.save(output_dir)
def test_qa(caplog):
    """Train a bert-base-cased QA model for one epoch on the sample files and save it.

    Runs the whole pipeline (tokenizer, SquadProcessor, DataSilo, AdaptiveModel,
    optimizer, Trainer) and finally writes model and processor to "testsave/qa".

    :param caplog: pytest log-capture fixture; its level is set to CRITICAL.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lm_name = "bert-base-cased"
    epochs = 1
    labels = ["start_token", "end_token"]

    # Very short sequences keep the sample run small.
    processor_kwargs = dict(
        tokenizer=Tokenizer.load(pretrained_model_name_or_path=lm_name, do_lower_case=False),
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        label_list=labels,
        metric="squad",
    )
    processor = SquadProcessor(**processor_kwargs)
    data_silo = DataSilo(processor=processor, batch_size=2)

    qa_model = AdaptiveModel(
        language_model=LanguageModel.load(lm_name),
        # One output unit per label (start / end token).
        prediction_heads=[QuestionAnsweringHead(layer_dims=[768, len(labels)])],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    qa_model, optimizer, lr_schedule = initialize_optimizer(
        model=qa_model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epochs,
        device=device,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=4,
        device=device,
    )
    # This Trainer variant receives the model in train() and returns it.
    qa_model = trainer.train(qa_model)

    target_dir = "testsave/qa"
    qa_model.save(target_dir)
    processor.save(target_dir)
def distilbert_nq(caplog=None):
    """Train a small distilbert Natural Questions model on the sample data.

    The model has two prediction heads: a QuestionAnsweringHead and a
    TextClassificationHead sized by ``processor.answer_type_list``.

    :param caplog: optional pytest log-capture fixture; when given, its level
                   is set to CRITICAL.
    :return: tuple ``(model, processor)`` after one epoch of training.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    lm_name = "distilbert-base-uncased"
    epochs = 1

    processor = NaturalQuestionsProcessor(
        tokenizer=Tokenizer.load(pretrained_model_name_or_path=lm_name, do_lower_case=True),
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train_sample.jsonl",
        dev_filename="dev_sample.jsonl",
        data_dir=Path("samples/nq"),
    )
    data_silo = DataSilo(processor=processor, batch_size=2, max_processes=1)

    encoder = LanguageModel.load(lm_name)
    span_head = QuestionAnsweringHead()
    answer_type_head = TextClassificationHead(num_labels=len(processor.answer_type_list))

    # Output types are listed in the same order as the heads.
    model = AdaptiveModel(
        language_model=encoder,
        prediction_heads=[span_head, answer_type_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer_kwargs = dict(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epochs,
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(**optimizer_kwargs)

    Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=4,
        device=device,
    ).train()

    return model, processor
def convert_from_transformers(cls, model_name_or_path, device, task_type):
    """
    Build an AdaptiveModel from a (downstream) model in huggingface's transformers format.

    Typical uses:
     - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
     - compare models without switching frameworks
     - use the model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one, e.g.
                               - distilbert-base-uncased-distilled-squad
                               - deepset/bert-large-uncased-whole-word-masking-squad2

                               See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param task_type: one of 'question_answering', 'text_classification', 'ner'
                      or 'embeddings'
    :return: AdaptiveModel
    :raises NotImplementedError: for any other ``task_type`` (raised after the
                                 language model has been loaded)
    """
    lm = LanguageModel.load(model_name_or_path)

    # TODO Infer type of head automatically from config
    # Pick the prediction heads and the language model output type per task;
    # the model itself is then built in exactly one place below.
    if task_type == "question_answering":
        heads = [QuestionAnsweringHead.load(model_name_or_path)]
        output_types = "per_token"
    elif task_type == "text_classification":
        heads = [TextClassificationHead.load(model_name_or_path)]
        output_types = "per_sequence"
    elif task_type == "ner":
        heads = [TokenClassificationHead.load(model_name_or_path)]
        output_types = "per_token"
    elif task_type == "embeddings":
        # No prediction head: only the language model outputs are used.
        heads = []
        output_types = ["per_token", "per_sequence"]
    else:
        raise NotImplementedError(
            f"Huggingface's transformer models of type {task_type} are not supported yet"
        )

    return cls(
        language_model=lm,
        prediction_heads=heads,
        embeds_dropout_prob=0.1,
        lm_output_types=output_types,
        device=device,
    )
def distilbert_squad(request):
    """Train a small distilbert SQuAD model on the sample data.

    :param request: accepted but not used inside this function
                    (presumably the pytest ``request`` fixture — TODO confirm).
    :return: tuple ``(model, processor)`` after one epoch of training.
    """
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    lm_name = "distilbert-base-uncased"
    epochs = 1

    fast_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lm_name,
        do_lower_case=True,
        use_fast=True,  # TODO parametrize this to test slow as well
    )
    processor = SquadProcessor(
        tokenizer=fast_tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir=Path("samples/qa"),
        label_list=["start_token", "end_token"],
        metric="squad",
    )
    data_silo = DataSilo(processor=processor, batch_size=2, max_processes=1)

    model = AdaptiveModel(
        language_model=LanguageModel.load(lm_name),
        prediction_heads=[QuestionAnsweringHead()],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=train_batches,
        n_epochs=epochs,
        device=device,
    )

    trainer_kwargs = dict(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=4,
        device=device,
    )
    Trainer(**trainer_kwargs).train()

    return model, processor
def convert_from_transformers(cls, model_name_or_path, device, task_type, processor=None):
    """
    Load a (downstream) model from huggingface's transformers format. Use cases:
     - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
     - compare models without switching frameworks
     - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                                      Exemplary public names:
                                      - distilbert-base-uncased-distilled-squad
                                      - deepset/bert-large-uncased-whole-word-masking-squad2

                                      See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param task_type: One of :
                      - 'question_answering'
                      - 'text_classification'
                      - 'ner'
                      - 'embeddings'
                      More tasks coming soon ...
    :param processor: populates prediction head with information coming from tasks
    :type processor: Processor
    :return: AdaptiveModel
    :raises NotImplementedError: if ``task_type`` is not one of the values above, or if
                                 text classification is requested for a model whose
                                 name/path contains "roberta".
    """
    supported_task_types = ("question_answering", "text_classification", "ner", "embeddings")

    # Reject unusable requests before the language model is loaded, so a wrong
    # task type does not cost a full model load first.
    if task_type not in supported_task_types:
        raise NotImplementedError(f"Huggingface's transformer models of type {task_type} are not supported yet")
    # str() keeps the substring check working for pathlib.Path inputs as well.
    if task_type == "text_classification" and "roberta" in str(model_name_or_path):
        # The RobertaClassificationhead has components: input2dense, dropout, tanh, dense2output
        # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
        message = "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
        logger.error(message)
        # Carry the explanation in the exception too, not only in the log.
        raise NotImplementedError(message)

    lm = LanguageModel.load(model_name_or_path)

    # TODO Infer type of head automatically from config
    if task_type == "question_answering":
        ph = QuestionAnsweringHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                             lm_output_types="per_token", device=device)
    elif task_type == "text_classification":
        ph = TextClassificationHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                             lm_output_types="per_sequence", device=device)
    elif task_type == "ner":
        ph = TokenClassificationHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                             lm_output_types="per_token", device=device)
    else:
        # Only "embeddings" is left after the validation above: no prediction head.
        adaptive_model = cls(language_model=lm, prediction_heads=[], embeds_dropout_prob=0.1,
                             lm_output_types=["per_token", "per_sequence"], device=device)

    if processor:
        adaptive_model.connect_heads_with_processor(processor.tasks)

    return adaptive_model
def train_evaluation_single(seed=42):
    """Train roberta-base on SQuAD 2.0 and compare the results with stored reference values.

    Trains for two epochs, saves model and processor, evaluates on the dev
    set and asserts that f1, EM, top-n recall and the elapsed training time
    stay within a relative tolerance of the reference ("gold") numbers.

    :param seed: seed passed to ``set_all_seeds``.
    :raises AssertionError: if any metric or the training time deviates from
                            its reference value by more than the tolerance.
    """
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 32 * 4  # 4x V100
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Only the training itself is timed, not data loading or evaluation.
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)
    results = evaluator.eval(model)
    # Metrics are scaled by 100 to match the scale of the gold values below.
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100
    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 77.714
    gold_tnrecall = 97.3721
    # Must be a live assignment: the last assertion below reads gold_elapsed.
    gold_elapsed = 1286.30

    np.testing.assert_allclose(
        f1_score, gold_f1, rtol=0.01,
        err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
    np.testing.assert_allclose(
        em_score, gold_EM, rtol=0.01,
        err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
    # Report the recall difference here (the message used to print the EM difference).
    np.testing.assert_allclose(
        tnrecall, gold_tnrecall, rtol=0.01,
        err_msg=f"FARM Training changed for top 1 recall by: {tnrecall - gold_tnrecall}")
    np.testing.assert_allclose(
        elapsed, gold_elapsed, rtol=0.1,
        err_msg=f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds")
def question_answering():
    """Fine-tune a SQuAD-pretrained RoBERTa model on Natural Questions and run a demo query.

    Steps: set up logging and MLflow tracking, build tokenizer / processor /
    data silo, train an AdaptiveModel with a QA head and an answer-type
    classification head, save it, then fetch an already trained model archive
    and print its answer to one example question.
    """
    logging.basicConfig(
        level=logging.INFO,
        datefmt="%m/%d/%Y %H:%M:%S",
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_natural_questions")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 1
    # start with a model that can already extract answers
    lang_model = "deepset/roberta-base-squad2"

    # 1.Create a tokenizer (do_lower_case=False: roberta is a cased model)
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart.
    # Every tag name contributes its opening and closing form, in that order.
    paired_tag_names = ("Th", "Td", "Tr", "Li", "P", "Ul", "H1", "H2", "H3", "H4", "H5")
    html_tags = [
        tag
        for tag_name in paired_tag_names
        for tag in (f"<{tag_name}>", f"</{tag_name}>")
    ]
    html_tags.append("<Td_colspan=")
    tokenizer.add_tokens(html_tags)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename="train_medium.jsonl",
        dev_filename="dev_medium.jsonl",
        keep_no_answer=0.15,  # downsample negative examples after data conversion
        downsample_context_size=300,  # reduce length of wikipedia articles to relevant part around the answer
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
    #    for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, caching=True)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis;
    #    it is told how many tokens were added to the tokenizer above
    language_model = LanguageModel.load(lang_model, n_added_tokens=len(html_tags))
    # b) and in case of Natural Questions we need two Prediction Heads
    #    one for extractive Question Answering
    span_head = QuestionAnsweringHead()
    #    another one for answering yes/no questions or deciding if the given text passage might contain an answer
    #    answer_type_list = ["is_impossible", "span", "yes", "no"]
    answer_type_head = TextClassificationHead(num_labels=len(processor.answer_type_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[span_head, answer_type_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    warmup_schedule = {"name": "LinearWarmup", "warmup_proportion": 0.2}
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts=warmup_schedule,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which keeps care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=500,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Since training on the whole NQ corpus requires substantial compute resources
    #    we trained and uploaded a model on s3
    fetch_archive_from_http(
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip",
        output_dir="../saved_models/farm",
    )

    question = "Did GameTrailers rated Twilight Princess as one of the best games ever created?"
    passage = "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    QA_input = [{"qas": [question], "context": passage}]

    # From here on `model` refers to the inferencer built from the fetched archive.
    model = QAInferencer.load(
        model_name_or_path="../saved_models/farm/roberta-base-squad2-nq",
        batch_size=batch_size,
        gpu=True,
    )
    result = model.inference_from_dicts(dicts=QA_input, return_json=False)  # result is a list of QAPred objects

    top_answer = result[0].prediction[0].answer
    print(f"\nQuestion: {question}"
          f"\nAnswer from model: {top_answer}")
    model.close_multiprocessing_pool()
def test_qa(caplog):
    """Train a tiny bert-base-cased QA model, save it, reload it and run one inference.

    The final assertion only checks that the first prediction's "end" field
    of the reloaded Inferencer is an int.

    :param caplog: pytest log-capture fixture; its level is set to CRITICAL.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    lm_name = "bert-base-cased"
    epochs = 1
    labels = ["start_token", "end_token"]

    bert_tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lm_name, do_lower_case=False
    )
    processor = SquadProcessor(
        tokenizer=bert_tokenizer,
        max_seq_len=16,
        max_query_length=4,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        labels=labels,
        metric="squad",
    )
    data_silo = DataSilo(processor=processor, batch_size=2)

    trained_model = AdaptiveModel(
        language_model=Bert.load(lm_name),
        # One output unit per label (start / end token).
        prediction_heads=[QuestionAnsweringHead(layer_dims=[768, len(labels)])],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # This initialize_optimizer variant returns (optimizer, warmup schedule).
    optimizer, warmup_linear = initialize_optimizer(
        model=trained_model,
        learning_rate=1e-5,
        warmup_proportion=0.2,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=4,
        device=device,
    )
    trained_model = trainer.train(trained_model)

    save_dir = "testsave/qa"
    trained_model.save(save_dir)
    processor.save(save_dir)

    question = "In what country is Normandy located?"
    passage = 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'
    QA_input = [{"questions": [question], "text": passage}]

    # Reload from disk so the saved artefacts themselves are exercised.
    inferencer = Inferencer.load(save_dir)
    result = inferencer.run_inference(dicts=QA_input)
    first_prediction = result[0]["predictions"][0]
    assert isinstance(first_prediction["end"], int)
def xlmr_qa_demo():
    """Train xlm-roberta-large on SQuAD 2.0 and optionally run inference on an MLQA file.

    Two local switches control the run: ``train`` (build, train and save the
    model) and ``inference`` (load the saved model, predict on
    ``inference_file`` and write two JSON prediction files into ``save_dir``).
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="run_xmlr_qa")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 3
    grad_acc_steps = 8
    n_epochs = 2
    evaluate_every = 200
    base_LM_model = "xlm-roberta-large"

    data_dir = Path("../data/squad20")
    train_filename = Path("train-v2.0.json")
    dev_filename = Path("dev-v2.0.json")

    save_dir = Path("../saved_models/xlmr-large-qa")

    inference_file = Path("../data/MLQA_V1/dev/dev-context-de-question-de.json")
    predictions_file = save_dir / "predictions.json"
    full_predictions_file = save_dir / "full_predictions.json"
    max_processes_for_inference = 8
    train = True
    inference = False

    if train:
        # 1.Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)

        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=384,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=None,
            data_dir=data_dir,
            dev_split=0.0,
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
        #    for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False, max_processes=1)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        # NOTE(review): n_added_tokens=3 is hard-coded; no tokens are added in this
        # function, so the reason for the value is not visible here — confirm.
        language_model = LanguageModel.load(base_LM_model, n_added_tokens=3)
        # b) and a prediction head on top that is suited for our task => Question Answering
        prediction_head = QuestionAnsweringHead()

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=3e-5,
            schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=device,
        )

        # 6. Feed everything to the Trainer, which keeps care of growing our model and evaluates it from time to time
        trainer = Trainer(
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
        model = trainer.train(model)

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)

    if inference:
        model = Inferencer.load(save_dir, batch_size=32, gpu=True)
        full_result = model.inference_from_file(
            file=inference_file,
            max_processes=max_processes_for_inference,
        )

        for x in full_result:
            print(x)
            print()

        # One file with only the first entry of the first prediction per id,
        # one file with all predictions per id.
        result = {r["id"]: r["preds"][0][0] for r in full_result}
        full_result = {r["id"]: r["preds"] for r in full_result}

        # Context managers close the files deterministically. UTF-8 is explicit
        # because ensure_ascii=False writes non-ASCII characters verbatim.
        with open(predictions_file, "w", encoding="utf-8") as predictions_out:
            json.dump(result, predictions_out, indent=4, ensure_ascii=False)
        with open(full_predictions_file, "w", encoding="utf-8") as full_predictions_out:
            json.dump(full_result, full_predictions_out, indent=4, ensure_ascii=False)
def test_qa(caplog=None):
    """Check that the Inferencer gives equal results for both QA input formats.

    A small distilbert model is trained on the sample data, saved and loaded
    into an Inferencer. The same question/passage pair is then submitted once
    with "questions"/"text" keys and once with "qas"/"context" keys.

    :param caplog: optional pytest log-capture fixture; when given, its level
                   is set to CRITICAL.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    lm_name = "distilbert-base-uncased"
    epochs = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lm_name, do_lower_case=True)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir=Path("samples/qa"),
        label_list=["start_token", "end_token"],
        metric="squad",
    )
    data_silo = DataSilo(processor=processor, batch_size=2, max_processes=1)

    model = AdaptiveModel(
        language_model=LanguageModel.load(lm_name),
        prediction_heads=[QuestionAnsweringHead()],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epochs,
        device=device,
    )
    Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=4,
        device=device,
    ).train()

    save_dir = Path("testsave/qa")
    model.save(save_dir)
    processor.save(save_dir)

    inferencer = Inferencer.load(save_dir, batch_size=2, gpu=False, num_processes=0)

    # Both inputs carry the same content; only the dictionary keys differ.
    question = "Who counted the game among the best ever made?"
    passage = "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    qa_format_1 = [{"questions": [question], "text": passage}]
    qa_format_2 = [{"qas": [question], "context": passage}]

    result1 = inferencer.inference_from_dicts(dicts=qa_format_1)
    result2 = inferencer.inference_from_dicts(dicts=qa_format_2)
    assert result1 == result2
def main(args):
    """Train a text classification or question answering model configured by ``args``.

    Builds tokenizer, processor, data silo, language model, prediction head,
    optimizer, early stopping and trainer from the attributes of ``args``
    and then runs the training. The processor and all checkpoints are stored
    under ``Path(args.ckpt_path) / args.run_name``.

    :param args: object with the run configuration as attributes (task_name,
                 ckpt_path, run_name, tracking_uri, experiment_name,
                 pretrained_model_name_or_path, the data/file settings and the
                 training hyperparameters read below).
    :raises ValueError: if ``args.task_name`` is neither "text_classification"
                        nor "question_answering".
    """
    # Validate first: an unknown task previously failed only after the MLflow
    # experiment and the tokenizer had already been created.
    if args.task_name not in ("text_classification", "question_answering"):
        raise ValueError("task name error")
    is_qa = args.task_name == "question_answering"

    print(f"[INFO] PyTorch Version: {torch.__version__}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("[INFO] Devices available: {}".format(device))

    checkpoint_path = Path(args.ckpt_path) / args.run_name
    ml_logger = MLFlowLogger(tracking_uri=args.tracking_uri)
    ml_logger.init_experiment(experiment_name=args.experiment_name,
                              run_name=args.run_name)

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        do_lower_case=False)

    # Processor
    if is_qa:
        # NOTE(review): dev and test both read args.test_filename — confirm intended.
        processor = SquadProcessor(tokenizer=tokenizer,
                                   train_filename=args.train_filename,
                                   dev_filename=args.test_filename,
                                   test_filename=args.test_filename,
                                   max_seq_len=args.max_seq_len,
                                   data_dir=args.data_dir,
                                   label_list=args.label_list,
                                   metric=args.metric,
                                   max_query_length=64,
                                   doc_stride=128,
                                   max_answers=1)
    else:
        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            train_filename=args.train_filename,
            dev_filename=None,
            test_filename=args.test_filename,
            header=0,
            max_seq_len=args.max_seq_len,
            data_dir=args.data_dir,
            label_list=args.label_list,
            metric=args.metric,
            label_column_name=args.label_column_name,
            text_column_name=args.text_column_name)

    processor.save(checkpoint_path)

    # DataSilo
    data_silo = DataSilo(processor=processor,
                         batch_size=args.batch_size,
                         eval_batch_size=args.eval_batch_size,
                         caching=True,
                         cache_path=checkpoint_path)

    # LanguageModel: Build pretrained language model
    language_model = LanguageModel.load(args.pretrained_model_name_or_path,
                                        language="korean")

    # PredictionHead: Build predictor layer (needs the data silo for class weights)
    if is_qa:
        prediction_head = QuestionAnsweringHead(
            layer_dims=[768, 2],
            task_name=args.task_name,
        )
        lm_output_types = ["per_token"]
    else:
        # If you do classification on imbalanced classes, consider using class weights.
        # They change the loss function to down-weight frequent classes.
        prediction_head = TextClassificationHead(
            num_labels=len(args.label_list),
            class_weights=data_silo.calculate_class_weights(
                task_name=args.task_name))
        lm_output_types = ["per_sequence"]

    # AdaptiveModel: Combine all
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=args.embeds_dropout_prob,
                          lm_output_types=lm_output_types,
                          device=device)

    # Initialize Optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        device=device,
        learning_rate=args.learning_rate,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=args.n_epochs)

    # EarlyStopping: both supported tasks use a metric that is maximised, so the
    # former "min" fallback could never be reached.
    earlymetric = "f1" if is_qa else "acc"
    earlystop = EarlyStopping(save_dir=checkpoint_path,
                              metric=earlymetric,
                              mode="max",
                              patience=5)

    # Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        lr_schedule=lr_schedule,
        data_silo=data_silo,
        early_stopping=earlystop,
        evaluate_every=args.evaluate_every,
        checkpoints_to_keep=args.checkpoints_to_keep,
        checkpoint_root_dir=checkpoint_path,
        checkpoint_every=args.checkpoint_every,
        epochs=args.n_epochs,
        n_gpu=args.n_gpu,
        device=device,
    )

    # now train! (the return value was never used, so it is not bound)
    trainer.train()
def convert_from_transformers(model_name_or_path, device, revision=None, task_type=None, processor=None, **kwargs):
    """
    Load a (downstream) model from huggingface's transformers format. Use cases:
     - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
     - compare models without switching frameworks
     - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                                      Exemplary public names:
                                      - distilbert-base-uncased-distilled-squad
                                      - deepset/bert-large-uncased-whole-word-masking-squad2

                                      See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :type revision: str
    :param task_type: One of :
                      - 'lm'
                      - 'question_answering'
                      - 'regression'
                      - 'text_classification'
                      - 'ner'
                      - 'embeddings'
                      If None, the task type is inferred from the first entry of the
                      model config's ``architectures``.
    :param processor: populates prediction head with information coming from tasks
    :type processor: Processor
    :param kwargs: forwarded to ``LanguageModel.load`` and to the prediction heads' ``load``
    :return: AdaptiveModel
    :raises NotImplementedError: if ``task_type`` is given but not supported, or if regression /
                                 text classification is requested for a model whose name/path
                                 contains "roberta".
    :raises ValueError: if ``task_type`` is None and cannot be inferred from the model config.
    """
    supported_task_types = ("lm", "question_answering", "regression", "text_classification", "ner", "embeddings")
    # An explicit but unknown task type used to fall through every branch below and
    # crash with UnboundLocalError on `adaptive_model`; reject it before loading anything.
    if task_type is not None and task_type not in supported_task_types:
        raise NotImplementedError(f"Huggingface's transformer models of type {task_type} are not supported yet")

    lm = LanguageModel.load(model_name_or_path, revision=revision, **kwargs)

    if task_type is None:
        # Infer task type from config; `architectures` can be missing (None) or empty.
        architectures = lm.model.config.architectures or [""]
        architecture = architectures[0]
        if "MaskedLM" in architecture:
            task_type = "lm"
        elif "QuestionAnswering" in architecture:
            task_type = "question_answering"
        elif "SequenceClassification" in architecture:
            if lm.model.config.num_labels == 1:
                task_type = "regression"
            else:
                task_type = "text_classification"
        elif "TokenClassification" in architecture:
            task_type = "ner"
        else:
            message = (
                "Could not infer task type from model config. Please provide task type manually. "
                "('lm', 'question_answering', 'regression', 'text_classification', 'ner' or 'embeddings')"
            )
            logger.error(message)
            # Stop here: without a task type no model can be built below.
            raise ValueError(message)

    if task_type == "lm":
        ph = BertLMHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_token", device=device)

    elif task_type == "question_answering":
        ph = QuestionAnsweringHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_token", device=device)

    elif task_type == "regression":
        # str() keeps the substring check working for pathlib.Path inputs as well.
        if "roberta" in str(model_name_or_path):
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            message = "Conversion for Regression with Roberta or XLMRoberta not possible at the moment."
            logger.error(message)
            raise NotImplementedError(message)
        # NOTE(review): unlike the other heads, no `revision` is passed here — confirm
        # whether RegressionHead.load accepts it before adding it.
        ph = RegressionHead.load(model_name_or_path, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence", device=device)

    elif task_type == "text_classification":
        if "roberta" in str(model_name_or_path):
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            message = "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
            logger.error(message)
            raise NotImplementedError(message)
        ph = TextClassificationHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence", device=device)

    elif task_type == "ner":
        ph = TokenClassificationHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_token", device=device)

    else:
        # Only "embeddings" is left: language model outputs without a prediction head.
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[], embeds_dropout_prob=0.1,
                                          lm_output_types=["per_token", "per_sequence"], device=device)

    if processor:
        adaptive_model.connect_heads_with_processor(processor.tasks)

    return adaptive_model
from farm.modeling.adaptive_model import AdaptiveModel from farm.modeling.language_model import Bert from farm.modeling.tokenization import BertTokenizer from farm.modeling.prediction_head import QuestionAnsweringHead from farm.data_handler.processor import SquadProcessor from farm.utils import initialize_device_settings ####################### loads a SQUAD finetuned model # saves it as a FARM adaptive model device, n_gpu = initialize_device_settings(use_cuda=True) model = "bert-large-uncased-whole-word-masking-finetuned-squad" save_dir = "saved_models/FARM-bert-large-uncased-whole-word-masking-finetuned-squad" lm = Bert.load(model) ph = QuestionAnsweringHead.load(model) am = AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1, lm_output_types="per_token", device=device) am.save(save_dir) # saves the processor associated with it, so you can use it in inference mode # TODO load HF's tokenizer_config.json and adjust settings tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model) label_list = ["start_token", "end_token"] metric = "squad" processor = SquadProcessor( tokenizer=tokenizer, max_seq_len=256, label_list=label_list, metric=metric, data_dir="../data/squad20",
def train_evaluation_single(seed=42):
    """Train a QA model once and compare its dev-set results with gold values.

    A "roberta-base" language model with a question answering head is trained
    for two epochs, saved to ``testsave/roberta-qa-dev`` and evaluated on the
    dev data loader. F1, EM and top-n accuracy (scaled by 100) as well as the
    wall-clock training time are compared against hard-coded gold numbers.
    Every deviation beyond the tolerance is appended to ``error_messages``,
    which, like ``n_gpu_factor``, has to be defined outside this function.

    :param seed: Seed passed on to ``set_all_seeds``.
    :return: One-element list containing a dict with the measured values,
        the gold values and the differences between them.
    """
    # ---------------- Settings ----------------
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    # GPU utilization on 4x V100
    # 40*4, 14.3/16GB on master, 12.6/16 on others
    batch_size = 40 * n_gpu_factor
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    test_assertions = False

    # ---------------- Data ----------------
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,  # roberta is a cased model
    )
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename="train-v2.0.json",
        dev_filename="dev-v2.0.json",
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # ---------------- Model, optimizer, trainer ----------------
    language_model = LanguageModel.load(lang_model)
    qa_head = QuestionAnsweringHead(n_best=5, n_best_per_sample=1)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Only the training call itself is timed.
    train_start = time()
    trainer.train()
    elapsed = time() - train_start

    out_dir = Path("testsave/roberta-qa-dev")
    model.save(out_dir)
    processor.save(out_dir)

    # ---------------- Evaluation on the dev set ----------------
    evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("dev"),
        tasks=data_silo.processor.tasks,
        device=device,
    )
    results = evaluator.eval(model)
    head_result = results[0]
    f1_score = head_result["f1"] * 100
    em_score = head_result["EM"] * 100
    tnacc = head_result["top_n_accuracy"] * 100

    print(results)
    print(elapsed)

    # ---------------- Comparison with gold values ----------------
    gold_f1 = 82.155
    gold_EM = 78.6575  # 77.714
    gold_tnrecall = 97.3721
    gold_elapsed = 1135

    f1_diff = f1_score - gold_f1
    em_diff = em_score - gold_EM
    tnacc_diff = tnacc - gold_tnrecall
    elapsed_diff = elapsed - gold_elapsed

    if test_assertions:
        # Hard failures; switched off by the local flag above.
        np.testing.assert_allclose(
            f1_score, gold_f1, rtol=0.01,
            err_msg=f"FARM Training changed for f1 score by: {f1_diff}")
        np.testing.assert_allclose(
            em_score, gold_EM, rtol=0.01,
            err_msg=f"FARM Training changed for EM by: {em_diff}")
        np.testing.assert_allclose(
            tnacc, gold_tnrecall, rtol=0.01,
            err_msg=f"FARM Training changed for top 5 accuracy by: {tnacc_diff}")
        np.testing.assert_allclose(
            elapsed, gold_elapsed, rtol=0.1,
            err_msg=f"FARM Training speed changed significantly by: {elapsed_diff} seconds")

    f1_change = round(f1_diff, 4)
    em_change = round(em_diff, 4)
    tnacc_change = round(tnacc_diff, 4)
    elapsed_change = round(elapsed_diff, 4)

    # Soft checks: (measured, gold, relative tolerance, message on deviation).
    soft_checks = (
        (f1_score, gold_f1, 0.01,
         f"FARM Training changed for f1 score by: {f1_change}"),
        (em_score, gold_EM, 0.01,
         f"FARM Training changed for EM by: {em_change}"),
        (tnacc, gold_tnrecall, 0.01,
         f"FARM Training changed for top 5 accuracy by: {tnacc_change}"),
        (elapsed, gold_elapsed, 0.1,
         f"FARM Training speed changed significantly by: {elapsed_change} seconds"),
    )
    for measured, gold, tolerance, message in soft_checks:
        if not np.allclose(measured, gold, rtol=tolerance):
            error_messages.append(message)

    benchmark_result = [{
        "run": "train evaluation",
        "f1_change": f1_change,
        "em_change": em_change,
        "tnacc_change": tnacc_change,
        "elapsed_change": elapsed_change,
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnrecall,
        "elapsed_gold": gold_elapsed,
    }]
    logger.info("\n\n" + pformat(benchmark_result) + "\n")
    return benchmark_result
def question_answering():
    """Train a question answering model on SQuAD 2.0 files and run inference with it.

    The steps are: configure logging and MLflow tracking, build tokenizer,
    processor and data silo, train an AdaptiveModel with a question answering
    head, save model and processor, then reload the saved directory with
    ``QAInferencer`` to answer one sample question and to write predictions
    for the whole dev file to ``predictions.json``.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_question_answering",
    )

    # ---------------- Settings ----------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 2
    evaluate_every = 2000
    lang_model = "roberta-base"

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,  # roberta is a cased model
    )

    # 2. DataProcessor: handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename="train-v2.0.json",
        dev_filename="dev-v2.0.json",
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # 3. DataSilo: loads several datasets (train/dev/test), provides DataLoaders
    #    for them and calculates a few descriptive statistics of the datasets.
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that
    # they are calculated on a token level instead of a word level
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size,
        distributed=False,
    )

    # 4. AdaptiveModel
    #    a) pretrained language model as the basis
    language_model = LanguageModel.load(lang_model)
    #    b) prediction head on top that suits the task => Question Answering
    qa_head = QuestionAnsweringHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Trainer: takes care of growing the model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train. Tracked metrics can be watched live on the public mlflow
    #    server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Store the trained model together with its processor
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it again for inference on a single hand-written sample
    QA_input = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]

    inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    sample_predictions = inferencer.inference_from_dicts(dicts=QA_input)
    pprint.pprint(sample_predictions[0])

    # 10. Inference on the whole SQuAD dev file & write the predictions file to disk
    dev_file = os.path.join(processor.data_dir, processor.dev_filename)
    dev_predictions = inferencer.inference_from_file(file=dev_file, return_json=False)
    result_squad = []
    for prediction in dev_predictions:
        result_squad.append(prediction.to_squad_eval())

    write_squad_predictions(
        predictions=result_squad,
        predictions_filename=dev_file,
        out_filename="predictions.json",
    )
def test_qa(caplog):
    """Smoke test: train a tiny QA model on sample data, save it, reload it and predict.

    Passes when the reloaded inferencer returns an ``int`` as the
    ``offset_start`` of the first answer for a single sample question.
    """
    # Only CRITICAL log records are captured while the test runs.
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    base_LM_model = "bert-base-cased"
    label_list = ["start_token", "end_token"]

    # Data pipeline on the small sample files.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model,
        do_lower_case=False,
    )
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        label_list=label_list,
        metric="squad",
    )
    data_silo = DataSilo(processor=processor, batch_size=2)

    # Language model plus a QA head with one output per label.
    language_model = LanguageModel.load(base_LM_model)
    qa_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        warmup_proportion=0.2,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=4,
        device=device,
    )
    model = trainer.train(model)

    # Persist model and processor, then reload both through the Inferencer.
    save_dir = "testsave/qa"
    model.save(save_dir)
    processor.save(save_dir)

    QA_input = [{
        "questions": ["In what country is Normandy"],
        "text": "The Normans gave their name to Normandy, a region in France.",
    }]
    inferencer = Inferencer.load(save_dir)
    result = inferencer.inference_from_dicts(
        dicts=QA_input,
        use_multiprocessing=False,
    )

    first_prediction = result[0]["predictions"][0]
    first_answer = first_prediction["answers"][0]
    assert isinstance(first_answer["offset_start"], int)
train_filename=train_filename, dev_filename=dev_filename, test_filename=None, data_dir="../data/squad20", ) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False) # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(base_LM_model) # b) and a prediction head on top that is suited for our task => Question Answering prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)]) model = AdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_token"], device=device, ) # 5. Create an optimizer optimizer, warmup_linear = initialize_optimizer( model=model, learning_rate=3e-5, warmup_proportion=0.1, n_batches=len(data_silo.loaders["train"]),