def question_answering():
    """Fine-tune ``roberta-base`` for extractive QA, save it and run inference.

    The steps are: configure logging and MLflow tracking, build the tokenizer,
    processor and data silo from ``../data/squad20``, train an
    ``AdaptiveModel`` with a ``QuestionAnsweringHead``, store model and
    processor under ``../saved_models/bert-english-qa-tutorial``, then reload
    the saved model through ``QAInferencer`` to answer one sample question and
    to predict on the whole dev file (results are handed to
    ``write_squad_predictions`` with ``out_filename="predictions.json"``).

    The function takes no arguments and returns nothing; every setting is a
    local constant below.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 2
    evaluate_every = 2000
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # 2. Create a DataProcessor that handles all the conversion from raw text
    #    into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test),
    #    provides DataLoaders for them and calculates a few descriptive
    #    statistics of our datasets
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that
    # they are calculated on a token level instead of a word level
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    #    => Question Answering
    prediction_head = QuestionAnsweringHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model
    #    and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow
    #    server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    QA_input = [
        {
            "qas": ["Who counted the game among the best ever made?"],
            "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
        }
    ]

    inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    try:
        result = inferencer.inference_from_dicts(dicts=QA_input)[0]
        pprint.pprint(result)

        # 10. Do Inference on whole SQuAD Dataset & write the predictions
        #     file to disk
        filename = os.path.join(processor.data_dir, processor.dev_filename)
        result = inferencer.inference_from_file(file=filename, return_json=False)
        result_squad = [x.to_squad_eval() for x in result]

        write_squad_predictions(
            predictions=result_squad,
            predictions_filename=filename,
            out_filename="predictions.json",
        )
    finally:
        # Release the inferencer's worker pool even if inference fails, the
        # same way text_pair_classification() in this file does after its
        # inference step.
        # NOTE(review): assumes QAInferencer exposes close_multiprocessing_pool()
        # like the Inferencer used elsewhere in this file - confirm against the
        # installed FARM version.
        inferencer.close_multiprocessing_pool()
def xlmr_qa_demo():
    """Train ``xlm-roberta-large`` for QA and optionally predict on an MLQA file.

    Two hard-coded local flags steer the run: ``train`` (``True``) fine-tunes
    the model on the SQuAD files in ``../data/squad20`` and stores model and
    processor in ``../saved_models/xlmr-large-qa``; ``inference`` (``False``)
    reloads the stored model, predicts on ``inference_file`` and dumps the
    top prediction per id to ``predictions.json`` and all predictions per id
    to ``full_predictions.json`` inside the save directory.

    The function takes no arguments and returns nothing.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="run_xmlr_qa")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 3
    grad_acc_steps = 8
    n_epochs = 2
    evaluate_every = 200
    base_LM_model = "xlm-roberta-large"

    data_dir = Path("../data/squad20")
    train_filename = Path("train-v2.0.json")
    dev_filename = Path("dev-v2.0.json")

    save_dir = Path("../saved_models/xlmr-large-qa")

    inference_file = Path("../data/MLQA_V1/dev/dev-context-de-question-de.json")
    predictions_file = save_dir / "predictions.json"
    full_predictions_file = save_dir / "full_predictions.json"
    max_processes_for_inference = 8

    train = True
    inference = False

    if train:
        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)

        # 2. Create a DataProcessor that handles all the conversion from raw
        #    text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=384,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=None,
            data_dir=data_dir,
            dev_split=0.0,
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test),
        #    provides DataLoaders for them and calculates a few descriptive
        #    statistics of our datasets
        data_silo = DataSilo(
            processor=processor,
            batch_size=batch_size,
            distributed=False,
            max_processes=1,
        )

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(base_LM_model, n_added_tokens=3)
        # b) and a prediction head on top that is suited for our task
        #    => Question Answering
        prediction_head = QuestionAnsweringHead()

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=3e-5,
            schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=device,
        )

        # 6. Feed everything to the Trainer, which takes care of growing our
        #    model and evaluates it from time to time
        trainer = Trainer(
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 7. Let it grow! Watch the tracked metrics live on the public mlflow
        #    server: https://public-mlflow.deepset.ai
        model = trainer.train(model)

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)

    if inference:
        model = Inferencer.load(save_dir, batch_size=32, gpu=True)
        full_result = model.inference_from_file(
            file=inference_file,
            max_processes=max_processes_for_inference,
        )

        # Echo every raw prediction, separated by a blank line.
        for x in full_result:
            print(x)
            print()

        # Top prediction per question id, then the complete prediction list.
        result = {r["id"]: r["preds"][0][0] for r in full_result}
        full_result = {r["id"]: r["preds"] for r in full_result}

        # ensure_ascii=False emits non-ASCII answer text verbatim, so the files
        # are opened explicitly as UTF-8 instead of relying on the platform
        # default encoding; the context managers make sure both handles are
        # flushed and closed.
        with open(predictions_file, "w", encoding="utf-8") as predictions_out:
            json.dump(result, predictions_out, indent=4, ensure_ascii=False)
        with open(full_predictions_file, "w", encoding="utf-8") as full_predictions_out:
            json.dump(full_result, full_predictions_out, indent=4, ensure_ascii=False)
epochs=n_epochs, n_gpu=n_gpu, warmup_linear=warmup_linear, evaluate_every=evaluate_every, device=device, ) # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai model = trainer.train(model) # 8. Hooray! You have a model. Store it: model.save(save_dir) processor.save(save_dir) if inference: model = Inferencer.load(save_dir, batch_size=32, gpu=True) full_result = model.inference_from_file( file=inference_file, use_multiprocessing=inference_multiprocessing) for x in full_result: print(x) print() result = {r["id"]: r["preds"][0][0] for r in full_result} full_result = {r["id"]: r["preds"] for r in full_result} json.dump(result, open(predictions_file, "w"), indent=4, ensure_ascii=False) json.dump(full_result, open(full_predictions_file, "w"), indent=4,
def text_pair_classification():
    """Train and evaluate a BERT passage-ranking classifier on MSMARCO data.

    Runs the complete example pipeline: optional one-off conversion of the
    MSMARCO source files into FARM's tsv layout, training a text pair
    classifier on ``train.tsv``, saving it to
    ``saved_models/passage_ranking_model``, predicting on ``dev_200k.tsv`` and
    passing the raw predictions to ``msmarco_evaluation``.

    The function takes no arguments and returns nothing.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_text_pair_classification",
    )

    # ---------------------------------------------------------------- settings
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    train_filename = "train.tsv"
    dev_filename = "dev_200k.tsv"

    # The source data can be found here
    # https://github.com/microsoft/MSMARCO-Passage-Ranking
    generate_data = False
    data_dir = Path("../data/msmarco_passage")
    predictions_raw_filename = "predictions_raw.txt"
    predictions_filename = "predictions.txt"
    train_source_filename = "triples.train.1m.tsv"
    qrels_filename = "qrels.dev.tsv"
    queries_filename = "queries.dev.tsv"
    passages_filename = "collection.tsv"
    top1000_filename = "top1000.dev"

    # Paths that are needed in more than one place further down.
    dev_path = data_dir / dev_filename
    qrels_path = data_dir / qrels_filename

    # Step 0: convert the MSMARCO dumps into tsv files with the three columns
    # text, text_b and label that FARM can ingest. Only needs to be done once!
    if generate_data:
        reformat_msmarco_train(data_dir / train_source_filename,
                               data_dir / train_filename)
        reformat_msmarco_dev(data_dir / queries_filename,
                             data_dir / passages_filename,
                             qrels_path,
                             data_dir / top1000_filename,
                             dev_path)

    # Step 1: tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    # Step 2: DataProcessor turning raw text into a pytorch Dataset.
    # Evaluation during training is performed on a slice of the train set;
    # the msmarco dev set serves as the final evaluation set.
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        train_filename=train_filename,
        test_filename=None,
        dev_split=0.001,
        max_seq_len=128,
        data_dir=data_dir,
        delimiter="\t")

    # Step 3: DataSilo that loads the datasets, provides DataLoaders for them
    # and calculates a few descriptive statistics.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: AdaptiveModel = pretrained language model + task-specific head
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=class_weights,
    )

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device)

    # Step 5: optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # Step 6: hand everything to the Trainer, which runs the training loop and
    # evaluates the model from time to time.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # Step 7: train
    trainer.train()

    # Step 8: store the trained model together with its processor
    save_dir = Path("saved_models/passage_ranking_model")
    model.save(save_dir)
    processor.save(save_dir)

    # Step 9: reload for inference on the dev file and evaluate the output.
    # Add your own text adapted to the dataset you provide.
    raw_predictions_path = save_dir / predictions_raw_filename
    model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=128)
    predictions = model.inference_from_file(dev_path)

    write_msmarco_results(predictions, raw_predictions_path)

    msmarco_evaluation(
        preds_file=raw_predictions_path,
        dev_file=dev_path,
        qrels_file=qrels_path,
        output_file=save_dir / predictions_filename)

    model.close_multiprocessing_pool()
model = trainer.train(model) # 8. Hooray! You have a model. Store it: save_dir = "../saved_models/bert-english-qa-tutorial" model.save(save_dir) processor.save(save_dir) # 9. Load it & harvest your fruits (Inference) QA_input = [ { "questions": ["Who counted the game among the best ever made?"], "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created." }] model = Inferencer.load(save_dir, batch_size=40, gpu=True) result = model.inference_from_dicts(dicts=QA_input) for x in result: pprint.pprint(x) # 10. Do Inference on whole SQuAD Dataset & write the predictions file to disk filename = os.path.join(processor.data_dir,processor.dev_filename) result = model.inference_from_file(file=filename) write_squad_predictions( predictions=result, predictions_filename=filename, out_filename="predictions.json" )