def test_dpr_training():
    batch_size = 1
    n_epochs = 1
    distributed = False  # enable for multi GPU training via DDP
    evaluate_every = 1
    question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
    passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
    do_lower_case = True
    use_fast = True
    similarity_function = "dot_product"

    device, n_gpu = initialize_device_settings(use_cuda=False)

    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=question_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)
    label_list = ["hard_negative", "positive"]

    processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
                                        passage_tokenizer=passage_tokenizer,
                                        max_seq_len_query=10,
                                        max_seq_len_passage=10,
                                        label_list=label_list,
                                        metric="text_similarity_metric",
                                        data_dir="samples/dpr/",
                                        train_filename="sample.json",
                                        dev_filename="sample.json",
                                        test_filename=None,
                                        embed_title=True,
                                        num_hard_negatives=1,
                                        dev_split=0,
                                        max_samples=2)

    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)

    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path=question_lang_model,
        language_model_class="DPRQuestionEncoder")
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path=passage_lang_model,
        language_model_class="DPRContextEncoder")

    prediction_head = TextSimilarityHead(similarity_function=similarity_function)

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        optimizer_opts={"name": "TransformersAdamW", "correct_bias": True,
                        "weight_decay": 0.0, "eps": 1e-08},
        schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        grad_acc_steps=1,
        device=device,
        distributed=distributed
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    trainer.train()
##########################
set_all_seeds(seed=42)
batch_size = 32
use_gpu = True
device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

# 3. Create an AdaptiveModel with a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
adaptive_model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[],
    embeds_dropout_prob=0,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 4. Extract embeddings with model in inference mode
basic_texts = [
    {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"},
    {"text": "Martin Müller spielt Handball in Berlin"},
]
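# A minimal sketch of the extraction step (4.) that the snippet above is cut off
# before. It assumes FARM's Inferencer can wrap the existing adaptive_model and
# processor, and that extract_vectors returns one dict per input with "context"
# and "vec" keys (the shape the LM-finetuning test further below asserts on);
# treat the exact call as an assumption, not part of the source.
inferencer = Inferencer(adaptive_model, processor, task_type="embeddings", gpu=use_gpu)
vectors = inferencer.extract_vectors(dicts=basic_texts)
print(vectors[0]["vec"].shape)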
def train_from_scratch(args):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    # TODO prettify this loading of params from two sources (cmd + json)
    cmd_args = parse_arguments()
    args["local_rank"] = cmd_args.local_rank
    logging.info(f'local_rank: {args["local_rank"]}')

    next_sent_task = bool(int(args.get("next_sent_task", 1)))
    distributed = True
    use_amp = args.get("use_amp", None)
    use_amp = None if use_amp == "" else use_amp

    # Only the main process should log here
    if args["local_rank"] in [-1, 0]:
        ml_logger = StdoutLogger(tracking_uri=None)
        ml_logger.init_experiment(experiment_name="train_from_scratch", run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True,
                                               local_rank=args["local_rank"],
                                               use_amp=use_amp)

    effective_batch_size = int(args["per_gpu_batch_size"]) * int(
        args["gradient_accumulation_steps"]) * torch.distributed.get_world_size()
    logging.info(
        f'Training with effective batch size of {effective_batch_size} '
        f'(per_gpu_batch_size = {int(args["per_gpu_batch_size"])}, '
        f'gradient_accumulation_steps = {int(args["gradient_accumulation_steps"])}, '
        f'n_gpus = {torch.distributed.get_world_size()})'
    )

    save_dir = Path("/opt/ml/model")
    data_dir = Path("/opt/ml/input/data/input_channel")

    # Split and shuffle training data
    if args["local_rank"] in [-1, 0]:
        split_file(data_dir / args["train_file"], output_dir=data_dir / "split_files")
    # let the other processes wait for the split files from rank 0
    torch.distributed.barrier()
    args["train_file"] = data_dir / "split_files"

    # 1. Create a tokenizer
    tokenizer = BertTokenizer(data_dir / args["vocab_file"],
                              do_lower_case=bool(int(args["do_lower_case"])))

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(data_dir=data_dir,
                                     tokenizer=tokenizer,
                                     max_seq_len=int(args["max_seq_len"]),
                                     train_filename=args.get("train_file"),
                                     dev_filename=args.get("dev_file", None),
                                     test_filename=args.get("test_file", None),
                                     next_sent_pred_style=args.get("next_sent_pred_style", "bert-style"),
                                     max_docs=args.get("max_docs", None),
                                     next_sent_pred=next_sent_task)

    # 3. Create a DataSilo that loads several datasets (train/dev/test) and provides DataLoaders for them
    data_silo = StreamingDataSilo(processor=processor,
                                  batch_size=int(args["per_gpu_batch_size"]),
                                  dataloader_workers=int(args.get("data_loader_workers", 8)),
                                  distributed=distributed)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)

    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    if next_sent_task:
        next_sentence_head = NextSentenceHead(num_labels=2, task_name="nextsentence")
        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[lm_prediction_head, next_sentence_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token", "per_sequence"],
            device=device,
        )
    else:
        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[lm_prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=float(args["learning_rate"]),
        schedule_opts={"name": "LinearWarmup",
                       "warmup_proportion": float(args["warmup_proportion"])},
        n_batches=len(data_silo.get_data_loader("train")),
        n_epochs=int(args["n_epochs"]),
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        distributed=distributed,
        use_amp=use_amp,
        local_rank=args["local_rank"])

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    if args.get("checkpoint_every"):
        checkpoint_every = int(args["checkpoint_every"])
        checkpoint_root_dir = Path("/opt/ml/checkpoints/training")
    else:
        checkpoint_every = None
        checkpoint_root_dir = None

    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=int(args["n_epochs"]),
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=int(args["evaluate_every"]),
        log_loss_every=int(args.get("log_loss_every", 500)),
        log_learning_rate=bool(int(args.get("log_learning_rate", 0))),
        device=device,
        local_rank=args["local_rank"],
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        checkpoints_to_keep=int(args.get("checkpoints_to_keep", 10)),
        disable_tqdm=True,
        use_amp=use_amp,
    )

    # 7. Let it grow!
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
def test_doc_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(layer_dims=[
        768, len(processor.tasks["text_classification"]["label_list"])
    ])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts=None)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
    ]

    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_natural_questions")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 1
    evaluate_every = 500
    lang_model = "deepset/roberta-base-squad2"  # start with a model that can already extract answers
    do_lower_case = False  # roberta is a cased model
    train_filename = "train_medium.jsonl"
    dev_filename = "dev_medium.jsonl"
    keep_is_impossible = 0.15  # downsample negative examples after data conversion
    downsample_context_size = 300  # reduce length of wikipedia articles to relevant part around the answer

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case
    )
    # Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
    html_tags = [
        "<Th>", "</Th>",
        "<Td>", "</Td>",
        "<Tr>", "</Tr>",
        "<Li>", "</Li>",
        "<P>", "</P>",
        "<Ul>", "</Ul>",
        "<H1>", "</H1>",
        "<H2>", "</H2>",
        "<H3>", "</H3>",
        "<H4>", "</H4>",
        "<H5>", "</H5>",
        "<Td_colspan=",
    ]
    tokenizer.add_tokens(html_tags)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename=train_filename,
        dev_filename=dev_filename,
        keep_no_answer=keep_is_impossible,
        downsample_context_size=downsample_context_size,
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, caching=True)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model, n_added_tokens=len(html_tags))
    # b) and in case of Natural Questions we need two prediction heads:
    #    one for extractive Question Answering
    qa_head = QuestionAnsweringHead()
    #    and another one for answering yes/no questions or deciding if the given text passage might contain an answer
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list))  # answer_type_list = ["is_impossible", "span", "yes", "no"]

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Since training on the whole NQ corpus requires substantial compute resources,
    #    we trained a model ourselves and uploaded it to S3 - fetch it instead:
    fetch_archive_from_http(
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip",
        output_dir="../saved_models/farm")

    QA_input = [
        {
            "qas": ["Did GameTrailers rated Twilight Princess as one of the best games ever created?"],
            "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }
    ]

    model = QAInferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq",
                              batch_size=batch_size,
                              gpu=True)
    result = model.inference_from_dicts(dicts=QA_input, return_json=False)  # result is a list of QAPred objects

    print(f"\nQuestion: Did GameTrailers rated Twilight Princess as one of the best games ever created?"
          f"\nAnswer from model: {result[0].prediction[0].answer}")
def train_evaluation_single(seed=42):
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    # GPU utilization on 4x V100
    # 40*4, 14.3/16GB on master, 12.6/16 on others
    batch_size = 40 * n_gpu  # 40 per GPU; the reference run used 40*4 on 4x V100
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    test_assertions = False
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead(n_best=5, n_best_per_sample=1)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 78.4385
    gold_tnrecall = 97.3721
    gold_elapsed = 1135

    if test_assertions:
        np.testing.assert_allclose(
            f1_score, gold_f1, rtol=0.01,
            err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
        np.testing.assert_allclose(
            em_score, gold_EM, rtol=0.01,
            err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
        np.testing.assert_allclose(
            tnacc, gold_tnrecall, rtol=0.01,
            err_msg=f"FARM Training changed for top 5 accuracy by: {tnacc - gold_tnrecall}")
        np.testing.assert_allclose(
            elapsed, gold_elapsed, rtol=0.1,
            err_msg=f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds")
    if not np.allclose(f1_score, gold_f1, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for f1 score by: {round(f1_score - gold_f1, 4)}")
    if not np.allclose(em_score, gold_EM, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(tnacc, gold_tnrecall, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for top 5 accuracy by: {round(tnacc - gold_tnrecall, 4)}")
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"FARM Training speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds")

    benchmark_result = [{
        "run": "train evaluation",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": round(tnacc - gold_tnrecall, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnrecall,
        "elapsed_gold": gold_elapsed
    }]
    logger.info("\n\n" + pformat(benchmark_result) + "\n")
    return benchmark_result
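# Module-level context assumed by train_evaluation_single above (a sketch, not
# shown in the source): time and pformat come from the standard library, logger
# is the module logger, and error_messages collects deviations from the gold
# scores so several benchmark runs can report together.
import logging
from time import time
from pprint import pformat

logger = logging.getLogger(__name__)
error_messages = []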
def __init__(self,
             document_store: BaseDocumentStore,
             query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base",
             passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
             max_seq_len_query: int = 64,
             max_seq_len_passage: int = 256,
             use_gpu: bool = True,
             batch_size: int = 16,
             embed_title: bool = True,
             use_fast_tokenizers: bool = True,
             similarity_function: str = "dot_product"):
    """
    Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
    The checkpoint format matches huggingface transformers' model format.

    **Example:**

    ```python
    |    # remote model from FAIR
    |    DensePassageRetriever(document_store=your_doc_store,
    |                          query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    |                          passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
    |    # or from local path
    |    DensePassageRetriever(document_store=your_doc_store,
    |                          query_embedding_model="model_directory/question-encoder",
    |                          passage_embedding_model="model_directory/context-encoder")
    ```

    :param document_store: An instance of DocumentStore from which to retrieve documents.
    :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                  one used by hugging-face transformers' modelhub models.
                                  Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
    :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                    one used by hugging-face transformers' modelhub models.
                                    Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
    :param max_seq_len_query: Maximum number of tokens for the query text. Longer ones will be cut down.
    :param max_seq_len_passage: Maximum number of tokens for the passage text. Longer ones will be cut down.
    :param use_gpu: Whether to use gpu or not
    :param batch_size: Number of questions or passages to encode at once
    :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
                        This is the approach used in the original paper and is likely to improve
                        performance if your titles contain meaningful information for retrieval
                        (topic, entities etc.).
                        The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                        before writing them to the DocumentStore like this:
                        {"text": "my text", "meta": {"name": "my title"}}.
    """
    self.document_store = document_store
    self.batch_size = batch_size
    self.max_seq_len_passage = max_seq_len_passage
    self.max_seq_len_query = max_seq_len_query

    if document_store is None:
        logger.warning(
            "DensePassageRetriever initialized without a document store. "
            "This is fine if you are performing DPR training. "
            "Otherwise, please provide a document store in the constructor.")
    elif document_store.similarity != "dot_product":
        logger.warning(
            f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
            "We recommend you use dot_product instead. "
            "This can be set when initializing the DocumentStore")

    if use_gpu and torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")

    self.embed_title = embed_title

    # Init & Load Encoders
    self.query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=query_embedding_model,
        do_lower_case=True,
        use_fast=use_fast_tokenizers,
        tokenizer_class="DPRQuestionEncoderTokenizer")
    self.query_encoder = LanguageModel.load(
        pretrained_model_name_or_path=query_embedding_model,
        language_model_class="DPRQuestionEncoder")

    self.passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_embedding_model,
        do_lower_case=True,
        use_fast=use_fast_tokenizers,
        tokenizer_class="DPRContextEncoderTokenizer")
    self.passage_encoder = LanguageModel.load(
        pretrained_model_name_or_path=passage_embedding_model,
        language_model_class="DPRContextEncoder")

    self.processor = TextSimilarityProcessor(
        tokenizer=self.query_tokenizer,
        passage_tokenizer=self.passage_tokenizer,
        max_seq_len_passage=self.max_seq_len_passage,
        max_seq_len_query=self.max_seq_len_query,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        embed_title=self.embed_title,
        num_hard_negatives=0,
        num_positives=1)

    prediction_head = TextSimilarityHead(similarity_function=similarity_function)
    self.model = BiAdaptiveModel(
        language_model1=self.query_encoder,
        language_model2=self.passage_encoder,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=self.device,
    )

    self.model.connect_heads_with_processor(self.processor.tasks, require_labels=False)
from farm.evaluation.metrics import register_metrics

register_metrics('f1_weighted', custom_f1_score)
metric = 'f1_weighted'

processor.add_task(name="document_level_task",
                   label_list=LABEL_LIST,
                   metric="acc",
                   text_column_name="text",
                   label_column_name="label",
                   task_type="classification")
processor.add_task(name="token_level_task",
                   label_list=TRIGGER_LABELS,
                   metric=metric,
                   text_column_name="text",
                   label_column_name="tokens",
                   task_type="ner")

data_silo = DataSilo(processor=processor, batch_size=BATCH_SIZE)

language_model = LanguageModel.load(LANG_MODEL)
document_level_task_head = TextClassificationHead(num_labels=len(LABEL_LIST),
                                                  task_name="document_level_task")
token_level_task_head = TokenClassificationHead(num_labels=len(TRIGGER_LABELS),
                                                task_name="token_level_task")
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[document_level_task_head, token_level_task_head],
    embeds_dropout_prob=EMBEDS_DROPOUT_PROB,
    lm_output_types=["per_sequence", "per_token"],
    device=DEVICE,
    loss_aggregation_fn=my_loss_agg)

model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    device=DEVICE,
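# custom_f1_score and my_loss_agg are referenced in the multitask snippet above
# but not defined there. Minimal sketches under assumed signatures: a registered
# metric maps (preds, labels) to a dict of values, and a loss_aggregation_fn
# reduces the per-head losses to the single scalar that gets backpropagated.
import torch
from sklearn.metrics import f1_score


def custom_f1_score(preds, labels):
    return {"f1_weighted": f1_score(y_true=labels, y_pred=preds, average="weighted")}


def my_loss_agg(individual_losses, global_step=None, batch=None):
    # weight both heads equally by summing their losses
    return sum(torch.sum(loss) for loss in individual_losses)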
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = BertStyleLMProcessor(data_dir="../data/lm_finetune_nips",
                                 tokenizer=tokenizer,
                                 max_seq_len=max_seq_len,
                                 train_filename="train_small.txt",
                                 dev_filename="train_small.txt",
                                 test_filename=None)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.from_scratch("bert", vocab_size)

# b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
lm_prediction_head = BertLMHead(768, vocab_size)
next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence")

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
def test_lm_finetuning_no_next_sentence(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = BertStyleLMProcessor(data_dir="samples/lm_finetuning",
                                     train_filename="train-sample.txt",
                                     test_filename="test-sample.txt",
                                     dev_filename=None,
                                     tokenizer=tokenizer,
                                     max_seq_len=12,
                                     next_sent_pred=False)
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning_no_nsp"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary across runs with the same seed
    assert isinstance(result[0]["vec"][0], np.float32)
def dense_passage_retrieval():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="FARM-dense_passage_retrieval", run_name="Run_dpr")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    batch_size = 4
    n_epochs = 3
    distributed = False  # enable for multi GPU training via DDP
    evaluate_every = 1000
    question_lang_model = "facebook/dpr-question_encoder-single-nq-base"
    passage_lang_model = "facebook/dpr-ctx_encoder-single-nq-base"
    do_lower_case = True
    use_fast = True
    embed_title = True
    num_hard_negatives = 1
    similarity_function = "dot_product"
    train_filename = "nq-train.json"
    dev_filename = "nq-dev.json"
    test_filename = "nq-dev.json"
    max_samples = None  # load a smaller dataset (e.g. for debugging)

    # For multi GPU Training via DDP we need to get the local rank
    args = parse_arguments()
    device, n_gpu = initialize_device_settings(use_cuda=True, local_rank=args.local_rank)

    # 1. Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=question_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # data_dir "data/retriever" should contain DPR training and dev files downloaded from https://github.com/facebookresearch/DPR
    # i.e., nq-train.json, nq-dev.json or trivia-train.json, trivia-dev.json
    label_list = ["hard_negative", "positive"]
    metric = "text_similarity_metric"
    processor = TextSimilarityProcessor(tokenizer=query_tokenizer,
                                        passage_tokenizer=passage_tokenizer,
                                        max_seq_len_query=64,
                                        max_seq_len_passage=256,
                                        label_list=label_list,
                                        metric=metric,
                                        data_dir="../data/retriever",
                                        train_filename=train_filename,
                                        dev_filename=dev_filename,
                                        test_filename=test_filename,
                                        embed_title=embed_title,
                                        num_hard_negatives=num_hard_negatives,
                                        max_samples=max_samples)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=distributed)

    # 4. Create a BiAdaptiveModel
    # a) which consists of 2 pretrained language models as a basis
    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRQuestionEncoder")
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRContextEncoder")

    # b) and a prediction head on top that is suited for our task => Text Similarity
    prediction_head = TextSimilarityHead(similarity_function=similarity_function)

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        optimizer_opts={"name": "TransformersAdamW", "correct_bias": True,
                        "weight_decay": 0.0, "eps": 1e-08},
        schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        grad_acc_steps=1,
        device=device,
        distributed=distributed
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/dpr-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Evaluate
    test_data_loader = data_silo.get_data_loader("test")
    if test_data_loader is not None:
        evaluator_test = Evaluator(data_loader=test_data_loader,
                                   tasks=data_silo.processor.tasks,
                                   device=device)
        model.connect_heads_with_processor(processor.tasks)
        test_result = evaluator_test.eval(model)
def train_from_scratch(args):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    ml_logger = MLFlowLogger(tracking_uri=args.get("mlflow_tracking_uri", "file:/opt/ml/model/mlflow"))
    ml_logger.init_experiment(experiment_name="train_from_scratch", run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    evaluate_every = int(args["evaluate_every"])

    save_dir = Path("/opt/ml/model")
    data_dir = Path("/opt/ml/input/data/input_channel")

    # 1. Create a tokenizer
    tokenizer = BertTokenizer(data_dir / args["vocab_file"],
                              do_lower_case=args["do_lower_case"])

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=data_dir,
        tokenizer=tokenizer,
        max_seq_len=int(args["max_seq_len"]),
        train_filename=args["train_file"],
        dev_filename=args.get("dev_file", None),
        test_filename=args.get("test_file", None),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    #    calculates a few descriptive statistics of our datasets
    stream_data_silo = StreamingDataSilo(processor=processor, batch_size=int(args["batch_size"]))

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=float(args["learning_rate"]),
        schedule_opts={"name": "LinearWarmup",
                       "warmup_proportion": float(args["warmup_proportion"])},
        n_batches=len(stream_data_silo.get_data_loader("train")),
        n_epochs=int(args["n_epochs"]),
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    if args.get("checkpoint_every"):
        checkpoint_every = int(args["checkpoint_every"])
        checkpoint_root_dir = Path("/opt/ml/checkpoints/training")
    else:
        checkpoint_every = None
        checkpoint_root_dir = None
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=int(args["n_epochs"]),
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
def outcome_pretraining(task_config,
                        model_name,
                        cache_dir,
                        run_name="0",
                        lr=1e-05,
                        warmup_steps=5000,
                        embeds_dropout=0.1,
                        epochs=200,  # large because we use early stopping by default
                        batch_size=20,
                        grad_acc_steps=1,
                        early_stopping_metric="loss",
                        early_stopping_mode="min",
                        early_stopping_patience=10,
                        model_class="Bert",
                        tokenizer_class="BertTokenizer",
                        do_lower_case=True,
                        do_train=True,
                        do_eval=True,
                        do_hpo=False,
                        max_seq_len=512,
                        seed=11,
                        eval_every=500,
                        use_amp=False,
                        use_cuda=True,
                        ):
    # Load task config
    task_config = yaml.safe_load(open(task_config))

    data_dir = Path(task_config["data"]["data_dir"])

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name,
                               tokenizer_class=tokenizer_class,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = OutcomePretrainingProcessor(tokenizer=tokenizer,
                                            max_seq_len=max_seq_len,
                                            data_dir=data_dir,
                                            train_filename=task_config["data"]["train_filename"],
                                            dev_filename=task_config["data"]["dev_filename"],
                                            seed=seed,
                                            max_size_admission=50,
                                            max_size_discharge=50,
                                            cache_dir=cache_dir)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = OutcomePretrainingDataSilo(
        processor=processor,
        caching=True,
        cache_dir=cache_dir,
        batch_size=batch_size,
        max_multiprocessing_chunksize=200)

    if do_train:
        # Set save dir for experiment output
        save_dir = Path(task_config["output_dir"]) / f'{task_config["experiment_name"]}_{run_name}'

        # Use HPO config args if config is passed
        if do_hpo:
            save_dir = save_dir / tune.session.get_trial_name()
        else:
            exp_name = f"exp_{random.randint(100000, 999999)}"
            save_dir = save_dir / exp_name

        # Create save dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(experiment_name=task_config["experiment_name"],
                                  run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name, language_model_class=model_class)

        # b) and a NextSentenceHead prediction head, or a TextClassificationHead if it's not a Bert model
        if model_class == "Bert":
            next_sentence_head = NextSentenceHead.load(model_name)
        else:
            next_sentence_head = TextClassificationHead(num_labels=2)

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[next_sentence_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=["per_sequence"],
            device=device,
        )

        # 5. Create an optimizer
        schedule_opts = {"name": "LinearWarmup", "num_warmup_steps": warmup_steps}
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(
                mode=early_stopping_mode,
                min_delta=0.0001,
                save_dir=save_dir,
                metric=early_stopping_metric,
                patience=early_stopping_patience
            )

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it
        #    from time to time
        trainer = ExtendedTrainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=eval_every,
            early_stopping=early_stopping,
            device=device,
            grad_acc_steps=grad_acc_steps,
            evaluator_test=do_eval
        )

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(save_dir / "final_model")
        processor.save(save_dir / "final_model")

    if do_eval:
        # Load newly trained model or existing model
        if do_train:
            model_dir = save_dir
        else:
            model_dir = Path(model_name)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device
        )

        # Load trained model for evaluation
        model = AdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=True)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case
    )

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path(DATA_DIR),
        delimiter=" ",
        metric="seq_f1",
        label_list=ner_labels
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_loader_worker = 15
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = MODEL_DIR
    model.save(save_dir)
    processor.save(save_dir)
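# ner() above relies on module-level DATA_DIR and MODEL_DIR constants that are
# not part of the snippet. Plausible values, an assumption matching the
# German CoNLL-style label set used above (B-OTH/I-OTH etc.):
DATA_DIR = "../data/conll03-de"
MODEL_DIR = "saved_models/bert-german-ner-tutorial"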
def test_ner_amp(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "1980 kam der Crown von Toyota"},
    ]
    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0][0]["probability"], np.float32)
def doc_regression():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_regression")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # We do not have a sample dataset for regression yet, add your own dataset to run the example
    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=128,
                                    data_dir=Path("../data/<YOUR-DATASET>"),
                                    label_column_name="label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text regression
    prediction_head = RegressionHead()

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    model = trainer.train(model)

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-doc-regression-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    # Add your own text adapted to the dataset you provide
    basic_texts = [
        {"text": ""},
        {"text": ""},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.
    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name
    # must be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense",  # use the metric from our own metrics function instead of loss
        mode="max",
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    model = trainer.train(model)

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has already been stored in the directory
    # defined with the EarlyStopping instance.
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training.
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
def test_doc_regression(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=8,
                                    data_dir=Path("samples/doc_regr"),
                                    train_filename="train-sample.tsv",
                                    dev_filename="test-sample.tsv",
                                    test_filename=None,
                                    label_column_name="label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1}
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device
    )

    trainer.train()

    save_dir = Path("testsave/doc_regr")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."},
        {"text": "it just did not fit right. The top is very thin showing everything."},
    ]

    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["pred"], np.float32)
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    #############################################
    # CUSTOM OPTIMIZER & LR SCHEDULE
    #############################################
    # learning rate schedules from transformers
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "Constant"}
    # schedule_opts = {"name": "CosineWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "CosineWarmupWithRestarts", "warmup_proportion": 0.4}

    # or from native pytorch (see https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html for all options)
    # schedule_opts = {"name": "StepLR", "step_size": 30, "gamma": 0.1}
    # schedule_opts = {"name": "ReduceLROnPlateau", "mode": 'min', "factor": 0.1, "patience": 10}

    # optimizers from pytorch (see https://pytorch.org/docs/stable/optim.html for all options)
    optimizer_opts = {"name": "SGD", "momentum": 0.0}
    # or from apex (see https://github.com/NVIDIA/apex/tree/master/apex/optimizers for all options)
    # optimizer_opts = {"name": "FusedLAMB", "bias_correction": True}
    # or from transformers (default in FARM)
    # optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    #############################################

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv datasets - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("../data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="coarse_label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-3,
        optimizer_opts=optimizer_opts,
        schedule_opts=schedule_opts,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()
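# A minimal sketch of reading the classifier output above. The field names
# ("predictions", "label", "probability") follow the result schema referenced in
# other examples in this file; treat the exact schema as an assumption, and
# print_classification_results as a hypothetical helper.
def print_classification_results(result):
    for sample in result:
        for prediction in sample["predictions"]:
            print(prediction["label"], prediction["probability"])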
def test_dpr_modules(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base",
                                     do_lower_case=True, use_fast=True)
    context_tokenizer = Tokenizer.load(pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base",
                                       do_lower_case=True, use_fast=True)

    processor = TextSimilarityProcessor(
        tokenizer=query_tokenizer,
        passage_tokenizer=context_tokenizer,
        max_seq_len=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        dev_filename="nq-dev.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1
    )

    question_language_model = LanguageModel.load(pretrained_model_name_or_path="bert-base-uncased",
                                                 language_model_class="DPRQuestionEncoder",
                                                 hidden_dropout_prob=0,
                                                 attention_probs_dropout_prob=0)
    passage_language_model = LanguageModel.load(pretrained_model_name_or_path="bert-base-uncased",
                                                language_model_class="DPRContextEncoder",
                                                hidden_dropout_prob=0,
                                                attention_probs_dropout_prob=0)

    prediction_head = TextSimilarityHead(similarity_function="dot_product")

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.0,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    model.connect_heads_with_processor(processor.tasks)

    assert type(model) == BiAdaptiveModel
    assert type(processor) == TextSimilarityProcessor
    assert type(question_language_model) == DPRQuestionEncoder
    assert type(passage_language_model) == DPRContextEncoder

    # check embedding layer weights (two-sided tolerance via abs)
    assert abs(list(model.named_parameters())[0][1][0, 0].item() - (-0.010200000368058681)) < 0.0001

    d = {
        'query': 'big little lies season 2 how many episodes',
        'passages': [
            {'title': 'Big Little Lies (TV series)',
             'text': 'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley',
             'label': 'positive',
             'external_id': '18768923'},
            {'title': 'Little People, Big World',
             'text': 'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
             'label': 'hard_negative',
             'external_id': '7459116'},
            {'title': 'Cormac McCarthy',
             'text': 'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
             'label': 'negative',
             'passage_id': '2145653'}
        ]
    }

    sample = processor._dict_to_samples(d)
    feats = processor._sample_to_features(sample[0])
    dataset, tensor_names = convert_features_to_dataset(feats)
    features = {key: val.unsqueeze(0).to(device) for key, val in zip(tensor_names, dataset[0])}

    # test features
    assert torch.all(torch.eq(features["query_input_ids"][0][:10].cpu(),
                              torch.tensor([101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102])))
    assert torch.all(torch.eq(features["passage_input_ids"][0][0][:10].cpu(),
                              torch.tensor([101, 2502, 2210, 3658, 1006, 2694, 2186, 1007, 102, 2186])))
    assert len(features["query_segment_ids"][0].nonzero()) == 0
    assert len(features["passage_segment_ids"][0].nonzero()) == 0
    assert torch.all(torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(),
                              torch.tensor(list(range(10)))))
    assert torch.all(torch.eq(features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(),
                              torch.tensor(list(range(127)))))
    assert torch.all(torch.eq(features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(),
                              torch.tensor(list(range(143)))))

    # test model encodings (compare absolute differences against the tolerance)
    query_vector = model.language_model1(**features)[0]
    passage_vector = model.language_model2(**features)[0]
    assert torch.all(torch.le(torch.abs(query_vector[0, :10].cpu()
                                        - torch.tensor([-0.2135, -0.4748, 0.0501, -0.0430, -0.1747,
                                                        -0.0441, 0.5638, 0.1405, 0.2285, 0.0893])),
                              torch.ones((1, 10)) * 0.0001))
    assert torch.all(torch.le(torch.abs(passage_vector[0, :10].cpu()
                                        - torch.tensor([0.0557, -0.6836, -0.3645, -0.5566, 0.2034,
                                                        -0.3656, 0.2969, -0.0555, 0.3405, -0.8691])),
                              torch.ones((1, 10)) * 0.0001))
    assert torch.all(torch.le(torch.abs(passage_vector[1, :10].cpu()
                                        - torch.tensor([-0.2006, -1.5002, -0.1897, -0.3421, -0.0405,
                                                        -0.0471, -0.0306, 0.1156, 0.3350, -0.3412])),
                              torch.ones((1, 10)) * 0.0001))

    # test logits and loss
    logits = model(**features)
    loss = model.logits_to_loss_per_head(logits, **features)
    similarity_scores = logits[0].cpu()
    assert torch.all(torch.le(torch.abs(similarity_scores - torch.tensor([[-1.8311e-03, -6.3016e+00]])),
                              torch.ones((1, 2)) * 0.0001))
    assert abs(loss[0].item() - 0.0018) <= 0.0001
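# Sketch: with similarity_function="dot_product", the scores produced by the
# TextSimilarityHead can be reproduced directly from the encoder outputs above.
# query_vector has shape (1, 768) and passage_vector (2, 768), so the result is
# one row per query with one dot-product score per passage.
def dot_product_scores(query_vector, passage_vector):
    return torch.matmul(query_vector, passage_vector.transpose(0, 1))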
def lm_finetuning():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    set_all_seeds(seed=42)
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(
        experiment_name="Public_FARM", run_name="Run_minimal_example_lm"
    )

    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=Path("../data/lm_finetune_nips"),
        tokenizer=tokenizer,
        max_seq_len=128,
        max_docs=20
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    # Both heads must be registered: one per entry in lm_output_types
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-english-lm-tutorial")
    model.save(save_dir)
    processor.save(save_dir)
def test_lm_finetuning_custom_vocab(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    tokenizer.add_tokens(["aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"])

    processor = BertStyleLMProcessor(
        data_dir=Path("samples/lm_finetuning"),
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=True
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model, n_added_tokens=len(tokenizer.get_added_vocab()))
    lm_prediction_head = BertLMHead.load(lang_model, n_added_tokens=len(tokenizer.get_added_vocab()))
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1}
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    trainer.train()

    # LM embeddings and weight of decoder in head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight,
                 model.prediction_heads[0].decoder.weight))

    save_dir = Path("testsave/lm_finetuning")
    model.save(save_dir)
    processor.save(save_dir)

    del model
    del processor
    del optimizer
    del data_silo
    del trainer

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, task_type="embeddings", num_processes=0)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary across runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
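# A minimal sketch (hypothetical token strings) of why n_added_tokens is passed
# above: added tokens grow the tokenizer vocab, so the embedding matrix of the
# language model must be resized before training can use them.
tokenizer = Tokenizer.load(pretrained_model_name_or_path="bert-base-cased", do_lower_case=False)
base_vocab_size = tokenizer.vocab_size
tokenizer.add_tokens(["newtoken1", "newtoken2"])
n_added = len(tokenizer.get_added_vocab())
language_model = LanguageModel.load("bert-base-cased", n_added_tokens=n_added)
# the embedding matrix now has one row per base token plus one per added token
assert language_model.model.embeddings.word_embeddings.weight.shape[0] == base_vocab_size + n_added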
def convert_from_transformers(cls, model_name_or_path, device, task_type, processor=None):
    """
    Load a (downstream) model from huggingface's transformers format. Use cases:
     - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
     - compare models without switching frameworks
     - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                               Exemplary public names:
                               - distilbert-base-uncased-distilled-squad
                               - deepset/bert-large-uncased-whole-word-masking-squad2
                               See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param task_type: One of:
                      - 'question_answering'
                      - 'text_classification'
                      - 'embeddings'
                      More tasks coming soon ...
    :param processor: populates prediction head with information coming from tasks
    :type processor: Processor
    :return: AdaptiveModel
    """
    lm = LanguageModel.load(model_name_or_path)
    # TODO: Infer type of head automatically from config
    if task_type == "question_answering":
        ph = QuestionAnsweringHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                             lm_output_types="per_token", device=device)
    elif task_type == "text_classification":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error(
                "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
            )
            raise NotImplementedError
        ph = TextClassificationHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                             lm_output_types="per_sequence", device=device)
    elif task_type == "ner":
        ph = TokenClassificationHead.load(model_name_or_path)
        adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                             lm_output_types="per_token", device=device)
    elif task_type == "embeddings":
        adaptive_model = cls(language_model=lm, prediction_heads=[], embeds_dropout_prob=0.1,
                             lm_output_types=["per_token", "per_sequence"], device=device)
    else:
        raise NotImplementedError(
            f"Huggingface's transformer models of type {task_type} are not supported yet"
        )

    if processor:
        adaptive_model.connect_heads_with_processor(processor.tasks)

    return adaptive_model
def convert_from_transformers(model_name_or_path, device, task_type=None, processor=None):
    """
    Load a (downstream) model from huggingface's transformers format. Use cases:
     - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
     - compare models without switching frameworks
     - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                               Exemplary public names:
                               - distilbert-base-uncased-distilled-squad
                               - deepset/bert-large-uncased-whole-word-masking-squad2
                               See https://huggingface.co/models for full list
    :param device: "cpu" or "cuda"
    :param task_type: One of:
                      - 'question_answering'
                      - 'text_classification'
                      - 'embeddings'
                      More tasks coming soon ...
    :param processor: populates prediction head with information coming from tasks
    :type processor: Processor
    :return: AdaptiveModel
    """
    lm = LanguageModel.load(model_name_or_path)
    if task_type is None:
        # Infer task type from config
        architecture = lm.model.config.architectures[0]
        if "MaskedLM" in architecture:
            task_type = "lm"
        elif "QuestionAnswering" in architecture:
            task_type = "question_answering"
        elif "SequenceClassification" in architecture:
            if lm.model.config.num_labels == 1:
                task_type = "regression"
            else:
                task_type = "text_classification"
        elif "TokenClassification" in architecture:
            task_type = "ner"
        else:
            logger.error(
                "Could not infer task type from model config. Please provide task type manually. "
                "('lm', 'question_answering', 'regression', 'text_classification', 'ner' or 'embeddings')"
            )
            # without raising here, the branches below would fail with an undefined `adaptive_model`
            raise NotImplementedError

    if task_type == "lm":
        ph = BertLMHead.load(model_name_or_path)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_token", device=device)

    elif task_type == "question_answering":
        ph = QuestionAnsweringHead.load(model_name_or_path)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_token", device=device)

    elif task_type == "regression":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error(
                "Conversion for Regression with Roberta or XLMRoberta not possible at the moment."
            )
            raise NotImplementedError
        ph = RegressionHead.load(model_name_or_path)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence", device=device)

    elif task_type == "text_classification":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error(
                "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
            )
            raise NotImplementedError
        ph = TextClassificationHead.load(model_name_or_path)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence", device=device)

    elif task_type == "ner":
        ph = TokenClassificationHead.load(model_name_or_path)
        adaptive_model = am.AdaptiveModel(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                          lm_output_types="per_token", device=device)

    elif task_type == "embeddings":
        adaptive_model = am.AdaptiveModel(
            language_model=lm,
            prediction_heads=[],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token", "per_sequence"],
            device=device)

    if processor:
        adaptive_model.connect_heads_with_processor(processor.tasks)

    return adaptive_model
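# Hypothetical usage of the converter above (model name taken from its own
# docstring): convert a public QA model and save it in FARM format.
model = convert_from_transformers("distilbert-base-uncased-distilled-squad",
                                  device="cpu",
                                  task_type="question_answering")
model.save(Path("saved_models/farm-distilbert-squad"))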
def convert_to_onnx(cls, model_name, output_path, task_type, convert_to_float16=False, quantize=False,
                    opset_version=11):
    """
    Convert a PyTorch model from transformers hub to an ONNX Model.

    :param model_name: transformers model name
    :type model_name: str
    :param output_path: Path to write the converted model to
    :type output_path: Path
    :param task_type: Type of task for the model. Available options: "embeddings", "question_answering",
                      "text_classification", "ner".
    :param convert_to_float16: By default, the model uses float32 precision. With half precision of float16,
                               inference should be faster on Nvidia GPUs with Tensor cores like T4 or V100.
                               On older GPUs, float32 might be more performant.
    :type convert_to_float16: bool
    :param quantize: convert floating point numbers to integers
    :type quantize: bool
    :param opset_version: ONNX opset version
    :type opset_version: int
    :return:
    """
    language_model_class = LanguageModel.get_language_model_class(model_name)
    if language_model_class not in ["Bert", "Roberta", "XLMRoberta"]:
        raise Exception(
            "The current ONNX conversion only supports 'BERT', 'RoBERTa', and 'XLMRoberta' models."
        )

    task_type_to_pipeline_map = {
        "question_answering": "question-answering",
        "embeddings": "feature-extraction",
        "ner": "ner"
    }

    convert(pipeline_name=task_type_to_pipeline_map[task_type],
            framework="pt",
            model=model_name,
            output=output_path / "model.onnx",
            opset=opset_version,
            use_external_format=(language_model_class == "XLMRoberta"))

    # save processor & model config files that are needed when loading the model with the FARM Inferencer
    processor = Processor.convert_from_transformers(
        tokenizer_name_or_path=model_name,
        task_type=task_type,
        max_seq_len=256,
        doc_stride=128,
        use_fast=True)
    processor.save(output_path)
    model = AdaptiveModel.convert_from_transformers(model_name, device="cpu", task_type=task_type)
    model.save(output_path)
    os.remove(output_path / "language_model.bin")  # remove the actual PyTorch model (only configs are required)

    onnx_model_config = {
        "task_type": task_type,
        "onnx_opset_version": opset_version,
        "language_model_class": language_model_class,
        "language": model.language_model.language
    }
    with open(output_path / "onnx_model_config.json", "w") as f:
        json.dump(onnx_model_config, f)

    if convert_to_float16:
        from onnxruntime_tools import optimizer
        config = AutoConfig.from_pretrained(model_name)
        optimized_model = optimizer.optimize_model(input=str(output_path / "model.onnx"),
                                                   model_type='bert',
                                                   num_heads=config.num_attention_heads,
                                                   hidden_size=config.hidden_size)
        optimized_model.convert_model_float32_to_float16()
        # overwrite the float32 model in place so quantization below acts on the optimized file
        optimized_model.save_model_to_file(str(output_path / "model.onnx"))

    if quantize:
        quantize_model(output_path / "model.onnx")
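# Sketch of a conversion call, assuming the function above is exposed as a
# classmethod on AdaptiveModel and that the Inferencer can load the resulting
# directory (it reads the onnx_model_config.json written above). The model name
# is a public hub model; treat the exact loading path as an assumption.
onnx_dir = Path("saved_models/onnx-bert-squad")
AdaptiveModel.convert_to_onnx(model_name="deepset/bert-base-cased-squad2",
                              output_path=onnx_dir,
                              task_type="question_answering")
inferencer = Inferencer.load(onnx_dir, task_type="question_answering")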
def doc_classification_multilabel():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 500
    lang_model = "bert-base-uncased"
    do_lower_case = True

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the Toxic Comments data automatically if it is not available.
    label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    metric = "acc"

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/toxic-comments"),
        label_list=label_list,
        label_column_name="label",
        metric=metric,
        quote_char='"',
        multilabel=True,
        train_filename="train.tsv",
        dev_filename="val.tsv",
        test_filename=None,
        dev_split=0,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-german-multi-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "You f*****g bastards"},
        {"text": "What a lovely world"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG",
                  "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path("samples/ner"),
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1"
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1}
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Albrecht Lehman ist eine Person"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    #print(result)
    #assert result[0]["predictions"][0]["context"] == "sagte"
    #assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
    result2 = model.inference_from_dicts(dicts=basic_texts, rest_api_schema=True)
    assert result == result2
def test_s3e_fit():
    # small test data
    language_model = Path("samples/s3e/tiny_fasttext_model")
    corpus_path = Path("samples/s3e/tiny_corpus.txt")
    save_dir = Path("testsave/fitted_s3e/")
    do_lower_case = False
    batch_size = 2
    use_gpu = False

    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=[],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=3,
                                                    pca_n_components=30,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu,
                            batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1,
                            s3e_stats=s3e_stats, num_processes=0)

    # Input
    basic_texts = [
        {"text": "a man is walking on the street."},
        {"text": "a woman is walking on the street."},
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    assert result[0]["context"] == ['a', 'man', 'is', 'walking', 'on', 'the', 'street', '.']
    # compare against expected values with a two-sided tolerance
    assert abs(result[0]["vec"][0] - 0.00527727306941057) < 1e-6
    assert abs(result[0]["vec"][-2] + 0.21376857861379997) < 1e-6
def __init__(self,
             document_store: BaseDocumentStore,
             query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base",
             passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
             single_model_path: Optional[Union[Path, str]] = None,
             model_version: Optional[str] = None,
             max_seq_len_query: int = 64,
             max_seq_len_passage: int = 256,
             top_k: int = 10,
             use_gpu: bool = True,
             batch_size: int = 16,
             embed_title: bool = True,
             use_fast_tokenizers: bool = True,
             infer_tokenizer_classes: bool = False,
             similarity_function: str = "dot_product",
             progress_bar: bool = True
             ):
    """
    Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
    The checkpoint format matches huggingface transformers' model format.

    **Example:**

            ```python
            |    # remote model from FAIR
            |    DensePassageRetriever(document_store=your_doc_store,
            |                          query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            |                          passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
            |    # or from local path
            |    DensePassageRetriever(document_store=your_doc_store,
            |                          query_embedding_model="model_directory/question-encoder",
            |                          passage_embedding_model="model_directory/context-encoder")
            ```

    :param document_store: An instance of DocumentStore from which to retrieve documents.
    :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                  one used by hugging-face transformers' modelhub models.
                                  Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
    :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                    one used by hugging-face transformers' modelhub models.
                                    Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
    :param single_model_path: Local path or remote name of a query and passage embedder in one single model. Those
                              models are typically trained within FARM.
                              Currently available remote names: TODO add FARM DPR model to HF modelhub
    :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :param max_seq_len_query: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down.
    :param max_seq_len_passage: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down.
    :param top_k: How many documents to return per query.
    :param use_gpu: Whether to use gpu or not
    :param batch_size: Number of questions or passages to encode at once
    :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
                        This is the approach used in the original paper and is likely to improve performance if your
                        titles contain meaningful information for retrieval (topic, entities etc.).
                        The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                        before writing them to the DocumentStore like this:
                        {"text": "my text", "meta": {"name": "my title"}}.
    :param use_fast_tokenizers: Whether to use fast Rust tokenizers
    :param infer_tokenizer_classes: Whether to infer tokenizer class from the model config / name.
                                    If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
    :param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training.
                                Options: `dot_product` (Default) or `cosine`
    :param progress_bar: Whether to show a tqdm progress bar or not.
                         Can be helpful to disable in production deployments to keep the logs clean.
    """

    self.document_store = document_store
    self.batch_size = batch_size
    self.progress_bar = progress_bar
    self.top_k = top_k

    if document_store is None:
        logger.warning("DensePassageRetriever initialized without a document store. "
                       "This is fine if you are performing DPR training. "
                       "Otherwise, please provide a document store in the constructor.")
    elif document_store.similarity != "dot_product":
        logger.warning(f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
                       "We recommend you use dot_product instead. "
                       "This can be set when initializing the DocumentStore")

    if use_gpu and torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")

    self.infer_tokenizer_classes = infer_tokenizer_classes
    tokenizers_default_classes = {
        "query": "DPRQuestionEncoderTokenizer",
        "passage": "DPRContextEncoderTokenizer"
    }
    if self.infer_tokenizer_classes:
        tokenizers_default_classes["query"] = None    # type: ignore
        tokenizers_default_classes["passage"] = None  # type: ignore

    # Init & Load Encoders
    if single_model_path is None:
        self.query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=query_embedding_model,
                                              revision=model_version,
                                              do_lower_case=True,
                                              use_fast=use_fast_tokenizers,
                                              tokenizer_class=tokenizers_default_classes["query"])
        self.query_encoder = LanguageModel.load(pretrained_model_name_or_path=query_embedding_model,
                                                revision=model_version,
                                                language_model_class="DPRQuestionEncoder")
        self.passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_embedding_model,
                                                revision=model_version,
                                                do_lower_case=True,
                                                use_fast=use_fast_tokenizers,
                                                tokenizer_class=tokenizers_default_classes["passage"])
        self.passage_encoder = LanguageModel.load(pretrained_model_name_or_path=passage_embedding_model,
                                                  revision=model_version,
                                                  language_model_class="DPRContextEncoder")

        self.processor = TextSimilarityProcessor(query_tokenizer=self.query_tokenizer,
                                                 passage_tokenizer=self.passage_tokenizer,
                                                 max_seq_len_passage=max_seq_len_passage,
                                                 max_seq_len_query=max_seq_len_query,
                                                 label_list=["hard_negative", "positive"],
                                                 metric="text_similarity_metric",
                                                 embed_title=embed_title,
                                                 num_hard_negatives=0,
                                                 num_positives=1)
        prediction_head = TextSimilarityHead(similarity_function=similarity_function)
        self.model = BiAdaptiveModel(
            language_model1=self.query_encoder,
            language_model2=self.passage_encoder,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm1_output_types=["per_sequence"],
            lm2_output_types=["per_sequence"],
            device=self.device,
        )
    else:
        self.processor = TextSimilarityProcessor.load_from_dir(single_model_path)
        self.processor.max_seq_len_passage = max_seq_len_passage
        self.processor.max_seq_len_query = max_seq_len_query
        self.processor.embed_title = embed_title
        self.processor.num_hard_negatives = 0
        self.processor.num_positives = 1  # during indexing of documents only one embedding is created
        self.model = BiAdaptiveModel.load(single_model_path, device=self.device)

    self.model.connect_heads_with_processor(self.processor.tasks, require_labels=False)
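# Hypothetical end-to-end usage of the retriever above. The document store
# import path and constructor arguments vary between haystack versions, so
# treat this as a sketch rather than a drop-in snippet.
from haystack.document_store.memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore(similarity="dot_product")
document_store.write_documents([{"text": "Berlin is the capital of Germany.",
                                 "meta": {"name": "Berlin"}}])
retriever = DensePassageRetriever(document_store=document_store)
document_store.update_embeddings(retriever)  # embed the stored passages with the passage encoder
results = retriever.retrieve(query="What is the capital of Germany?", top_k=1)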
def fit(language_model, corpus_path, save_dir, do_lower_case, batch_size=4, use_gpu=False):
    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=10,
                                                    pca_n_components=300,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu,
                            batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1,
                            s3e_stats=s3e_stats)

    # Input
    basic_texts = [
        {"text": "a man is walking on the street."},
        {"text": "a woman is walking on the street."},
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    print(result)
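# Example invocation of fit() with hypothetical paths; any embedding model that
# Tokenizer.load and LanguageModel.load understand would work here.
fit(language_model=Path("saved_models/tiny_fasttext_model"),
    corpus_path=Path("data/my_corpus.txt"),
    save_dir=Path("saved_models/fitted_s3e"),
    do_lower_case=True)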