def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification_glove")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    # load from a local path:
    lang_model = Path("../saved_models/glove-german-uncased")
    # or through s3:
    # lang_model = "glove-german-uncased"
    do_lower_case = True

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has a train.tsv and test.tsv dataset - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        dev_split=0,
        test_filename="test.tsv",
        train_filename="train.tsv",
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    # 4. Create an AdaptiveModel
    # a) which consists of an embedding model as a basis.
    # Word embedding models only convert words they have seen during training to embedding vectors.
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, len(label_list)],
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()
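# NOTE (illustrative addition): unlike the later tutorials, the glove example above stops after
# training. A minimal follow-up sketch mirroring the save/inference steps used in the BERT examples
# below; the save path is a hypothetical choice, not from the original script:
def doc_classification_glove_inference_sketch(model, processor):
    save_dir = Path("saved_models/glove-german-doc-tutorial")  # hypothetical path
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [{"text": "Martin Müller spielt Handball in Berlin"}]
    inferencer = Inferencer.load(save_dir)
    print(inferencer.inference_from_dicts(dicts=basic_texts))
    # Close the multiprocessing pool again - the garbage collector will not do this for you
    inferencer.close_multiprocessing_pool()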
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_text_pair_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    train_filename = "train.tsv"
    dev_filename = "dev_200k.tsv"

    # The source data can be found here: https://github.com/microsoft/MSMARCO-Passage-Ranking
    generate_data = False
    data_dir = Path("../data/msmarco_passage")
    predictions_raw_filename = "predictions_raw.txt"
    predictions_filename = "predictions.txt"
    train_source_filename = "triples.train.1m.tsv"
    qrels_filename = "qrels.dev.tsv"
    queries_filename = "queries.dev.tsv"
    passages_filename = "collection.tsv"
    top1000_filename = "top1000.dev"

    # 0. Preprocess and save MS MARCO data in a format that can be ingested by FARM models. Only needs to be done once!
    # The final format is a tsv file with 3 columns (text, text_b and label)
    if generate_data:
        reformat_msmarco_train(data_dir / train_source_filename,
                               data_dir / train_filename)
        reformat_msmarco_dev(data_dir / queries_filename,
                             data_dir / passages_filename,
                             data_dir / qrels_filename,
                             data_dir / top1000_filename,
                             data_dir / dev_filename)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset.
    # Evaluation during training will be performed on a slice of the train set.
    # We will be using the MS MARCO dev set as our final evaluation set.
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        train_filename=train_filename,
        test_filename=None,
        dev_split=0.001,
        max_seq_len=128,
        data_dir=data_dir,
        delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/passage_ranking_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    # Add your own text adapted to the dataset you provide
    model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=128)
    result = model.inference_from_file(data_dir / dev_filename)

    write_msmarco_results(result, save_dir / predictions_raw_filename)

    msmarco_evaluation(preds_file=save_dir / predictions_raw_filename,
                       dev_file=data_dir / dev_filename,
                       qrels_file=data_dir / qrels_filename,
                       output_file=save_dir / predictions_filename)

    model.close_multiprocessing_pool()
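# NOTE (illustrative addition): besides inference_from_file, the Inferencer also scores ad-hoc pairs
# passed as dicts with "text" and "text_b" keys - the same (text, text_b) format the
# TextPairClassificationProcessor reads from the tsv. A minimal sketch with made-up strings:
def text_pair_inference_sketch(save_dir):
    model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=32)
    pairs = [{"text": "how many episodes are in big little lies season 2",
              "text_b": "All seven episodes of the second season are written by Kelley."}]
    print(model.inference_from_dicts(dicts=pairs))
    model.close_multiprocessing_pool()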
def xlmr_qa_demo():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="run_xlmr_qa")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 3
    grad_acc_steps = 8
    n_epochs = 2
    evaluate_every = 200
    base_LM_model = "xlm-roberta-large"

    data_dir = Path("../data/squad20")
    train_filename = Path("train-v2.0.json")
    dev_filename = Path("dev-v2.0.json")

    save_dir = Path("../saved_models/xlmr-large-qa")

    inference_file = Path("../data/MLQA_V1/dev/dev-context-de-question-de.json")
    predictions_file = save_dir / "predictions.json"
    full_predictions_file = save_dir / "full_predictions.json"
    max_processes_for_inference = 8
    train = True
    inference = False

    if train:
        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)

        # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=384,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=None,
            data_dir=data_dir,
            dev_split=0.0
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
        # and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False, max_processes=1)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(base_LM_model, n_added_tokens=3)
        # b) and a prediction head on top that is suited for our task => Question Answering
        prediction_head = QuestionAnsweringHead()

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=3e-5,
            schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=device
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
        model = trainer.train(model)

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)

    if inference:
        model = Inferencer.load(save_dir, batch_size=32, gpu=True)
        full_result = model.inference_from_file(
            file=inference_file,
            max_processes=max_processes_for_inference,
        )

        for x in full_result:
            print(x)
            print()

        result = {r["id"]: r["preds"][0][0] for r in full_result}
        full_result = {r["id"]: r["preds"] for r in full_result}

        json.dump(result, open(predictions_file, "w"), indent=4, ensure_ascii=False)
        json.dump(full_result, open(full_predictions_file, "w"), indent=4, ensure_ascii=False)
def test_ner(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path("samples/ner"),
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1})

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Paris is a town in France."},
    ]

    model = Inferencer.load(
        model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english",
        num_processes=0,
        task_type="ner")
    # Labels aren't correctly inserted from transformers:
    # they are converted to LABEL_1 ... LABEL_N.
    # For the inference result to contain predictions we need them in IOB NER format
    model.processor.tasks["ner"]["label_list"][-1] = "B-LOC"
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["context"] == "Paris"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def run_experiment(args):
    validate_args(args)
    distributed = bool(args.general.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(
        use_cuda=args.general.cuda,
        local_rank=args.general.local_rank,
        fp16=args.general.fp16,
    )

    args.parameter.batch_size = int(args.parameter.batch_size //
                                    args.parameter.gradient_accumulation_steps)
    if n_gpu > 1:
        args.parameter.batch_size = args.parameter.batch_size * n_gpu

    set_all_seeds(args.general.seed)

    # Prepare Data
    tokenizer = BertTokenizer.from_pretrained(
        args.parameter.model, do_lower_case=args.parameter.lower_case)

    # processor = Processor.load(
    #     tokenizer=tokenizer,
    #     max_seq_len=args.parameter.max_seq_len,
    #     data_dir=args.general.data_dir,
    #     train_filename=args.task.train_filename,
    #     dev_filename=args.task.dev_filename,
    #     test_filename=args.task.test_filename,
    #     dev_split=args.task.dev_split,
    #     metrics=args.task.metrics,
    #     **args.task.toDict(),  # args is of type DotMap and needs conversion to std python dicts
    # )
    processor = Processor.load(
        tokenizer=tokenizer,
        max_seq_len=args.parameter.max_seq_len,
        data_dir=args.general.data_dir,
        **args.task.toDict(),  # args is of type DotMap and needs conversion to std python dicts
    )

    data_silo = DataSilo(
        processor=processor,
        batch_size=args.parameter.batch_size,
        distributed=distributed,
    )

    class_weights = None
    if args.parameter.balance_classes:
        task_names = list(processor.tasks.keys())
        if len(task_names) > 1:
            raise NotImplementedError(
                f"Balancing classes is currently not supported for multitask experiments. Got tasks: {task_names}"
            )
        class_weights = data_silo.calculate_class_weights(task_name=task_names[0])

    model = get_adaptive_model(
        lm_output_type=args.parameter.lm_output_type,
        prediction_heads=args.parameter.prediction_head,
        layer_dims=args.parameter.layer_dims,
        model=args.parameter.model,
        device=device,
        class_weights=class_weights,
        embeds_dropout_prob=args.parameter.embeds_dropout_prob,
    )

    # Init optimizer
    # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this?
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=args.parameter.learning_rate,
        warmup_proportion=args.parameter.warmup_proportion,
        loss_scale=args.general.loss_scale,
        fp16=args.general.fp16,
        n_batches=len(data_silo.loaders["train"]),
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        n_epochs=args.parameter.epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.parameter.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        fp16=args.general.fp16,
        local_rank=args.general.local_rank,
        warmup_linear=warmup_linear,
        evaluate_every=args.logging.eval_every,
        device=device,
    )

    model = trainer.train(model)

    model_name = (
        f"{model.language_model.name}-{model.language_model.language}-{args.task.name}"
    )
    processor.save(f"{args.general.output_dir}/{model_name}")
    model.save(f"{args.general.output_dir}/{model_name}")
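# NOTE (illustrative addition): run_experiment expects a DotMap-style args object, typically parsed
# from a JSON experiment config. A minimal driver sketch, assuming FARM's load_experiments config
# loader is available; the config path is a hypothetical example:
def run_all_experiments_sketch():
    from farm.experiment import load_experiments

    # Each entry in the JSON config yields one DotMap args object
    experiments = load_experiments(Path("experiments/text_classification/germEval18Coarse_config.json"))
    for args in experiments:
        run_experiment(args)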
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has a train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.2,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense",  # use the metric from our own metrics function instead of loss
        mode="max",
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has already been stored in the directory
    # defined with the EarlyStopping instance.
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training.
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
def train(
    self,
    data_dir: str,
    train_filename: str,
    dev_filename: Optional[str] = None,
    test_filename: Optional[str] = None,
    use_gpu: Optional[bool] = None,
    batch_size: int = 10,
    n_epochs: int = 2,
    learning_rate: float = 1e-5,
    max_seq_len: Optional[int] = None,
    warmup_proportion: float = 0.2,
    dev_split: float = 0,
    evaluate_every: int = 300,
    save_dir: Optional[str] = None,
    num_processes: Optional[int] = None,
    use_amp: Optional[str] = None,
):
    """
    Fine-tune a model on a QA dataset. Options:

    - Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data)
    - Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain
      (e.g. using your labels collected via the haystack annotation tool)

    :param data_dir: Path to directory containing your training data in SQuAD style
    :param train_filename: Filename of training data
    :param dev_filename: Filename of dev / eval data
    :param test_filename: Filename of test data
    :param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
                      that gets split off from training data for eval.
    :param use_gpu: Whether to use GPU (if available)
    :param batch_size: Number of samples the model receives in one batch for training
    :param n_epochs: Number of iterations on the whole training data set
    :param learning_rate: Learning rate of the optimizer
    :param max_seq_len: Maximum text length (in tokens). Everything longer gets cut down.
    :param warmup_proportion: Proportion of training steps until maximum learning rate is reached.
                              Until that point the LR is increasing linearly. After that it's decreasing again linearly.
                              Options for different schedules are available in FARM.
    :param evaluate_every: Evaluate the model every X steps on the hold-out eval dataset
    :param save_dir: Path to store the final model
    :param num_processes: The number of processes for `multiprocessing.Pool` during preprocessing.
                          Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set.
                          Set to None to use all CPU cores minus one.
    :param use_amp: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model.
                    Available options:
                    None (Don't use AMP)
                    "O0" (Normal FP32 training)
                    "O1" (Mixed Precision => Recommended)
                    "O2" (Almost FP16)
                    "O3" (Pure FP16).
                    See details on: https://nvidia.github.io/apex/amp.html
    :return: None
    """
    if dev_filename:
        dev_split = 0

    if num_processes is None:
        num_processes = multiprocessing.cpu_count() - 1 or 1

    set_all_seeds(seed=42)

    # For these variables, by default, we use the value set when initializing the FARMReader.
    # These can also be manually set when train() is called if you want a different value at train vs inference
    if use_gpu is None:
        use_gpu = self.use_gpu
    if max_seq_len is None:
        max_seq_len = self.max_seq_len

    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=use_amp)

    if not save_dir:
        save_dir = f"../../saved_models/{self.inferencer.model.language_model.name}"

    # 1. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=self.inferencer.processor.tokenizer,
        max_seq_len=max_seq_len,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        dev_split=dev_split,
        test_filename=test_filename,
        data_dir=Path(data_dir),
    )

    # 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False,
                         max_processes=num_processes)

    # 3. Create an optimizer and pass the already initialized model
    model, optimizer, lr_schedule = initialize_optimizer(
        model=self.inferencer.model,
        learning_rate=learning_rate,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": warmup_proportion},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp,
    )

    # 4. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        use_amp=use_amp,
        disable_tqdm=not self.progress_bar)

    # 5. Let it grow!
    self.inferencer.model = trainer.train()
    self.save(Path(save_dir))
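# NOTE (illustrative addition): a minimal usage sketch for the train() method above, assuming it
# belongs to haystack's FARMReader (as the docstring's annotation-tool reference suggests);
# the model name and file paths are illustrative:
def finetune_reader_sketch():
    from haystack.reader.farm import FARMReader

    reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2", use_gpu=True)
    reader.train(data_dir="data/my_domain",      # directory with SQuAD-style annotations
                 train_filename="answers.json",  # e.g. exported from the annotation tool
                 n_epochs=2,
                 save_dir="saved_models/my_reader")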
def train_from_scratch():
    args = parse_arguments()
    use_amp = "O2"  # using "O2" here allows roughly 30% larger batch_sizes and 45% speed up

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    # Only the main process should log here
    if args.local_rank in [-1, 0]:
        ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
        ml_logger.init_experiment(experiment_name="train_from_scratch", run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True, local_rank=args.local_rank, use_amp=use_amp)

    save_dir = Path("saved_models/train_from_scratch")
    data_dir = Path("data/test")

    # Option A) just using a single file
    # train_filename = "train.txt"

    # Option B) (recommended when using StreamingDataSilo):
    # split and shuffle that file to have random order within and across epochs
    randomize_and_split_file(data_dir / "train.txt", output_dir=Path("data/split_files"), docs_per_file=1000)
    train_filename = Path("data/split_files")

    dev_filename = "dev.txt"

    distributed = args.local_rank != -1
    max_seq_len = 128
    batch_size = 8  # if distributed, this is the per-GPU batch size
    grad_acc = 1
    learning_rate = 1e-4
    warmup_proportion = 0.05
    n_epochs = 2
    evaluate_every = 15000
    log_loss_every = 2
    checkpoint_every = 500
    checkpoint_root_dir = Path("checkpoints")
    checkpoints_to_keep = 4
    next_sent_pred_style = "bert-style"  # or "sentence"
    max_docs = None

    # Choose enough workers to queue sufficient batches during training.
    # The optimal number depends on your GPU speed, CPU speed and number of cores.
    # 16 works well on a 4x V100 machine with 16 cores (AWS: p3.8xlarge). For a single GPU you will need less.
    data_loader_workers = 1

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load("bert-base-uncased", do_lower_case=True)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=data_dir,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        next_sent_pred_style=next_sent_pred_style,
        max_docs=max_docs)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    # calculates a few descriptive statistics of our datasets
    # stream_data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=distributed)
    stream_data_silo = StreamingDataSilo(
        processor=processor,
        batch_size=batch_size,
        distributed=distributed,
        dataloader_workers=data_loader_workers)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)

    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    next_sentence_head = NextSentenceHead(num_labels=2, task_name="nextsentence")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": warmup_proportion},
        n_batches=len(stream_data_silo.get_data_loader("train")),
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=grad_acc,
        distributed=distributed,
        use_amp=use_amp,
        local_rank=args.local_rank)

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        log_loss_every=log_loss_every,
        device=device,
        grad_acc_steps=grad_acc,
        local_rank=args.local_rank,
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        checkpoints_to_keep=checkpoints_to_keep,
        use_amp=use_amp)

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)

    if args.local_rank != -1:
        torch.distributed.destroy_process_group()
def test_doc_classification(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 5
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1)

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
        {"text": "Franzosen verteidigen 2:1-Führung – Kritische Stimmen zu Schwedens Superstar"},
        {"text": "Neues Video von Designern macht im Netz die Runde"},
        {"text": "23-jähriger Brasilianer muss vier Spiele pausieren – Entscheidung kann noch angefochten werden"},
        {"text": "Aufständische verwendeten Chemikalie bei Gefechten im August."},
        {"text": "Bewährungs- und Geldstrafe für 26-Jährigen wegen ausländerfeindlicher Äußerung"},
        {"text": "ÖFB-Teamspieler nur sechs Minuten nach seinem Tor beim 1:1 gegen Sunderland verletzt ausgewechselt"},
        {"text": "Ein 31-jähriger Polizist soll einer 42-Jährigen den Knöchel gebrochen haben"},
        {"text": "18 Menschen verschleppt. Kabul – Nach einem Hubschrauber-Absturz im Norden Afghanistans haben Sicherheitskräfte am Mittwoch versucht"},
    ]

    # TODO enable loading here again after we have finished migration towards "processor.tasks"
    # inf = Inferencer.load(save_dir)
    inf = Inferencer(model=model, processor=processor)
    result = inf.run_inference(dicts=basic_texts)

    assert result[0]["predictions"][0]["label"] == "OTHER"
    assert abs(result[0]["predictions"][0]["probability"] - 0.7) <= 0.1

    loaded_processor = TextClassificationProcessor.load_from_dir(save_dir)
    inf2 = Inferencer(model=model, processor=loaded_processor)
    result_2 = inf2.run_inference(dicts=basic_texts)
    pprint(list(zip(result, result_2)))
    for r1, r2 in list(zip(result, result_2)):
        assert r1 == r2


# if __name__ == "__main__":
#     test_doc_classification()
def test_lm_finetuning_no_next_sentence(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=False
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1}
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    # LM embeddings and weight of decoder in head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight,
                 model.prediction_heads[0].decoder.weight))

    save_dir = "testsave/lm_finetuning_no_nsp"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary across runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
def test_ner_amp(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path("samples/ner"),
        train_filename=Path("train-sample.txt"),
        dev_filename=Path("dev-sample.txt"),
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "1980 kam der Crown von Toyota"},
    ]
    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def __init__(
    self,
    model,
    processor,
    task_type,
    batch_size=4,
    gpu=False,
    name=None,
    return_class_probs=False,
    extraction_strategy=None,
    extraction_layer=None,
    s3e_stats=None,
    num_processes=None,
    disable_tqdm=False,
    benchmarking=False,
    dummy_ph=False
):
    """
    Initializes Inferencer from an AdaptiveModel and a Processor instance.

    :param model: AdaptiveModel to run in inference mode
    :type model: AdaptiveModel
    :param processor: A dataset specific Processor object which will turn input (file or dict) into a Pytorch Dataset.
    :type processor: Processor
    :param task_type: Type of task the model should be used for. Currently supporting: "embeddings", "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param name: Name for the current Inferencer model, displayed in the REST API
    :type name: string
    :param return_class_probs: either return probability distribution over all labels or the prob of the associated label
    :type return_class_probs: bool
    :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                                (sentence vector), 'reduce_max' (sentence vector), 'per_token' (individual token vectors),
                                's3e' (sentence vector via S3E pooling, see https://arxiv.org/abs/2002.09620)
    :type extraction_strategy: str
    :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
    :type extraction_layer: int
    :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                      (only needed for task_type="embeddings" and extraction_strategy="s3e")
    :type s3e_stats: dict
    :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 1 (or 0) to disable
                          multiprocessing. Set to None to let Inferencer use all CPU cores minus one. If you want to
                          debug the Language Model, you might need to disable multiprocessing!
                          **Warning!** If you use multiprocessing you have to close the `multiprocessing.Pool` again!
                          To do so call :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are done
                          using this class. The garbage collector will not do this for you!
    :type num_processes: int
    :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
    :type disable_tqdm: bool
    :param dummy_ph: If True, methods of the prediction head will be replaced with a dummy method. This is used to
                     isolate lm run time from ph run time.
    :type dummy_ph: bool
    :param benchmarking: If True, a benchmarking object will be initialised within the class and certain parts of the
                         code will be timed for benchmarking. Should be kept False if not benchmarking since these
                         timing checkpoints require synchronization of the asynchronous Pytorch operations and may
                         slow down the model.
    :type benchmarking: bool
    :return: An instance of the Inferencer.
    """
    MLFlowLogger.disable()

    # For benchmarking
    if dummy_ph:
        model.bypass_ph()

    self.benchmarking = benchmarking
    if self.benchmarking:
        self.benchmarker = Benchmarker()

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)

    self.processor = processor
    self.model = model
    self.model.eval()
    self.batch_size = batch_size
    self.device = device
    self.language = self.model.get_language()
    self.task_type = task_type
    self.disable_tqdm = disable_tqdm
    self.problematic_sample_ids = set()

    if task_type == "embeddings":
        if not extraction_layer or not extraction_strategy:
            logger.warning(
                "Using task_type='embeddings', but couldn't find one of the args `extraction_layer` and `extraction_strategy`. "
                "Since FARM 0.4.2, you set both when initializing the Inferencer and then call inferencer.inference_from_dicts() instead of inferencer.extract_vectors()"
            )
        self.model.prediction_heads = torch.nn.ModuleList([])
        self.model.language_model.extraction_layer = extraction_layer
        self.model.language_model.extraction_strategy = extraction_strategy
        self.model.language_model.s3e_stats = s3e_stats

    # TODO add support for multiple prediction heads
    self.name = name if name is not None else f"anonymous-{self.task_type}"
    self.return_class_probs = return_class_probs

    model.connect_heads_with_processor(processor.tasks, require_labels=False)
    set_all_seeds(42)

    self._set_multiprocessing_pool(num_processes)
@classmethod
def load(
    cls,
    model_name_or_path,
    revision=None,
    batch_size=4,
    gpu=False,
    task_type=None,
    return_class_probs=False,
    strict=True,
    max_seq_len=256,
    doc_stride=128,
    extraction_layer=None,
    extraction_strategy=None,
    s3e_stats=None,
    num_processes=None,
    disable_tqdm=False,
    tokenizer_class=None,
    use_fast=True,
    tokenizer_args=None,
    multithreading_rust=True,
    dummy_ph=False,
    benchmarking=False,
):
    """
    Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

    1. specifying a public name from transformers' model hub (https://huggingface.co/models)
    2. or pointing to a local directory it is saved in.

    :param model_name_or_path: Local directory or public name of the model to load.
    :type model_name_or_path: str
    :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :type revision: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param task_type: Type of task the model should be used for. Currently supporting: "embeddings", "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in the PredictionHead
                   (see torch.nn.module.load_state_dict()). Set to `False` for backwards compatibility with PHs saved with older version of FARM.
    :type strict: bool
    :param max_seq_len: maximum length of one text sample
    :type max_seq_len: int
    :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
    :type doc_stride: int
    :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                                (sentence vector), 'reduce_max' (sentence vector), 'per_token' (individual token vectors)
    :type extraction_strategy: str
    :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
    :type extraction_layer: int
    :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                      (only needed for task_type="embeddings" and extraction_strategy="s3e")
    :type s3e_stats: dict
    :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                          multiprocessing. Set to None to let Inferencer use all CPU cores minus one. If you want to
                          debug the Language Model, you might need to disable multiprocessing!
                          **Warning!** If you use multiprocessing you have to close the `multiprocessing.Pool` again!
                          To do so call :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are done
                          using this class. The garbage collector will not do this for you!
    :type num_processes: int
    :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
    :type disable_tqdm: bool
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or use the Python one (False).
    :type use_fast: bool
    :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method.
                           See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed tokenizer
                           documentation on `Hugging Face Transformers <https://huggingface.co/transformers/>`_.
    :type tokenizer_args: dict
    :param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers.
                                Note: Enabling multithreading in Rust AND multiprocessing in python might cause deadlocks.
    :type multithreading_rust: bool
    :param dummy_ph: If True, methods of the prediction head will be replaced with a dummy method. This is used to
                     isolate lm run time from ph run time.
    :type dummy_ph: bool
    :param benchmarking: If True, a benchmarking object will be initialised within the class and certain parts of the
                         code will be timed for benchmarking. Should be kept False if not benchmarking since these
                         timing checkpoints require synchronization of the asynchronous Pytorch operations and may
                         slow down the model.
    :type benchmarking: bool
    :return: An instance of the Inferencer.
    """
    if tokenizer_args is None:
        tokenizer_args = {}

    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
    name = os.path.basename(model_name_or_path)

    # a) either from local dir
    if os.path.exists(model_name_or_path):
        model = BaseAdaptiveModel.load(load_dir=model_name_or_path, device=device, strict=strict)
        if task_type == "embeddings":
            processor = InferenceProcessor.load_from_dir(model_name_or_path)
        else:
            processor = Processor.load_from_dir(model_name_or_path)

    # b) or from remote transformers model hub
    else:
        if not task_type:
            raise ValueError(
                "Please specify the 'task_type' of the model you want to load from transformers. "
                "Valid options for arg `task_type`: "
                "'question_answering', 'embeddings', 'text_classification', 'ner'"
            )

        model = AdaptiveModel.convert_from_transformers(
            model_name_or_path,
            revision=revision,
            device=device,
            task_type=task_type)
        processor = Processor.convert_from_transformers(
            model_name_or_path,
            revision=revision,
            task_type=task_type,
            max_seq_len=max_seq_len,
            doc_stride=doc_stride,
            tokenizer_class=tokenizer_class,
            tokenizer_args=tokenizer_args,
            use_fast=use_fast)

    # override processor attributes loaded from config or HF with inferencer params
    processor.max_seq_len = max_seq_len
    processor.multithreading_rust = multithreading_rust
    if hasattr(processor, "doc_stride"):
        assert doc_stride < max_seq_len, \
            "doc_stride is longer than max_seq_len. This means that there will be gaps " \
            "as the passage windows slide, causing the model to skip over parts of the document. " \
            "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384)"
        processor.doc_stride = doc_stride

    return cls(
        model,
        processor,
        task_type=task_type,
        batch_size=batch_size,
        gpu=gpu,
        name=name,
        return_class_probs=return_class_probs,
        extraction_strategy=extraction_strategy,
        extraction_layer=extraction_layer,
        s3e_stats=s3e_stats,
        num_processes=num_processes,
        disable_tqdm=disable_tqdm,
        benchmarking=benchmarking,
        dummy_ph=dummy_ph)
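# NOTE (illustrative addition): a minimal end-to-end sketch for Inferencer.load(), including the
# mandatory pool cleanup from the docstring warning. The model name is a public hub model and the
# question/context strings are made up:
def qa_inference_sketch():
    inferencer = Inferencer.load("deepset/roberta-base-squad2",
                                 task_type="question_answering",
                                 batch_size=16,
                                 gpu=False)
    qa_input = [{"qas": ["Where is Berlin?"],
                 "context": "Berlin is the capital of Germany."}]
    print(inferencer.inference_from_dicts(dicts=qa_input))
    # Close the multiprocessing.Pool again - the garbage collector will not do this for you!
    inferencer.close_multiprocessing_pool()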
def perform_fine_tuning(current_info_need,
                        bert_model,
                        label_list,
                        num_epochs,
                        condition,
                        folds=10,
                        stratified=True,
                        learning_rate=2e-5,
                        batch_size=32,
                        embeds_dropout_prob=.1):

    ## Define evaluation metrics ##
    def evaluation_metrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="Other")
        f1infoneed = f1_score(y_true=labels, y_pred=preds, pos_label=current_info_need)
        recall_infoneed = recall_score(y_true=labels, y_pred=preds, pos_label=current_info_need)
        precision_infoneed = precision_score(y_true=labels, y_pred=preds, pos_label=current_info_need)
        recall_other = recall_score(y_true=labels, y_pred=preds, pos_label="Other")
        precision_other = precision_score(y_true=labels, y_pred=preds, pos_label="Other")
        recall_macro = recall_score(y_true=labels, y_pred=preds, average="macro")
        precision_macro = precision_score(y_true=labels, y_pred=preds, average="macro")
        recall_micro = recall_score(y_true=labels, y_pred=preds, average="micro")
        precision_micro = precision_score(y_true=labels, y_pred=preds, average="micro")
        recall_weighted = recall_score(y_true=labels, y_pred=preds, average="weighted")
        precision_weighted = precision_score(y_true=labels, y_pred=preds, average="weighted")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        f1weighted = f1_score(y_true=labels, y_pred=preds, average="weighted")
        return {
            "info_need": current_info_need,
            "model": bert_model,
            "num_epochs": num_epochs,
            "condition": condition,
            "acc": acc,
            "f1_other": f1other,
            "f1_infoneed": f1infoneed,
            "precision_infoneed": precision_infoneed,
            "recall_infoneed": recall_infoneed,
            "recall_other": recall_other,
            "precision_other": precision_other,
            "recall_macro": recall_macro,
            "precision_macro": precision_macro,
            "recall_micro": recall_micro,
            "precision_micro": precision_micro,
            "recall_weighted": recall_weighted,
            "precision_weighted": precision_weighted,
            "f1_weighted": f1weighted,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    metric = f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs'
    register_metrics(metric, evaluation_metrics)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    logger, ml_logger = init_logging()
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=bert_model, do_lower_case=False)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        train_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_train.csv",
        test_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_test.csv",
        data_dir="data/",
        label_list=label_list,
        metric=metric,
        text_column_name="utterance",
        label_column_name=level,  # NOTE: `level` is not defined in this function; it presumably comes from the enclosing module scope
        delimiter=";")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    silos = DataSiloForCrossVal.make(data_silo, n_splits=folds, sets=['train', 'test'])

    # The following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")

        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(bert_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout_prob,
            lm_output_types=["per_sequence"],
            device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=num_epochs,
            use_amp=None)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time.
        # Also create an EarlyStopping instance and pass it on to the trainer.
        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us afterwards to use the
        # n-folds best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_infoneed",  # use the metric from our own metrics function instead of loss
            mode="max",
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=silo_to_use,
            epochs=num_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=100,
            device=device,
            early_stopping=earlystopping,
            evaluator_test=False)

        # train it
        trainer.train()
        return trainer.model

    # For each fold, run the whole training with early stopping to get a model, then evaluate the model
    # on the test set of each fold.
    # Remember all the results for overall metrics over all predictions of all folds and for averaging.
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_info_need = -1
    language_model_name = bert_model
    if language_model_name.find("/") != -1:
        language_model_name = language_model_name.replace("/", "_")
    save_dir = Path(
        f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}")

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_info_need = result[0]["f1_infoneed"]
        if f1_info_need > bestf1_info_need:
            bestf1_info_need = f1_info_need
            bestfold = num_fold

        # empty cache to avoid memory leak and cuda OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    with open(
            f"classification_results/test/{current_info_need}-{language_model_name}-{condition}-{num_epochs}_epochs-{folds}-fold-cv.results.json",
            "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_other = f1_score(all_labels, all_preds, labels=label_list, pos_label="Other")
    xval_f1_info_need = f1_score(all_labels, all_preds, labels=label_list, pos_label=current_info_need)
    xval_f1_micro = f1_score(all_labels, all_preds, labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds, labels=label_list, average="macro")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    xval_overall_results = {
        "xval_f1_other": xval_f1_other,
        "xval_f1_infoneed": xval_f1_info_need,
        "xval_f1_micro": xval_f1_micro,
        "xval_f1_macro": xval_f1_macro,
        "xval_f1_mcc": xval_mcc
    }

    logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
    logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
    logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
    logger.info(f"XVAL F1 {current_info_need} {condition} {num_epochs} epochs: {xval_f1_info_need}")
    logger.info(f"XVAL MCC: {xval_mcc}")

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(
        f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info("TEST F1 MICRO: {}".format(result[0]["f1_micro"]))
    logger.info("TEST F1 MACRO: {}".format(result[0]["f1_macro"]))
    logger.info("TEST F1 OTHER: {}".format(result[0]["f1_other"]))
    logger.info("TEST F1 {0}: {1}".format(current_info_need, result[0]["f1_infoneed"]))
    logger.info("TEST MCC: {}".format(result[0]["mcc"]))

    test_set_results = {
        "test_f1_other": result[0]["f1_other"],
        "test_f1_infoneed": result[0]["f1_infoneed"],
        "test_f1_micro": result[0]["f1_micro"],
        "test_f1_macro": result[0]["f1_macro"],
        "test_f1_mcc": result[0]["mcc"]
    }
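# NOTE (illustrative addition): a minimal invocation sketch for perform_fine_tuning(); the info
# need, condition and model name are hypothetical placeholders that must match the
# data/<info_need>_<condition>_<num_epochs>_epochs_{train,test}.csv naming convention used above:
def run_fine_tuning_sketch():
    perform_fine_tuning(current_info_need="Preparation",
                        bert_model="bert-base-german-cased",
                        label_list=["Other", "Preparation"],
                        num_epochs=3,
                        condition="cooking",
                        folds=10)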
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    dev_split = 0.1
    dev_stratification = True
    max_processes = 1  # 128 is default
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has a train.tsv and test.tsv dataset - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        dev_split=dev_split,
        dev_stratification=dev_stratification,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    # and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        max_processes=max_processes,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()
def test_dpr_modules(caplog=None): if caplog: caplog.set_level(logging.CRITICAL) set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) # 1.Create question and passage tokenizers query_tokenizer = Tokenizer.load( pretrained_model_name_or_path= "facebook/dpr-question_encoder-single-nq-base", do_lower_case=True, use_fast=True) passage_tokenizer = Tokenizer.load( pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base", do_lower_case=True, use_fast=True) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, max_seq_len_query=256, max_seq_len_passage=256, label_list=["hard_negative", "positive"], metric="text_similarity_metric", data_dir="data/retriever", train_filename="nq-train.json", dev_filename="nq-dev.json", test_filename="nq-dev.json", embed_title=True, num_hard_negatives=1) question_language_model = LanguageModel.load( pretrained_model_name_or_path="bert-base-uncased", language_model_class="DPRQuestionEncoder", hidden_dropout_prob=0, attention_probs_dropout_prob=0) passage_language_model = LanguageModel.load( pretrained_model_name_or_path="bert-base-uncased", language_model_class="DPRContextEncoder", hidden_dropout_prob=0, attention_probs_dropout_prob=0) prediction_head = TextSimilarityHead(similarity_function="dot_product") model = BiAdaptiveModel( language_model1=question_language_model, language_model2=passage_language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.0, lm1_output_types=["per_sequence"], lm2_output_types=["per_sequence"], device=device, ) model.connect_heads_with_processor(processor.tasks) assert type(model) == BiAdaptiveModel assert type(processor) == TextSimilarityProcessor assert type(question_language_model) == DPRQuestionEncoder assert type(passage_language_model) == DPRContextEncoder # check embedding layer weights (abs() makes the tolerance two-sided, so deviations in either direction fail) assert abs(list(model.named_parameters())[0][1][ 0, 0].item() - -0.010200000368058681) < 0.0001 d = { 'query': 'big little lies season 2 how many episodes', 'passages': [{ 'title': 'Big Little Lies (TV series)', 'text': 'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley', 'label': 'positive', 'external_id': '18768923' }, { 'title': 'Little People, Big World', 'text': 'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. 
Conversely, other reviews have claimed that the show has a voyeuristic bend', 'label': 'hard_negative', 'external_id': '7459116' }, { 'title': 'Cormac McCarthy', 'text': 'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young', 'label': 'negative', 'passage_id': '2145653' }] } dataset, tensor_names, _ = processor.dataset_from_dicts( dicts=[d], return_baskets=False) features = { key: val.unsqueeze(0).to(device) for key, val in zip(tensor_names, dataset[0]) } # test features assert torch.all( torch.eq( features["query_input_ids"][0][:10].cpu(), torch.tensor( [101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102]))) assert torch.all( torch.eq( features["passage_input_ids"][0][0][:10].cpu(), torch.tensor( [101, 2502, 2210, 3658, 1006, 2694, 2186, 1007, 102, 2186]))) assert len(features["query_segment_ids"][0].nonzero()) == 0 assert len(features["passage_segment_ids"][0].nonzero()) == 0 assert torch.all( torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(), torch.tensor(list(range(10))))) assert torch.all( torch.eq( features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(), torch.tensor(list(range(127))))) assert torch.all( torch.eq( features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(), torch.tensor(list(range(143))))) # test model encodings against reference values (torch.abs() makes the tolerance two-sided) query_vector = model.language_model1(**features)[0] passage_vector = model.language_model2(**features)[0] assert torch.all( torch.le( torch.abs(query_vector[0, :10].cpu() - torch.tensor([ -0.2135, -0.4748, 0.0501, -0.0430, -0.1747, -0.0441, 0.5638, 0.1405, 0.2285, 0.0893 ])), torch.ones((1, 10)) * 0.0001)) assert torch.all( torch.le( torch.abs(passage_vector[0, :10].cpu() - torch.tensor([ 0.0557, -0.6836, -0.3645, -0.5566, 0.2034, -0.3656, 0.2969, -0.0555, 0.3405, -0.8691 ])), torch.ones((1, 10)) * 0.0001)) assert torch.all( torch.le( torch.abs(passage_vector[1, :10].cpu() - torch.tensor([ -0.2006, -1.5002, -0.1897, -0.3421, -0.0405, -0.0471, -0.0306, 0.1156, 0.3350, -0.3412 ])), torch.ones((1, 10)) * 0.0001)) # test logits and loss embeddings = model(**features) query_emb, passage_emb = embeddings[0] assert torch.all(torch.eq(query_emb.cpu(), query_vector.cpu())) assert torch.all(torch.eq(passage_emb.cpu(), passage_vector.cpu())) loss = model.logits_to_loss_per_head(embeddings, **features) similarity_scores = model.prediction_heads[0]._embeddings_to_scores( query_emb, passage_emb).cpu() assert torch.all( torch.le( torch.abs(similarity_scores - torch.tensor([[-1.8311e-03, -6.3016e+00]])), torch.ones((1, 2)) * 0.0001)) assert abs(loss[0].item() - 0.0018) <= 0.0001
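Conceptually, the `dot_product` similarity used by `TextSimilarityHead` reduces to a matrix product between query and passage embeddings, and training treats the resulting scores as logits over the in-batch passages. A small self-contained sketch of that idea (the head's actual implementation may differ in details):

# Conceptual sketch of dot-product scoring between query and passage embeddings.
import torch

query_emb = torch.randn(1, 768)    # one query vector
passage_emb = torch.randn(2, 768)  # e.g. one positive and one hard-negative passage

scores = torch.matmul(query_emb, passage_emb.T)  # shape (1, 2): one score per passage
log_probs = torch.log_softmax(scores, dim=1)     # loss is then the NLL of the positive passage
print(scores, log_probs)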
def test_ner(caplog): caplog.set_level(logging.CRITICAL) set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=False) n_epochs = 5 batch_size = 2 evaluate_every = 1 lang_model = "distilbert-base-german-cased" tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) ner_labels = [ "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH" ] processor = NERProcessor(tokenizer=tokenizer, max_seq_len=8, data_dir=Path("samples/ner"), train_filename="train-sample.txt", dev_filename="dev-sample.txt", test_filename=None, delimiter=" ", label_list=ner_labels, metric="seq_f1") data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) language_model = LanguageModel.load(lang_model) prediction_head = TokenClassificationHead(num_labels=13) model = AdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_token"], device=device, ) model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=2e-5, #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs, # schedule must span all training epochs, not just one device=device, schedule_opts={ 'name': 'LinearWarmup', 'warmup_proportion': 0.1 }) trainer = Trainer( model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device, ) save_dir = Path("testsave/ner") model = trainer.train() model.save(save_dir) processor.save(save_dir) basic_texts = [ { "text": "Albrecht Lehman ist eine Person" }, ] model = Inferencer.load(save_dir) result = model.inference_from_dicts(dicts=basic_texts, max_processes=1) #print(result) #assert result[0]["predictions"][0]["context"] == "sagte" #assert isinstance(result[0]["predictions"][0]["probability"], np.float32) result2 = model.inference_from_dicts(dicts=basic_texts, rest_api_schema=True) assert result == result2
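The commented-out assertions hint at the shape of the inference output: each entry in `predictions` carries the matched span text (`context`), a `label`, and a `probability`. A sketch of consuming it, assuming that structure holds for the `result` produced above:

# Sketch: iterating over NER predictions (keys taken from the commented assertions).
for doc in result:
    for pred in doc["predictions"]:
        print(pred.get("context"), pred.get("label"), pred.get("probability"))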
def train_evaluation_single(seed=42): ########################## ########## Settings ########################## set_all_seeds(seed=seed) device, n_gpu = initialize_device_settings(use_cuda=True) # GPU utilization on 4x V100 # 40*4, 14.3/16GB on master, 12.6/16 on others batch_size = 40 * 4 n_epochs = 2 evaluate_every = 2000000 # disabling dev eval lang_model = "roberta-base" do_lower_case = False # roberta is a cased model train_filename = "train-v2.0.json" dev_filename = "dev-v2.0.json" # Load model and train tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) processor = SquadProcessor( tokenizer=tokenizer, max_seq_len=256, label_list=["start_token", "end_token"], metric="squad", train_filename=train_filename, dev_filename=dev_filename, test_filename=None, data_dir=Path("testsave/data/squad20"), ) data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False) language_model = LanguageModel.load(lang_model) prediction_head = QuestionAnsweringHead(n_best=5) model = AdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_token"], device=device, ) model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=3e-5, schedule_opts={ "name": "LinearWarmup", "warmup_proportion": 0.2 }, n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs, device=device) trainer = Trainer( model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device, ) starttime = time() trainer.train() elapsed = time() - starttime save_dir = Path("testsave/roberta-qa-dev") model.save(save_dir) processor.save(save_dir) # Create Evaluator evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"), tasks=data_silo.processor.tasks, device=device) results = evaluator.eval(model) f1_score = results[0]["f1"] * 100 em_score = results[0]["EM"] * 100 tnacc = results[0]["top_n_accuracy"] * 100 print(results) print(elapsed) gold_f1 = 82.155 gold_EM = 77.714 gold_tnrecall = 97.3721 gold_elapsed = 1135 # referenced by the speed assertion below np.testing.assert_allclose( f1_score, gold_f1, rtol=0.01, err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}") np.testing.assert_allclose( em_score, gold_EM, rtol=0.01, err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}") np.testing.assert_allclose( tnacc, gold_tnrecall, rtol=0.01, err_msg= f"FARM Training changed for top 5 accuracy by: {tnacc - gold_tnrecall}") np.testing.assert_allclose( elapsed, gold_elapsed, rtol=0.1, err_msg= f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds" )
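The assertions above use relative tolerances: `np.testing.assert_allclose(actual, desired, rtol=r)` passes when `|actual - desired| <= r * |desired|` (its `atol` defaults to 0). A quick self-contained check of that behaviour:

import numpy as np

np.testing.assert_allclose(82.0, 82.155, rtol=0.01)      # passes: within 1% of the gold value
try:
    np.testing.assert_allclose(80.0, 82.155, rtol=0.01)  # fails: off by ~2.6%
except AssertionError as e:
    print("regression detected:", e)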
def test_doc_regression(data_dir_path, text_column_name, caplog=None): if caplog: caplog.set_level(logging.CRITICAL) set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=False) n_epochs = 1 batch_size = 1 evaluate_every = 2 lang_model = "bert-base-cased" tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) rp_params = dict(tokenizer=tokenizer, max_seq_len=8, data_dir=Path(data_dir_path), train_filename="train-sample.tsv", dev_filename="test-sample.tsv", test_filename=None, label_column_name="label") if text_column_name is not None: rp_params["text_column_name"] = text_column_name processor = RegressionProcessor(**rp_params) data_silo = DataSilo(processor=processor, batch_size=batch_size) language_model = LanguageModel.load(lang_model) prediction_head = RegressionHead() model = AdaptiveModel(language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_sequence_continuous"], device=device) model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=2e-5, #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, n_batches=len(data_silo.loaders["train"]), n_epochs=1, device=device, schedule_opts={ 'name': 'CosineWarmup', 'warmup_proportion': 0.1 }) trainer = Trainer(model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device) trainer.train() save_dir = Path("testsave/doc_regr") model.save(save_dir) processor.save(save_dir) del model del processor del optimizer del data_silo del trainer basic_texts = [ { "text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand." }, { "text": "it just did not fit right. The top is very thin showing everything." }, ] model = Inferencer.load(save_dir, num_processes=0) result = model.inference_from_dicts(dicts=basic_texts) assert isinstance(result[0]["predictions"][0]["pred"], np.float32) del model
def test_evaluation(): ########################## ########## Settings ########################## lang_model = "deepset/roberta-base-squad2" do_lower_case = False test_assertions = True data_dir = Path("testsave/data/squad20") evaluation_filename = "dev-v2.0.json" device, n_gpu = initialize_device_settings(use_cuda=True) # loading models and evals model = AdaptiveModel.convert_from_transformers( lang_model, device=device, task_type="question_answering") model.prediction_heads[0].no_ans_boost = 0 model.prediction_heads[0].n_best = 1 tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) processor = SquadProcessor( tokenizer=tokenizer, max_seq_len=256, label_list=["start_token", "end_token"], metric="squad", train_filename=None, dev_filename=None, dev_split=0, test_filename=evaluation_filename, data_dir=data_dir, doc_stride=128, ) starttime = time() data_silo = DataSilo(processor=processor, batch_size=40 * 4) model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) model, _ = optimize_model(model=model, device=device, local_rank=-1, optimizer=None, distributed=False, use_amp=None) evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"), tasks=data_silo.processor.tasks, device=device) # 1. Test FARM internal evaluation results = evaluator.eval(model) f1_score = results[0]["f1"] * 100 em_score = results[0]["EM"] * 100 tnacc = results[0]["top_n_accuracy"] * 100 elapsed = time() - starttime print(results) print(elapsed) gold_EM = 77.7478 gold_f1 = 82.1557 gold_tnacc = 84.0646 # top 1 recall gold_elapsed = 40 # 4x V100 if test_assertions: np.testing.assert_allclose( em_score, gold_EM, rtol=0.001, err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}") np.testing.assert_allclose( f1_score, gold_f1, rtol=0.001, err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}") np.testing.assert_allclose( tnacc, gold_tnacc, rtol=0.001, err_msg= f"FARM Eval changed for top 1 accuracy by: {tnacc - gold_tnacc}") np.testing.assert_allclose( elapsed, gold_elapsed, rtol=0.1, err_msg= f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds" ) # 2. 
Test FARM predictions with outside eval script starttime = time() model = Inferencer(model=model, processor=processor, task_type="question_answering", batch_size=40 * 4, gpu=device.type == "cuda") filename = data_dir / evaluation_filename result = model.inference_from_file(file=filename, return_json=False, multiprocessing_chunksize=80) results_squad = [x.to_squad_eval() for x in result] elapsed = time() - starttime os.makedirs("testsave", exist_ok=True) write_squad_predictions(predictions=results_squad, predictions_filename=filename, out_filename="testsave/predictions.json") script_params = { "data_file": filename, "pred_file": "testsave/predictions.json", "na_prob_thresh": 1, "na_prob_file": False, "out_file": False } results_official = squad_evaluation.main(OPTS=DotMap(script_params)) f1_score = results_official["f1"] em_score = results_official["exact"] gold_EM = 78.4890 gold_f1 = 81.7104 gold_elapsed = 27 # 4x V100 print(elapsed) if test_assertions: np.testing.assert_allclose( em_score, gold_EM, rtol=0.001, err_msg= f"Eval with official script changed for EM by: {em_score - gold_EM}" ) np.testing.assert_allclose( f1_score, gold_f1, rtol=0.001, err_msg= f"Eval with official script changed for f1 score by: {f1_score - gold_f1}" ) np.testing.assert_allclose( elapsed, gold_elapsed, rtol=0.1, err_msg= f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds" )
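The official SQuAD v2 evaluation script consumes a flat JSON mapping from question id to predicted answer string, with the empty string marking a no-answer prediction; `write_squad_predictions` produces this layout from the FARM results. A hand-written equivalent with example ids, assuming that format:

import json

predictions = {
    "56ddde6b9a695914005b9628": "Normandy",  # answerable question
    "5ad39d53604f3c001a3fe8d3": "",          # model predicts no answer
}
with open("testsave/predictions.json", "w") as f:
    json.dump(predictions, f)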
def __init__(self, model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True): """ :param model_name_or_path: Directory of a saved model or the name of a public model e.g. 'bert-base-cased', 'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'. See https://huggingface.co/models for a full list of available models. :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. :param context_window_size: The size, in characters, of the window around the answer span that is used when displaying the context around the answer. :param batch_size: Number of samples the model receives in one batch for inference. Memory consumption is much lower in inference mode. Recommendation: Increase the batch size to a value so only a single batch is used. :param use_gpu: Whether to use GPU (if available) :param no_ans_boost: How much the no_answer logit is boosted/increased. If set to 0 (default), the no_answer logit is not changed. If a negative number, there is a lower chance of "no_answer" being predicted. If a positive number, there is an increased chance of "no_answer". :param return_no_answer: Whether to include no_answer predictions in the results. :param top_k: The maximum number of answers to return :param top_k_per_candidate: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text). Note that this is not the number of "final answers" you will receive (see `top_k` in FARMReader.predict() or Finder.get_answers() for that) and that FARM includes no_answer in the sorted list of predictions. :param top_k_per_sample: How many answers to extract from each small text passage that the model can process at once (one "candidate doc" is usually split into many smaller "passages"). You usually want a very small value here, as it slows down inference and you don't gain much quality by having multiple answers from one passage. Note that this is not the number of "final answers" you will receive (see `top_k` in FARMReader.predict() or Finder.get_answers() for that) and that FARM includes no_answer in the sorted list of predictions. :param num_processes: The number of processes for `multiprocessing.Pool`. Set to value of 0 to disable multiprocessing. Set to None to let Inferencer determine optimum number. If you want to debug the Language Model, you might need to disable multiprocessing! :param max_seq_len: Max sequence length of one input text for the model :param doc_stride: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``) :param progress_bar: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. :param duplicate_filtering: Answers are filtered based on their position. Both start and end positions of the answers are considered. The higher the value, the farther apart two answers may be and still be filtered out as duplicates. 0 corresponds to exact duplicates; -1 turns off duplicate removal. :param use_confidence_scores: Whether to return answer scores as confidence values scaled to [0, 1] instead of the model's raw prediction scores. 
""" # save init parameters to enable export of component config as YAML self.set_config(model_name_or_path=model_name_or_path, model_version=model_version, context_window_size=context_window_size, batch_size=batch_size, use_gpu=use_gpu, no_ans_boost=no_ans_boost, return_no_answer=return_no_answer, top_k=top_k, top_k_per_candidate=top_k_per_candidate, top_k_per_sample=top_k_per_sample, num_processes=num_processes, max_seq_len=max_seq_len, doc_stride=doc_stride, progress_bar=progress_bar, duplicate_filtering=duplicate_filtering, use_confidence_scores=use_confidence_scores) self.return_no_answers = return_no_answer self.top_k = top_k self.top_k_per_candidate = top_k_per_candidate self.inferencer = QAInferencer.load(model_name_or_path, batch_size=batch_size, gpu=use_gpu, task_type="question_answering", max_seq_len=max_seq_len, doc_stride=doc_stride, num_processes=num_processes, revision=model_version, disable_tqdm=not progress_bar, strict=False) self.inferencer.model.prediction_heads[ 0].context_window_size = context_window_size self.inferencer.model.prediction_heads[0].no_ans_boost = no_ans_boost self.inferencer.model.prediction_heads[ 0].n_best = top_k_per_candidate + 1 # including possible no_answer try: self.inferencer.model.prediction_heads[ 0].n_best_per_sample = top_k_per_sample except: logger.warning( "Could not set `top_k_per_sample` in FARM. Please update FARM version." ) try: self.inferencer.model.prediction_heads[ 0].duplicate_filtering = duplicate_filtering except: logger.warning( "Could not set `duplicate_filtering` in FARM. Please update FARM version." ) self.max_seq_len = max_seq_len self.use_gpu = use_gpu self.progress_bar = progress_bar self.device, _ = initialize_device_settings(use_cuda=self.use_gpu) self.use_confidence_scores = use_confidence_scores
def test_doc_classification(caplog=None): if caplog: caplog.set_level(logging.CRITICAL) set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) n_epochs = 1 batch_size = 1 evaluate_every = 2 lang_model = "distilbert-base-german-cased" tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) processor = TextClassificationProcessor( tokenizer=tokenizer, max_seq_len=8, data_dir=Path("samples/doc_class"), train_filename=Path("train-sample.tsv"), label_list=["OTHER", "OFFENSE"], metric="f1_macro", dev_filename="test-sample.tsv", test_filename=None, dev_split=0.0, label_column_name="coarse_label") data_silo = DataSilo(processor=processor, batch_size=batch_size) language_model = DistilBert.load(lang_model) prediction_head = TextClassificationHead() model = AdaptiveModel(language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_sequence"], device=device) model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=2e-5, n_batches=len(data_silo.loaders["train"]), n_epochs=1, device=device, schedule_opts=None) trainer = Trainer(optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device) model = trainer.train(model) save_dir = Path("testsave/doc_class") model.save(save_dir) processor.save(save_dir) basic_texts = [{ "text": "Malte liebt Berlin." }, { "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei." }] inf = Inferencer.load(save_dir, batch_size=2) result = inf.inference_from_dicts(dicts=basic_texts) assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="SQuAD", run_name="qa_albert") ######################### ######## Settings ######################## set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=False) batch_size = 32 n_epochs = 2 evaluate_every = 1 base_LM_model = "albert" train_filename = "train-v2.0.json" dev_filename = "dev-v2.0.json" save_dir = "../saved_models/qa_medium_albert" inference_file = "../data/squad20/subsets/5ad3ff1b604f3c001a3ffc74.json" predictions_file = save_dir + "/predictions.json" full_predictions_file = save_dir + "/full_predictions.json" inference_multiprocessing = False train = False inference = True if train:
def text_pair_classification(): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO) ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_text_pair_classification") ########################## ########## Settings ########################## set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) n_epochs = 2 batch_size = 64 evaluate_every = 500 lang_model = "bert-base-cased" label_list = ["0", "1"] # 1.Create a tokenizer tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset. # The TextPairClassificationProcessor expects a tsv with columns called "text", "text_b" and "label" processor = TextPairClassificationProcessor( tokenizer=tokenizer, label_list=label_list, metric="f1_macro", max_seq_len=128, dev_filename="dev.tsv", test_filename=None, data_dir=Path("../data/asnq_binary"), delimiter="\t") # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size) # 4. Create an AdaptiveModel # a) which consists of a pretrained language model as a basis language_model = LanguageModel.load(lang_model) # b) and a prediction head on top that is suited for our task prediction_head = TextClassificationHead( num_labels=len(label_list), class_weights=data_silo.calculate_class_weights( task_name="text_classification")) model = AdaptiveModel(language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_sequence_continuous"], device=device) # 5. Create an optimizer model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=5e-6, device=device, n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs) # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time trainer = Trainer(model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device) # 7. Let it grow trainer.train() # 8. Hooray! You have a model. Store it: save_dir = Path("saved_models/text_pair_classification_model") model.save(save_dir) processor.save(save_dir) # 9. Load it & harvest your fruits (Inference) # Add your own text adapted to the dataset you provide basic_texts = [ { "text": "how many times have real madrid won the champions league in a row", "text_b": "They have also won the competition the most times in a row, winning it five times from 1956 to 1960" }, { "text": "how many seasons of the blacklist are there on netflix", "text_b": "Retrieved March 27 , 2018 ." }, ] model = Inferencer.load(save_dir) result = model.inference_from_dicts(dicts=basic_texts) print(result)
def test_qa(caplog=None): if caplog: caplog.set_level(logging.CRITICAL) set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=False) batch_size = 2 n_epochs = 1 evaluate_every = 4 base_LM_model = "distilbert-base-uncased" tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model, do_lower_case=True) label_list = ["start_token", "end_token"] processor = SquadProcessor(tokenizer=tokenizer, max_seq_len=20, doc_stride=10, max_query_length=6, train_filename="train-sample.json", dev_filename="dev-sample.json", test_filename=None, data_dir=Path("samples/qa"), label_list=label_list, metric="squad") data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) language_model = LanguageModel.load(base_LM_model) prediction_head = QuestionAnsweringHead() model = AdaptiveModel( language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_token"], device=device, ) model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=2e-5, #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs, device=device) trainer = Trainer(model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device) trainer.train() save_dir = Path("testsave/qa") model.save(save_dir) processor.save(save_dir) inferencer = Inferencer.load(save_dir, batch_size=2, gpu=False, num_processes=0) qa_format_1 = [{ "questions": ["Who counted the game among the best ever made?"], "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created." }] qa_format_2 = [{ "qas": ["Who counted the game among the best ever made?"], "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.", }] result1 = inferencer.inference_from_dicts(dicts=qa_format_1) result2 = inferencer.inference_from_dicts(dicts=qa_format_2) assert result1 == result2
def train( self, data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_file_name: Optional[str] = None, use_gpu: Optional[bool] = None, batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: Optional[float] = 0.1, evaluate_every: int = 300, save_dir: Optional[str] = None, ): """ Fine-tune a model on a QA dataset. Options: - Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data) - Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain (e.g. using your labels collected via the haystack annotation tool) :param data_dir: Path to directory containing your training data in SQuAD style :param train_filename: filename of training data :param dev_filename: filename of dev / eval data :param test_file_name: filename of test data :param dev_split: Instead of specifying a dev_filename you can also specify a ratio (e.g. 0.1) here that gets split off from the training data for evaluation. :param use_gpu: Whether to use GPU (if available) :param batch_size: Number of samples the model receives in one batch for training :param n_epochs: number of iterations on the whole training data set :param learning_rate: learning rate of the optimizer :param max_seq_len: maximum text length (in tokens). Everything longer gets cut down. :param warmup_proportion: Proportion of training steps until maximum learning rate is reached. Until that point LR is increasing linearly. After that it's decreasing again linearly. Options for different schedules are available in FARM. :param evaluate_every: Evaluate the model every X steps on the hold-out eval dataset :param save_dir: Path to store the final model :return: None """ if dev_filename: dev_split = None set_all_seeds(seed=42) # For these variables, by default, we use the value set when initializing the FARMReader. # These can also be manually set when train() is called if you want a different value at train vs inference if use_gpu is None: use_gpu = self.use_gpu if max_seq_len is None: max_seq_len = self.max_seq_len device, n_gpu = initialize_device_settings(use_cuda=use_gpu) if not save_dir: save_dir = f"../../saved_models/{self.inferencer.model.language_model.name}" # 1. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset label_list = ["start_token", "end_token"] metric = "squad" processor = SquadProcessor( tokenizer=self.inferencer.processor.tokenizer, max_seq_len=max_seq_len, label_list=label_list, metric=metric, train_filename=train_filename, dev_filename=dev_filename, dev_split=dev_split, test_filename=test_file_name, data_dir=Path(data_dir), ) # 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them # and calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False) # 3. Create an optimizer and pass the already initialized model model, optimizer, lr_schedule = initialize_optimizer( model=self.inferencer.model, learning_rate=learning_rate, schedule_opts={ "name": "LinearWarmup", "warmup_proportion": warmup_proportion }, n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs, device=device) # 4. 
Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time trainer = Trainer( model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device, ) # 5. Let it grow! self.inferencer.model = trainer.train() self.save(Path(save_dir))
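A minimal call of the `train()` method above, fine-tuning the reader on SQuAD-style annotations (all file names and paths are placeholders):

# Sketch: fine-tuning an instantiated reader on custom SQuAD-style data.
reader.train(
    data_dir="data/my_squad_data",
    train_filename="answers.json",
    dev_split=0.1,       # hold out 10% of the training data for eval
    n_epochs=2,
    batch_size=10,
    learning_rate=1e-5,
    save_dir="my_model",
)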
def train_from_scratch(): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) ml_logger = MLFlowLogger(tracking_uri="") ml_logger.init_experiment(experiment_name="from_scratch", run_name="debug") ######################### ######## Settings ######################## set_all_seeds(seed=39) device, n_gpu = initialize_device_settings(use_cuda=True) evaluate_every = 5000 vocab_size = 30522 # dev_filename = None save_dir = Path("saved_models/train_from_scratch") n_epochs = 10 learning_rate = 1e-4 warmup_proportion = 0.05 batch_size = 16 # (probably only possible via gradient accumulation steps) max_seq_len = 64 # 1.Create a tokenizer tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset processor = BertStyleLMProcessor( data_dir=Path("data/lm_finetune_nips"), tokenizer=tokenizer, max_seq_len=max_seq_len, train_filename="train.txt", dev_split=2000 / 8_000_000, dev_filename=None, test_filename=None, ) # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and # calculates a few descriptive statistics of our datasets data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False) # 4. Create an AdaptiveModel # a) which consists of a freshly initialized (not pretrained) language model as a basis language_model = LanguageModel.from_scratch("bert", vocab_size) # b) and *two* prediction heads on top that are suited for our task => Language Model pretraining lm_prediction_head = BertLMHead(768, vocab_size) next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence") model = AdaptiveModel( language_model=language_model, prediction_heads=[lm_prediction_head, next_sentence_head], embeds_dropout_prob=0.1, lm_output_types=["per_token", "per_sequence"], device=device, ) # 5. Create an optimizer model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=learning_rate, schedule_opts={ "name": "LinearWarmup", "warmup_proportion": warmup_proportion }, n_batches=len(data_silo.loaders["train"]), n_epochs=n_epochs, device=device, grad_acc_steps=8, ) # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time trainer = Trainer.create_or_load_checkpoint( model=model, optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, lr_schedule=lr_schedule, evaluate_every=evaluate_every, device=device, grad_acc_steps=8, checkpoint_root_dir=Path( "saved_models/train_from_scratch/checkpoints"), ) # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai trainer.train() # 8. Hooray! You have a model. Store it: model.save(save_dir) processor.save(save_dir)
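With `grad_acc_steps=8`, gradients are accumulated over eight batches before each optimizer step, so the effective batch size is `batch_size * grad_acc_steps` (here 16 * 8 = 128). The underlying pattern, sketched with generic PyTorch training-loop names (`model`, `data_loader`, `optimizer`, and `lr_schedule` are placeholders, not FARM internals):

# Sketch of gradient accumulation: an optimizer step happens every grad_acc_steps batches.
grad_acc_steps = 8
for step, batch in enumerate(data_loader):
    loss = model(**batch)
    (loss / grad_acc_steps).backward()  # scale so the accumulated gradient matches one big batch
    if (step + 1) % grad_acc_steps == 0:
        optimizer.step()
        lr_schedule.step()
        optimizer.zero_grad()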
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO) ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_regression") ########################## ########## Settings ########################## set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) n_epochs = 5 batch_size = 32 evaluate_every = 30 lang_model = "bert-base-cased" # 1.Create a tokenizer tokenizer = BertTokenizer.from_pretrained( pretrained_model_name_or_path=lang_model, do_lower_case=False) # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset # We do not have a sample dataset for regression yet, add your own dataset to run the example processor = RegressionProcessor( tokenizer=tokenizer, max_seq_len=128, data_dir="../data/<YOUR-DATASET>",
def test_doc_regression(caplog): caplog.set_level(logging.CRITICAL) set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=False) n_epochs = 1 batch_size = 1 evaluate_every = 2 lang_model = "bert-base-cased" tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) processor = RegressionProcessor(tokenizer=tokenizer, max_seq_len=8, data_dir="samples/doc_regr", train_filename="train-sample.tsv", dev_filename="test-sample.tsv", test_filename=None, label_column_name="label") data_silo = DataSilo(processor=processor, batch_size=batch_size) language_model = LanguageModel.load(lang_model) prediction_head = RegressionHead(layer_dims=[768, 1]) model = AdaptiveModel(language_model=language_model, prediction_heads=[prediction_head], embeds_dropout_prob=0.1, lm_output_types=["per_sequence_continuous"], device=device) optimizer, warmup_linear = initialize_optimizer( model=model, learning_rate=2e-5, warmup_proportion=0.1, n_batches=len(data_silo.loaders["train"]), n_epochs=1) trainer = Trainer(optimizer=optimizer, data_silo=data_silo, epochs=n_epochs, n_gpu=n_gpu, warmup_linear=warmup_linear, evaluate_every=evaluate_every, device=device) model = trainer.train(model) save_dir = "testsave/doc_regr" model.save(save_dir) processor.save(save_dir) basic_texts = [ { "text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand." }, { "text": "it just did not fit right. The top is very thin showing everything." }, ] model = Inferencer.load(save_dir) result = model.inference_from_dicts(dicts=basic_texts) assert isinstance(result[0]["predictions"][0]["pred"], np.float32)
def test_s3e_fit(): # small test data language_model = Path("samples/s3e/tiny_fasttext_model") corpus_path = Path("samples/s3e/tiny_corpus.txt") save_dir = Path("testsave/fitted_s3e/") do_lower_case = False batch_size = 2 use_gpu = False # Fit S3E on a corpus set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False) # Create an InferenceProcessor tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case) processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128) # Create an AdaptiveModel language_model = LanguageModel.load(language_model) model = AdaptiveModel(language_model=language_model, prediction_heads=[], embeds_dropout_prob=0.1, lm_output_types=[], device=device) model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor, model=model, corpus=corpus_path, n_clusters=3, pca_n_components=30, svd_postprocessing=True, min_token_occurrences=1) # save everything to allow inference without fitting everything again model.save(save_dir) processor.save(save_dir) with open(save_dir / "s3e_stats.pkl", "wb") as f: pickle.dump(s3e_stats, f) # Load model, tokenizer and processor directly into Inferencer inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu, batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, s3e_stats=s3e_stats, num_processes=0) # Input basic_texts = [ { "text": "a man is walking on the street." }, { "text": "a woman is walking on the street." }, ] # Get embeddings for input text (you can vary the strategy and layer) result = inferencer.inference_from_dicts(dicts=basic_texts) assert result[0]["context"] == [ 'a', 'man', 'is', 'walking', 'on', 'the', 'street', '.' ] # abs() makes the reference-value checks two-sided assert abs(result[0]["vec"][0] - 0.00527727306941057) < 1e-6 assert abs(result[0]["vec"][-2] - 0.06285100416478565) < 1e-6
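To reuse the fitted S3E pooling later without fitting again, the saved artifacts can be restored along the following lines (mirroring the save step above; the exact `Inferencer.load` kwargs may vary across FARM versions):

# Sketch: restore the fitted S3E model and stats for embedding extraction.
import pickle
from pathlib import Path
from farm.infer import Inferencer

load_dir = Path("testsave/fitted_s3e/")
with open(load_dir / "s3e_stats.pkl", "rb") as f:
    s3e_stats = pickle.load(f)

inferencer = Inferencer.load(load_dir, task_type="embeddings", gpu=False,
                             extraction_strategy="s3e", extraction_layer=-1,
                             s3e_stats=s3e_stats, num_processes=0)
result = inferencer.inference_from_dicts(dicts=[{"text": "a man is walking on the street."}])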