def load_prediction_heads(args, silo):
    if args.recycle_heads:
        args.logger.info("Recycling heads of the loaded model")
        # Model name should be a directory in this case
        _, ph_configs = CustomAdaptiveModel._get_prediction_head_files(
            args.model_name)
        prediction_heads = [
            PredictionHead.load(config_file) for config_file in ph_configs
        ]
        # Ensure that label_columns order is the same as respective prediction heads (ascending),
        # else this will misalign heads with tasks.
        for idx in range(len(prediction_heads)):
            args.logger.info(
                f"Renaming head task {prediction_heads[idx].task_name} to {args.label_columns[idx]}"
            )
            prediction_heads[idx].task_name = args.label_columns[idx]
        out_types = [head.ph_output_type for head in prediction_heads]
    elif args.train_mode == "classification":
        prediction_heads = [
            TextClassificationHead(
                layer_dims=[
                    args.heads_dim,
                    len(get_labels(args.data_dir, task))
                ],
                task_name=task,
            ) for task in args.label_columns
        ]
        out_types = ["per_sequence" for _ in args.label_columns]
    else:  # Regression from raw heads
        if args.do_feat_embeds:
            args.logger.info(f"feat_size: {args.feat_size}")
            prediction_heads = [
                FeaturesRegressionHead(
                    layer_dims=[args.heads_dim + args.feat_size, 1],
                    task_name=task) for task in args.label_columns
            ]
        else:
            prediction_heads = [
                RegressionHead(layer_dims=[args.heads_dim, 1],
                               task_name=task)
                for task in args.label_columns
            ]
        out_types = ["per_sequence_continuous" for _ in args.label_columns]
    return prediction_heads, out_types
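A minimal sketch of how `load_prediction_heads` might be called, assuming an argparse-style `args` namespace; every attribute value below (`heads_dim`, `label_columns`, `data_dir`, etc.) is hypothetical and only illustrates the shape of input the function body above expects.

```python
from types import SimpleNamespace
import logging

# Hypothetical args namespace; the field names follow the function body above.
args = SimpleNamespace(
    recycle_heads=False,
    train_mode="regression",   # anything but "classification" falls through to regression heads
    do_feat_embeds=False,
    heads_dim=768,             # hidden size of the language model
    label_columns=["quality", "fluency"],  # one head per target column
    data_dir="data/my_dataset",            # made-up path
    logger=logging.getLogger(__name__),
)

heads, out_types = load_prediction_heads(args, silo=None)  # silo is unused on this path
# -> two RegressionHeads and ["per_sequence_continuous", "per_sequence_continuous"]
```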
def test_doc_regression(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=8,
                                    data_dir=Path("samples/doc_regr"),
                                    train_filename="train-sample.tsv",
                                    dev_filename="test-sample.tsv",
                                    test_filename=None,
                                    label_column_name="label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead()
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'CosineWarmup',
            'warmup_proportion': 0.1
        })

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/doc_regr")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."
        },
        {
            "text": "it just did not fit right. The top is very thin showing everything."
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["pred"], np.float32)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# We do not have a sample dataset for regression yet, add your own dataset to run the example
processor = RegressionProcessor(tokenizer=tokenizer,
                                max_seq_len=128,
                                data_dir="../data/<YOUR-DATASET>",
                                label_column_name="label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text regression
prediction_head = RegressionHead(layer_dims=[768, 1])

model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence_continuous"],
                      device=device)

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)
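Since the snippet above asks you to supply your own dataset in place of `<YOUR-DATASET>`, here is a hedged sketch of the tab-separated layout that `RegressionProcessor` reads: a header row naming the columns, with `label_column_name` pointing at the float target. The file name and the two example rows are invented for illustration.

```python
# Hypothetical train.tsv for the <YOUR-DATASET> placeholder above.
# RegressionProcessor expects tab-separated columns with a header row;
# label_column_name="label" tells it which column holds the float target.
sample_rows = (
    "text\tlabel\n"
    "The fabric feels premium and the fit is perfect.\t4.8\n"
    "Arrived late and the stitching came loose after one wash.\t1.5\n"
)
with open("train.tsv", "w", encoding="utf-8") as f:
    f.write(sample_rows)
```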
def test_doc_regression(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 30
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=128,
                                    data_dir="samples/doc_regr",
                                    columns=["text", "label"],
                                    label_list=[],
                                    metrics=["mse"],
                                    train_filename="train-sample.tsv",
                                    test_filename=None)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = RegressionHead(layer_dims=[768, 1])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_regr"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."
        },
        {
            "text": "it just did not fit right. The top is very thin showing everything."
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.run_inference(dicts=basic_texts)
    print(result)
    assert abs(float(result[0]["predictions"][0]["pred"]) - 4.2121115) <= 0.0001
    assert abs(float(result[0]["predictions"][1]["pred"]) - 4.1987348) <= 0.0001
processor = TextPairRegressionProcessor(
    tokenizer=tokenizer,
    label_list=None,
    metric="pearson_correlation",
    max_seq_len=multitransquest_config['max_seq_length'],
    train_filename="train.tsv",
    dev_filename="eval.tsv",
    test_filename=None,
    data_dir=Path(multitransquest_config['cache_dir']),
    delimiter="\t")

data_silo = DataSilo(processor=processor, batch_size=batch_size)

language_model = LanguageModel.load(lang_model)
prediction_head = RegressionHead()

model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence_continuous"],
                      device=device)

model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=multitransquest_config['learning_rate'],
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)

# 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
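The snippet above pulls several values from a `multitransquest_config` dict defined elsewhere; a minimal sketch of the keys it assumes, with hypothetical values:

```python
# Hypothetical config; only the keys referenced by the snippet above are shown.
multitransquest_config = {
    "max_seq_length": 128,   # passed to TextPairRegressionProcessor
    "cache_dir": "cache/",   # data_dir holding train.tsv / eval.tsv
    "learning_rate": 2e-5,   # passed to initialize_optimizer
}
```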
def convert_from_transformers(model_name_or_path,
                              device,
                              revision=None,
                              task_type=None,
                              processor=None,
                              **kwargs):
    """
    Load a (downstream) model from huggingface's transformers format. Use cases:
     - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
     - compare models without switching frameworks
     - use model directly for inference

    :param model_name_or_path: local path of a saved model or name of a public one.
                               Example public names:
                               - distilbert-base-uncased-distilled-squad
                               - deepset/bert-large-uncased-whole-word-masking-squad2
                               See https://huggingface.co/models for the full list
    :param device: "cpu" or "cuda"
    :param revision: The version of the model to use from the HuggingFace model hub. Can be a tag name, branch name, or commit hash.
    :type revision: str
    :param task_type: One of:
                      - 'question_answering'
                      - 'text_classification'
                      - 'embeddings'
                      More tasks coming soon ...
    :param processor: populates the prediction head with information coming from tasks
    :type processor: Processor
    :return: AdaptiveModel
    """
    lm = LanguageModel.load(model_name_or_path, revision=revision, **kwargs)

    if task_type is None:
        # Infer task type from config
        architecture = lm.model.config.architectures[0]
        if "MaskedLM" in architecture:
            task_type = "lm"
        elif "QuestionAnswering" in architecture:
            task_type = "question_answering"
        elif "SequenceClassification" in architecture:
            if lm.model.config.num_labels == 1:
                task_type = "regression"
            else:
                task_type = "text_classification"
        elif "TokenClassification" in architecture:
            task_type = "ner"
        else:
            logger.error(
                "Could not infer task type from model config. Please provide task type manually. "
                "('lm', 'question_answering', 'regression', 'text_classification', 'ner' or 'embeddings')"
            )

    if task_type == "lm":
        ph = BertLMHead.load(model_name_or_path, revision=revision, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_token",
                                          device=device)

    elif task_type == "question_answering":
        ph = QuestionAnsweringHead.load(model_name_or_path,
                                        revision=revision,
                                        **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_token",
                                          device=device)

    elif task_type == "regression":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error(
                "Conversion for Regression with Roberta or XLMRoberta not possible at the moment."
            )
            raise NotImplementedError
        ph = RegressionHead.load(model_name_or_path, **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence",
                                          device=device)

    elif task_type == "text_classification":
        if "roberta" in model_name_or_path:
            # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
            # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
            logger.error(
                "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
            )
            raise NotImplementedError
        ph = TextClassificationHead.load(model_name_or_path,
                                         revision=revision,
                                         **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_sequence",
                                          device=device)

    elif task_type == "ner":
        ph = TokenClassificationHead.load(model_name_or_path,
                                          revision=revision,
                                          **kwargs)
        adaptive_model = am.AdaptiveModel(language_model=lm,
                                          prediction_heads=[ph],
                                          embeds_dropout_prob=0.1,
                                          lm_output_types="per_token",
                                          device=device)

    elif task_type == "embeddings":
        adaptive_model = am.AdaptiveModel(
            language_model=lm,
            prediction_heads=[],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token", "per_sequence"],
            device=device)

    if processor:
        adaptive_model.connect_heads_with_processor(processor.tasks)

    return adaptive_model
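A short usage sketch for `convert_from_transformers`; the model name is one of the public examples from the docstring, and leaving `task_type` unset exercises the architecture-based inference above ("QuestionAnswering" in the config maps to "question_answering"). The output directory is hypothetical.

```python
# Convert a public QA model from the transformers format into a FARM AdaptiveModel.
model = convert_from_transformers(
    "deepset/bert-large-uncased-whole-word-masking-squad2",
    device="cpu",
)
model.save("converted_model")  # hypothetical output directory
```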
def test_text_pair_regression(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "microsoft/MiniLM-L12-H384-uncased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextPairRegressionProcessor(tokenizer=tokenizer,
                                            label_list=None,
                                            metric="f1_macro",
                                            max_seq_len=128,
                                            train_filename="sample.tsv",
                                            dev_filename="sample.tsv",
                                            test_filename=None,
                                            data_dir=Path("samples/text_pair"),
                                            delimiter="\t")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead()

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/text_pair_regression_model")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": ("how many times have real madrid won the champions league in a row",
                     "They have also won the competition the most times in a row, winning it five times from 1956 to 1960")
        },
        {
            "text": ("how many seasons of the blacklist are there on netflix",
                     "Retrieved March 27 , 2018 .")
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert np.isclose(result[0]["predictions"][0]["pred"], 0.7976, rtol=0.05)

    model.close_multiprocessing_pool()
def doc_regression():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_regression")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # We do not have a sample dataset for regression yet, add your own dataset to run the example
    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=128,
                                    data_dir=Path("../data/<YOUR-DATASET>"),
                                    label_column_name="label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text regression
    prediction_head = RegressionHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-doc-regression-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    # Add your own text adapted to the dataset you provide
    basic_texts = [
        {"text": ""},
        {"text": ""},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
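For reference, the tests above index the inference output as `result[0]["predictions"][0]["pred"]`; a hedged sketch of the shape `inference_from_dicts` returns for the two-text regression case, with only the keys used in this section shown and the numeric values invented:

```python
# Illustrative shape only; values are made up. Matches the indexing used above:
# result[0]["predictions"][0]["pred"] and result[0]["predictions"][1]["pred"].
result = [
    {
        "predictions": [
            {"pred": 4.21},  # prediction for the first input dict
            {"pred": 4.19},  # prediction for the second input dict
        ],
    },
]
```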