def import_downstream_models():
    # Loads a SQuAD-finetuned model and saves it as a FARM AdaptiveModel
    device, n_gpu = initialize_device_settings(use_cuda=True)
    model = "bert-large-uncased-whole-word-masking-finetuned-squad"
    save_dir = "saved_models/FARM-bert-large-uncased-whole-word-masking-finetuned-squad"
    lm = Bert.load(model)
    ph = QuestionAnsweringHead.load(model)
    am = AdaptiveModel(
        language_model=lm,
        prediction_heads=[ph],
        embeds_dropout_prob=0.1,
        lm_output_types="per_token",
        device=device,
    )
    am.save(save_dir)

    # Save the processor associated with the model, so it can be used in inference mode
    # TODO load HF's tokenizer_config.json and adjust settings
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model)
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=label_list,
        metric=metric,
        data_dir="../data/squad20",
    )
    processor.save(save_dir)

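# A minimal usage sketch (not part of the original file): load the directory saved by
# `import_downstream_models` with FARM's `Inferencer.load` and run extractive QA via
# `inference_from_dicts`. FARM's QA input format uses dicts with "qas" and "context"
# keys; the question and context strings here are made-up illustrations.
def example_squad_inference():
    save_dir = "saved_models/FARM-bert-large-uncased-whole-word-masking-finetuned-squad"
    inferencer = Inferencer.load(save_dir, batch_size=32, gpu=True)
    qa_input = [
        {
            "qas": ["Who is the author of FARM?"],
            "context": "FARM is an open-source transfer learning framework by deepset.",
        }
    ]
    result = inferencer.inference_from_dicts(dicts=qa_input)
    print(result)
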
def test_qa(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=False
    )
    label_list = ["start_token", "end_token"]
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        label_list=label_list,
        metric="squad",
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(base_LM_model)
    prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)
    save_dir = "testsave/qa"
    model.save(save_dir)
    processor.save(save_dir)

def distilbert_nq(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "distilbert-base-uncased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=True
    )
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train_sample.jsonl",
        dev_filename="dev_sample.jsonl",
        data_dir=Path("samples/nq"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(base_LM_model)
    qa_head = QuestionAnsweringHead()
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list)
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    trainer.train()
    return model, processor

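# Usage sketch (assumed driver code, not in the original): `distilbert_nq` returns the
# trained model and processor, so a caller can persist both for later inference,
# mirroring the save pattern used by the other functions in this file. The save path
# is hypothetical.
def example_save_nq_model():
    model, processor = distilbert_nq()
    save_dir = "testsave/nq"
    model.save(save_dir)
    processor.save(save_dir)
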
def test_s3e_fit():
    # Small test data
    language_model = Path("samples/s3e/tiny_fasttext_model")
    corpus_path = Path("samples/s3e/tiny_corpus.txt")
    save_dir = Path("testsave/fitted_s3e/")
    do_lower_case = False
    batch_size = 2
    use_gpu = False

    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case
    )
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[],
        embeds_dropout_prob=0.1,
        lm_output_types=[],
        device=device,
    )

    model, processor, s3e_stats = fit_s3e_on_corpus(
        processor=processor,
        model=model,
        corpus=corpus_path,
        n_clusters=3,
        pca_n_components=30,
        svd_postprocessing=True,
        min_token_occurrences=1,
    )

    # Save everything to allow inference without fitting again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into an Inferencer
    inferencer = Inferencer(
        model=model,
        processor=processor,
        task_type="embeddings",
        gpu=use_gpu,
        batch_size=batch_size,
        extraction_strategy="s3e",
        extraction_layer=-1,
        s3e_stats=s3e_stats,
        num_processes=0,
    )

    # Input
    basic_texts = [
        {"text": "a man is walking on the street."},
        {"text": "a woman is walking on the street."},
    ]

    # Get embeddings for the input texts (strategy and layer can be varied)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    assert result[0]["context"] == basic_texts[0]["text"]
    # Compare against reference values; abs() is needed so that negative
    # deviations also fail the assertion
    assert abs(result[0]["vec"][0] - 0.00527727306941057) < 1e-6
    assert abs(result[0]["vec"][-2] - 0.06285100416478565) < 1e-6

def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=64,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)
    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)

def embedding_extraction():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    # Load from a local path:
    # lang_model = Path("../saved_models/glove-german-uncased")
    # or fetch from S3:
    lang_model = "glove-german-uncased"  # only GloVe, word2vec, or converted fastText (fixed vocab) embeddings are supported
    do_lower_case = True
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(lang_model)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Create an Inferencer for embedding extraction
    inferencer = Inferencer(model=model, processor=processor, task_type="embeddings")

    # Extract vectors
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    result = inferencer.extract_vectors(
        dicts=basic_texts, extraction_strategy="cls_token", extraction_layer=-1
    )
    print(result)
    inferencer.close_multiprocessing_pool()

def test_multiple_prediction_heads():
    model = "bert-base-german-cased"
    lm = LanguageModel.load(model)
    ph1 = TextClassificationHead(
        num_labels=3, label_list=["negative", "neutral", "positive"]
    )
    ph2 = TokenClassificationHead(num_labels=3, label_list=["PER", "LOC", "ORG"])
    adaptive_model = AdaptiveModel(
        language_model=lm,
        prediction_heads=[ph1, ph2],
        embeds_dropout_prob=0.1,
        lm_output_types="per_token",
        device="cpu",
    )
    transformer_models = Converter.convert_to_transformers(adaptive_model)
    assert isinstance(transformer_models[0], BertForSequenceClassification)
    assert isinstance(transformer_models[1], BertForTokenClassification)
    del lm
    del transformer_models
    del adaptive_model

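# Sketch (assumption): `Converter.convert_to_transformers` yields plain Hugging Face
# models (one per prediction head), so each should be persistable with the standard
# `save_pretrained` API. The output directory is hypothetical.
def example_save_converted_model():
    lm = LanguageModel.load("bert-base-german-cased")
    ph = TextClassificationHead(
        num_labels=3, label_list=["negative", "neutral", "positive"]
    )
    adaptive_model = AdaptiveModel(
        language_model=lm,
        prediction_heads=[ph],
        embeds_dropout_prob=0.1,
        lm_output_types="per_sequence",
        device="cpu",
    )
    transformer_model = Converter.convert_to_transformers(adaptive_model)[0]
    transformer_model.save_pretrained("converted/sequence_classification")
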
def test_prediction_head_load_save_class_weights(tmp_path, caplog=None):
    """This is a regression test for #428 and #422."""
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 1
    lang_model = "bert-base-german-cased"
    data_dir_path = "samples/doc_class"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    tcp_params = dict(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path(data_dir_path),
        train_filename="train-sample.tsv",
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        label_column_name="coarse_label",
    )
    processor = TextClassificationProcessor(**tcp_params)
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        num_labels=2,
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"
        ),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    model.save(tmp_path)
    model_loaded = AdaptiveModel.load(tmp_path, device="cpu")
    assert model_loaded is not None

def train_on_split(silo_to_use, n_fold, save_dir, dev):
    # Note: `lang_model`, `label_list`, `n_epochs`, `n_gpu` and `evaluate_every`
    # come from the enclosing scope.

    # Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => text classification
    prediction_head = MultiLabelTextClassificationHead(
        # There is still an error with class weights ...
        # class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.2,
        lm_output_types=["per_sequence"],
        device=dev,
    )

    # Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=dev,
        n_batches=len(silo_to_use.loaders["train"]),
        n_epochs=n_epochs,
    )

    # Feed everything to the Trainer, which takes care of growing our model into a
    # powerful plant and evaluates it from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    save_dir = Path(str(save_dir) + f"-{n_fold}")
    # Unfortunately, early stopping is still not working
    earlystopping = EarlyStopping(
        metric="f1_macro",
        mode="max",
        save_dir=save_dir,  # where to save the best model
        patience=5,  # number of evaluations to wait for improvement before terminating training
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=silo_to_use,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=dev,
        evaluator_test=False,
        # early_stopping=earlystopping,
    )

    # Train it
    trainer.train()
    trainer.model.save(save_dir)
    return trainer.model

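# Usage sketch (assumption): in FARM's cross-validation example, `train_on_split` is
# called once per fold with a fold-specific silo produced by `DataSiloForCrossVal`.
# The exact `make` signature and the fold count are assumptions here; `data_silo`
# and `device` are taken as given from the surrounding script.
def example_run_folds(data_silo, device):
    silos = DataSiloForCrossVal.make(data_silo, n_splits=5)
    fold_models = []
    for n_fold, silo in enumerate(silos):
        fold_models.append(train_on_split(silo, n_fold, Path("testsave/xval"), device))
    return fold_models
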
def build_model(n_batches, device, n_epochs, task_data):
    language_model = LanguageModel.load(task_data["lang_model"])
    prediction_head = TokenClassificationHeadPredictSequence(
        num_labels=task_data["num_labels"]
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=n_batches,
        n_epochs=n_epochs,
        device=device,
    )
    return lr_schedule, model, optimizer

def get_adaptive_model(
    lm_output_type,
    prediction_heads,
    layer_dims,
    model,
    device,
    embeds_dropout_prob,
    local_rank,
    n_gpu,
    fp16=False,
    class_weights=None,
):
    parsed_lm_output_types = lm_output_type.split(",")

    initialized_heads = []
    for head_name in prediction_heads.split(","):
        initialized_heads.append(
            PredictionHead.create(
                prediction_head_name=head_name,
                layer_dims=layer_dims,
                class_weights=class_weights,
            )
        )

    language_model = LanguageModel.load(model)

    # TODO where are balanced class weights?
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=initialized_heads,
        embeds_dropout_prob=embeds_dropout_prob,
        lm_output_types=parsed_lm_output_types,
        device=device,
    )
    if fp16:
        model.half()

    if local_rank > -1:
        model = WrappedDDP(model)
    elif n_gpu > 1:
        model = WrappedDataParallel(model)

    return model

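# Usage sketch (illustrative values only): building a single-head text classification
# model on CPU. The comma-separated string arguments mirror how `get_adaptive_model`
# parses `lm_output_type` and `prediction_heads` above; the head name and layer dims
# are assumptions for illustration.
def example_get_adaptive_model():
    return get_adaptive_model(
        lm_output_type="per_sequence",
        prediction_heads="TextClassificationHead",
        layer_dims=[768, 2],
        model="bert-base-german-cased",
        device="cpu",
        embeds_dropout_prob=0.1,
        local_rank=-1,
        n_gpu=1,
    )
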
def embeddings_extraction():
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    batch_size = 32
    use_gpu = True
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
    lang_model = "bert-base-german-cased"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    # 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # 3. Create an AdaptiveModel with a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    adaptive_model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[],
        embeds_dropout_prob=0,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 4. Extract embeddings with the model in inference mode
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"},
        {"text": "Martin Müller spielt Fussball"},
    ]
    model = Inferencer(adaptive_model, processor, gpu=use_gpu)
    result = model.extract_vectors(dicts=basic_texts)
    print(result)