def import_downstream_models():
    #######################
    # loads a SQuAD finetuned model
    # saves it as a FARM adaptive model
    device, n_gpu = initialize_device_settings(use_cuda=True)
    model = "bert-large-uncased-whole-word-masking-finetuned-squad"
    save_dir = "saved_models/FARM-bert-large-uncased-whole-word-masking-finetuned-squad"

    lm = Bert.load(model)
    ph = QuestionAnsweringHead.load(model)
    am = AdaptiveModel(language_model=lm,
                       prediction_heads=[ph],
                       embeds_dropout_prob=0.1,
                       lm_output_types="per_token",
                       device=device)
    am.save(save_dir)

    # saves the processor associated with it, so you can use it in inference mode
    # TODO load HF's tokenizer_config.json and adjust settings
    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=model)
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=label_list,
        metric=metric,
        data_dir="../data/squad20",
    )
    processor.save(save_dir)
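# A minimal usage sketch: after import_downstream_models() has run, the saved
# model can be loaded back for prediction with the Inferencer API (the import
# path is assumed here; the question/context dict format follows the QA test
# in this section).
def example_squad_inference():
    from farm.infer import Inferencer

    save_dir = "saved_models/FARM-bert-large-uncased-whole-word-masking-finetuned-squad"
    model = Inferencer.load(save_dir)
    QA_input = [{
        "questions": ["In what country is Normandy located?"],
        "text": "The Normans were the people who in the 10th and 11th "
                "centuries gave their name to Normandy, a region in France.",
    }]
    result = model.run_inference(dicts=QA_input)
    print(result[0]["predictions"])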
def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=64,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)
# 1. Create a tokenizer
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = NERProcessor(tokenizer=tokenizer,
                         max_seq_len=128,
                         data_dir="../data/conll03-de")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(
    layer_dims=[768, len(processor.label_list)])

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],
    device=device,
)

# 5. Create an optimizer (call completed to match the NER test in this section)
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)
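# A minimal sketch of the remaining steps, mirroring the NER test further down
# in this section (assumes n_epochs, n_gpu and evaluate_every are defined like
# the other hyperparameters above; the save path is an illustrative assumption):

# 6. Feed everything to the Trainer
trainer = Trainer(
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    warmup_linear=warmup_linear,
    evaluate_every=evaluate_every,
    device=device,
)

# 7. Train the model and save model + processor for later inference
model = trainer.train(model)
save_dir = "saved_models/bert-german-ner"  # assumed path
model.save(save_dir)
processor.save(save_dir)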
def test_doc_classification(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 30
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = GermEval18CoarseProcessor(tokenizer=tokenizer,
                                          max_seq_len=64,
                                          data_dir="samples/doc_class",
                                          train_filename="train-sample.tsv",
                                          test_filename=None)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.label_list)])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_examples=data_silo.n_samples("train"),
        batch_size=batch_size,
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["label"] == "OTHER"
    assert abs(result[0]["predictions"][0]["probability"] - 0.5358161) <= 0.0001
def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
        never_split_chars=["-", "_"])

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == [
        'Farmer', "'", 's', 'life', 'is', 'great', '.'
    ]
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary across runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
def test_qa(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=False)
    label_list = ["start_token", "end_token"]
    processor = SquadProcessor(tokenizer=tokenizer,
                               max_seq_len=16,
                               max_query_length=4,
                               train_filename="train-sample.json",
                               dev_filename="dev-sample.json",
                               test_filename=None,
                               data_dir="samples/qa",
                               labels=label_list,
                               metric="squad")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = Bert.load(base_LM_model)
    prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        warmup_proportion=0.2,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)

    save_dir = "testsave/qa"
    model.save(save_dir)
    processor.save(save_dir)

    QA_input = [{
        "questions": ["In what country is Normandy located?"],
        "text": 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
    }]

    model = Inferencer.load(save_dir)
    result = model.run_inference(dicts=QA_input)
    assert isinstance(result[0]["predictions"][0]["end"], int)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Create a tokenizer (the tokenizer vocabulary has to match the German
#    language model loaded in step 4)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-german-cased", do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = BertStyleLMProcessor(data_dir="../data/finetune_sample",
                                 tokenizer=tokenizer,
                                 max_seq_len=128)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=32)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load("bert-base-german-cased")
# b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
lm_prediction_head = BertLMHead(
    embeddings=language_model.model.embeddings,
    hidden_size=language_model.model.config.hidden_size,
)
next_sentence_head = TextClassificationHead(
    layer_dims=[language_model.model.config.hidden_size, 2],
    loss_ignore_index=-1)

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)
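# A minimal sketch of the usual continuation, mirroring the LM-finetuning
# tests in this section (learning rate, warmup, epochs, n_gpu, evaluate_every
# and the save path below are illustrative assumptions):

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=1)

# 6. Feed everything to the Trainer
trainer = Trainer(
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=1,
    n_gpu=1,
    warmup_linear=warmup_linear,
    evaluate_every=30,
    device=device,
)

# 7. Train the model and save model + processor for later use
model = trainer.train(model)
save_dir = "saved_models/bert-german-lm-finetuned"  # assumed path
model.save(save_dir)
processor.save(save_dir)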
def test_doc_regression(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 30
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=128,
                                    data_dir="samples/doc_regr",
                                    columns=["text", "label"],
                                    label_list=[],
                                    metrics=["mse"],
                                    train_filename="train-sample.tsv",
                                    test_filename=None)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = RegressionHead(layer_dims=[768, 1])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_regr"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."},
        {"text": "it just did not fit right. The top is very thin showing everything."},
    ]

    model = Inferencer.load(save_dir)
    result = model.run_inference(dicts=basic_texts)
    print(result)
    assert abs(float(result[0]["predictions"][0]["pred"]) - 4.2121115) <= 0.0001
    assert abs(float(result[0]["predictions"][1]["pred"]) - 4.1987348) <= 0.0001
def test_doc_classification(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 5
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = TextClassificationHead(layer_dims=[
        768, len(processor.tasks["text_classification"]["label_list"])
    ])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
        {"text": "Franzosen verteidigen 2:1-Führung – Kritische Stimmen zu Schwedens Superstar"},
        {"text": "Neues Video von Designern macht im Netz die Runde"},
        {"text": "23-jähriger Brasilianer muss vier Spiele pausieren – Entscheidung kann noch angefochten werden"},
        {"text": "Aufständische verwendeten Chemikalie bei Gefechten im August."},
        {"text": "Bewährungs- und Geldstrafe für 26-Jährigen wegen ausländerfeindlicher Äußerung"},
        {"text": "ÖFB-Teamspieler nur sechs Minuten nach seinem Tor beim 1:1 gegen Sunderland verletzt ausgewechselt"},
        {"text": "Ein 31-jähriger Polizist soll einer 42-Jährigen den Knöchel gebrochen haben"},
        {"text": "18 Menschen verschleppt. Kabul – Nach einem Hubschrauber-Absturz im Norden Afghanistans haben Sicherheitskräfte am Mittwoch versucht"},
    ]

    # TODO enable loading here again after we have finished migration towards "processor.tasks"
    # inf = Inferencer.load(save_dir)
    inf = Inferencer(model=model, processor=processor)
    result = inf.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["label"] == "OTHER"
    assert abs(result[0]["predictions"][0]["probability"] - 0.7) <= 0.1

    loaded_processor = TextClassificationProcessor.load_from_dir(save_dir)
    inf2 = Inferencer(model=model, processor=loaded_processor)
    result_2 = inf2.run_inference(dicts=basic_texts)
    pprint(list(zip(result, result_2)))
    for r1, r2 in list(zip(result, result_2)):
        assert r1 == r2


# if(__name__=="__main__"):
#     test_doc_classification()
    tokenizer=tokenizer,
    max_seq_len=256,
    train_filename=train_filename,
    dev_filename=dev_filename,
    test_filename=None,
    data_dir="../data/squad20",
)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor,
                     batch_size=batch_size,
                     distributed=False)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(base_LM_model)
# b) and a prediction head on top that is suited for our task => Question Answering
prediction_head = QuestionAnsweringHead(
    layer_dims=[768, len(processor.label_list)])

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],
    device=device,
)

# 5. Create an optimizer (call completed to match the QA test in this section)
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=1e-5,
    warmup_proportion=0.2,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)
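# A minimal sketch of the remaining steps, mirroring the QA test in this
# section (assumes n_epochs, n_gpu and evaluate_every are defined earlier in
# the script; the save path is an illustrative assumption):

# 6. Feed everything to the Trainer
trainer = Trainer(
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    warmup_linear=warmup_linear,
    evaluate_every=evaluate_every,
    device=device,
)

# 7. Train the model and save model + processor for inference
model = trainer.train(model)
save_dir = "saved_models/bert-english-qa"  # assumed path
model.save(save_dir)
processor.save(save_dir)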
def test_doc_classification(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = TextClassificationHead(layer_dims=[
        768, len(processor.tasks["text_classification"]["label_list"])
    ])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
    ]

    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.run_inference(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import Bert
from farm.modeling.tokenization import BertTokenizer
from farm.modeling.prediction_head import QuestionAnsweringHead
from farm.data_handler.processor import SquadProcessor
from farm.utils import initialize_device_settings

#######################
# loads a SQuAD finetuned model
# saves it as a FARM adaptive model
device, n_gpu = initialize_device_settings(use_cuda=True)

model = "bert-large-uncased-whole-word-masking-finetuned-squad"
save_dir = "saved_models/FARM-bert-large-uncased-whole-word-masking-finetuned-squad"

lm = Bert.load(model)
ph = QuestionAnsweringHead.load(model)
am = AdaptiveModel(language_model=lm,
                   prediction_heads=[ph],
                   embeds_dropout_prob=0.1,
                   lm_output_types="per_token",
                   device=device)
am.save(save_dir)

# saves the processor associated with it, so you can use it in inference mode
# TODO load HF's tokenizer_config.json and adjust settings
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model)
label_list = ["start_token", "end_token"]
metric = "squad"
processor = SquadProcessor(
    tokenizer=tokenizer,
    max_seq_len=256,
    label_list=label_list,
    metric=metric,
    data_dir="../data/squad20",
)
processor.save(save_dir)
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 50
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir="samples/ner",
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = Bert.load(lang_model)
    prediction_head = TokenClassificationHead(
        layer_dims=[768, len(processor.label_list)])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = "testsave/ner"
    model = trainer.train(model)
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
    ]
    model = Inferencer.load(save_dir)
    result = model.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "sagte"
    assert abs(result[0]["predictions"][0]["probability"] - 0.213869) <= 0.0001