def test_loading_adapter_weights_without_prefix(self):
    model_base, model_with_head_base = create_twin_models(AutoModel, self.config)

    model_with_head = AutoModelWithHeads.from_config(model_with_head_base.config)
    setattr(model_with_head, model_with_head.base_model_prefix, model_with_head_base)

    model_base.add_adapter("dummy")

    with tempfile.TemporaryDirectory() as temp_dir:
        model_base.save_adapter(temp_dir, "dummy")

        loading_info = {}
        model_with_head.load_adapter(temp_dir, loading_info=loading_info)

    self.assertEqual(0, len(loading_info["missing_keys"]))
    self.assertEqual(0, len(loading_info["unexpected_keys"]))

    # check equal output
    input_ids = self.get_input_samples((1, 128), config=model_with_head.config)
    output1 = model_with_head(input_ids)
    output2 = model_base(input_ids)
    self.assertEqual(len(output1), len(output2))
    self.assertTrue(torch.equal(output1[0], output2[0]))
def test_train_single_adapter(self):
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())

    # add two adapters: one will be trained and the other should be frozen
    model.add_adapter("mrpc")
    model.add_adapter("dummy")
    model.add_classification_head("mrpc")

    self.assertIn("mrpc", model.config.adapters.adapters)
    self.assertIn("dummy", model.config.adapters.adapters)

    # train the mrpc adapter -> should be activated & unfrozen
    model.train_adapter("mrpc")
    self.assertEqual(set(["mrpc"]), model.active_adapters.flatten())

    # all weights of the trained adapter should be activated
    for k, v in filter_parameters(model, "adapters.mrpc.").items():
        self.assertTrue(v.requires_grad, k)
    # all weights of the adapter not used for training should be frozen
    for k, v in filter_parameters(model, "adapters.dummy.").items():
        self.assertFalse(v.requires_grad, k)
    # weights of the base model should be frozen (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    state_dict_pre = copy.deepcopy(model.state_dict())

    # setup dataset
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
    training_args = TrainingArguments(
        output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=7, no_cuda=True
    )

    # train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()

    # adapter and head weights should have changed, everything else should stay the same
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "mrpc" in k1:
            self.assertFalse(torch.equal(v1, v2))
        else:
            self.assertTrue(torch.equal(v1, v2))
def test_parallel_inference_with_heads(self):
    model = AutoModelWithHeads.from_config(self.config())
    model.add_adapter("a")
    model.add_adapter("b")
    model.add_classification_head("a", num_labels=2)
    model.add_classification_head("b", num_labels=3)
    model.eval()

    inputs = {}
    inputs["attention_mask"] = torch.randint(0, 2, size=(2, 128))
    inputs["input_ids"] = self.get_input_samples((2, 128), config=model.config)

    # for reference, pass through single adapters
    model.active_adapters = "a"
    model.active_head = "a"
    outputs_a = model(**inputs)
    model.active_adapters = "b"
    model.active_head = "b"
    outputs_b = model(**inputs)

    model.active_adapters = Parallel("a", "b")
    # active_adapters should set parallel heads too
    self.assertEqual(model.active_head, ["a", "b"])
    outputs = model(**inputs)

    self.assertEqual(len(outputs), 2)
    self.assertEqual(outputs[0][0].shape, (2, 2))
    self.assertEqual(outputs[1][0].shape, (2, 3))
    self.assertTrue(torch.allclose(outputs[0][0], outputs_a[0]))
    self.assertTrue(torch.allclose(outputs[1][0], outputs_b[0]))
def test_loading_adapter_weights_without_prefix(self):
    if self.config_class not in MODEL_WITH_HEADS_MAPPING:
        self.skipTest("Does not support flex heads.")

    model_base, model_with_head_base = create_twin_models(self.model_class, self.config)

    model_with_head = AutoModelWithHeads.from_config(model_with_head_base.config)
    setattr(model_with_head, model_with_head.base_model_prefix, model_with_head_base)

    model_base.add_adapter("dummy")

    with tempfile.TemporaryDirectory() as temp_dir:
        model_base.save_adapter(temp_dir, "dummy")

        loading_info = {}
        model_with_head.load_adapter(temp_dir, loading_info=loading_info)

    self.assertEqual(0, len(loading_info["missing_keys"]))
    self.assertEqual(0, len(loading_info["unexpected_keys"]))

    # check equal output
    input_data = self.get_input_samples((1, 128), config=model_with_head.config)
    output1 = model_with_head(**input_data)
    output2 = model_base(**input_data)
    self.assertEqual(len(output1), len(output2))
    self.assertTrue(torch.equal(output1[0], output2[0]))
def test_train_adapter_fusion(self):
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())
    self.add_head(model, "head")

    # add the adapters to be fused
    model.add_adapter("a")
    model.add_adapter("b")
    model.add_adapter("c")

    self.assertIn("a", model.config.adapters.adapters)
    self.assertIn("b", model.config.adapters.adapters)
    self.assertIn("c", model.config.adapters.adapters)

    # setup fusion
    adapter_setup = Fuse("a", "b", "c")
    model.add_adapter_fusion(adapter_setup)
    model.train_adapter_fusion(adapter_setup)
    model.set_active_adapters(adapter_setup)
    self.assertEqual(adapter_setup, model.active_adapters)

    # all weights of the adapters should be frozen (test for one)
    for k, v in filter_parameters(model, "adapters.a.").items():
        self.assertFalse(v.requires_grad, k)
    # all weights of the fusion layer should be activated
    for k, v in filter_parameters(model, "adapter_fusion_layer").items():
        self.assertTrue(v.requires_grad, k)
    # weights of the base model should be frozen (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    state_dict_pre = copy.deepcopy(model.state_dict())

    # Since our config has a value matrix, make sure it is regularized.
    # We do this by patching the fusion regularization function.
    regularization_called = False
    orig_fusion_regularization_loss = model.base_model.get_fusion_regularization_loss

    def patched_fusion_reg_loss():
        nonlocal regularization_called
        regularization_called = True
        return orig_fusion_regularization_loss()

    model.base_model.get_fusion_regularization_loss = patched_fusion_reg_loss

    self.trainings_run(model, tokenizer)

    # fusion and head weights should have changed, everything else should stay the same
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if (
            "adapter_fusion_layer" in k1
            or "classifier" in k1
            or "classification_head" in k1
            or "score" in k1
            or "heads" in k1
        ):
            self.assertFalse(torch.equal(v1, v2), k1)
        else:
            self.assertTrue(torch.equal(v1, v2), k1)

    self.assertTrue(regularization_called)
def test_load_full_model(self):
    model = AutoModelWithHeads.from_config(self.config())
    model.add_classification_head("dummy", layers=1)

    true_config = model.get_prediction_heads_config()
    with tempfile.TemporaryDirectory() as temp_dir:
        # save
        model.save_pretrained(temp_dir)
        # reload
        model = AutoModelWithHeads.from_pretrained(temp_dir)
    self.assertIn("dummy", model.heads)
    self.assertDictEqual(true_config, model.get_prediction_heads_config())
def test_batch_split_head(self):
    if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"):
        self.skipTest("No classification head available")
    model = AutoModelWithHeads.from_config(self.config())
    model.add_classification_head("a")
    model.add_classification_head("b")
    model.active_head = BatchSplit("a", "b", batch_sizes=[1, 2])

    in_data = self.get_input_samples((3, 128), config=model.config)
    out = model(**in_data)

    self.assertEqual(2, len(out))
    self.assertEqual((1, 2), out[0][0].shape)
    self.assertEqual((2, 2), out[1][0].shape)
def test_delete_head(self):
    model = AutoModelWithHeads.from_config(self.config())
    model.eval()

    name = "test_head"
    self.add_head(model, name)
    self.assertTrue(name in model.heads)
    self.assertTrue(name in model.config.prediction_heads)
    self.assertEqual(name, model.active_head)

    model.delete_head(name)
    self.assertFalse(name in model.heads)
    self.assertFalse(name in model.config.prediction_heads)
    self.assertNotEqual(name, model.active_head)
def test_batch_split_adapter_head(self):
    model = AutoModelWithHeads.from_config(self.config())
    self.add_head(model, "a")
    self.add_head(model, "b")
    model.add_adapter("a")
    model.add_adapter("b")
    model.add_adapter("c")
    model.set_active_adapters(BatchSplit(Stack("c", "a"), "b", batch_sizes=[2, 1]))

    in_data = self.get_input_samples((3, 128), config=model.config)
    out = model(**in_data)

    self.assertEqual(2, len(out))
    self.assertTrue(isinstance(model.active_head, BatchSplit))
def test_invertible_adapter_with_head(self):
    if hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_masked_lm_head"):
        lm_head = "masked_lm"
    elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_causal_lm_head"):
        lm_head = "causal_lm"
    elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_seq2seq_lm_head"):
        lm_head = "seq2seq_lm"
    else:
        self.skipTest("No masked, causal or seq2seq language model head")

    model = AutoModelWithHeads.from_config(self.config())
    model.add_adapter("test", config="pfeiffer+inv")
    if lm_head == "causal_lm":
        model.add_causal_lm_head("test")
    elif lm_head == "masked_lm":
        model.add_masked_lm_head("test")
    elif lm_head == "seq2seq_lm":
        model.add_seq2seq_lm_head("test")
    else:
        raise RuntimeError("{} is not a valid lm head".format(lm_head))
    model.set_active_adapters("test")

    # Set a hook before the invertible adapter to make sure it's actually called twice:
    # once after the embedding layer and once in the prediction head.
    calls = 0

    def forward_pre_hook(module, input):
        nonlocal calls
        calls += 1

    inv_adapter = model.base_model.get_invertible_adapter()
    self.assertIsNotNone(inv_adapter)
    inv_adapter.register_forward_pre_hook(forward_pre_hook)

    in_data = self.get_input_samples((self.batch_size, self.seq_length), config=model.config)
    out = model(**in_data)

    self.assertEqual((self.batch_size, self.seq_length, model.config.vocab_size), out[0].shape)
    self.assertEqual(2, calls)
def test_parallel_inference_with_wrong_number_of_heads(self):
    model = AutoModelWithHeads.from_config(self.config())
    model.eval()

    model.add_adapter("a")
    model.add_adapter("b")
    self.add_head(model, "a", num_labels=2)

    inputs = self.get_input_samples((2, 128), config=model.config)

    # two parallel adapters but only one active head -> should raise
    model.active_adapters = Parallel("a", "b")
    model.active_head = ["a"]
    with self.assertRaises(ValueError):
        model(**inputs)

    # a single (non-parallel) active head should raise as well
    model.active_head = "a"
    with self.assertRaises(ValueError):
        model(**inputs)
def test_parallel_training(self):
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())

    model.add_adapter("mrpc1")
    model.add_adapter("mrpc2")
    self.add_head(model, "mrpc1", num_labels=2)
    self.add_head(model, "mrpc2", num_labels=3)
    model.active_adapters = Parallel("mrpc1", "mrpc2")
    model.train_adapter(Parallel("mrpc1", "mrpc2"))
    # model.eval()

    # all weights of both parallel adapters should be activated for training
    for k, v in filter_parameters(model, "adapters.mrpc1.").items():
        self.assertTrue(v.requires_grad, k)
    for k, v in filter_parameters(model, "adapters.mrpc2.").items():
        self.assertTrue(v.requires_grad, k)
    # weights of the base model should be frozen (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)
    state_dict_pre = copy.deepcopy(model.state_dict())

    train_dataset = self.dataset(tokenizer)
    training_args = TrainingArguments(
        output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=10, no_cuda=True
    )

    # train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()

    # adapter and head weights should have changed, everything else should stay the same
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "mrpc" in k1:
            self.assertFalse(torch.equal(v1, v2), k1)
        else:
            self.assertTrue(torch.equal(v1, v2))
def test_train_single_adapter(self):
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())

    # add two adapters: one will be trained and the other should be frozen
    model.add_adapter("mrpc")
    model.add_adapter("dummy")
    self.add_head(model, "mrpc")

    self.assertIn("mrpc", model.config.adapters.adapters)
    self.assertIn("dummy", model.config.adapters.adapters)

    # train the mrpc adapter -> should be activated & unfrozen
    model.train_adapter("mrpc")
    self.assertEqual(set(["mrpc"]), model.active_adapters.flatten())

    # all weights of the trained adapter should be activated
    for k, v in filter_parameters(model, "adapters.mrpc.").items():
        self.assertTrue(v.requires_grad, k)
    # all weights of the adapter not used for training should be frozen
    for k, v in filter_parameters(model, "adapters.dummy.").items():
        self.assertFalse(v.requires_grad, k)
    # weights of the base model should be frozen (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    state_dict_pre = copy.deepcopy(model.state_dict())

    self.trainings_run(model, tokenizer)

    # adapter and head weights should have changed, everything else should stay the same
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "mrpc" in k1:
            self.assertFalse(torch.equal(v1, v2))
        else:
            self.assertTrue(torch.equal(v1, v2))
def test_batch_split_with_heads(self):
    model = AutoModelWithHeads.from_config(self.config())
    model.add_adapter("a")
    model.add_adapter("b")
    self.add_head(model, "a", num_labels=2)
    self.add_head(model, "b", num_labels=3)
    model.eval()

    inputs = {"input_ids": self.get_input_samples((2, 128), config=model.config)["input_ids"]}
    if isinstance(model, T5ModelWithHeads):
        inputs["decoder_input_ids"] = inputs["input_ids"]

    # for reference, pass through single adapters
    model.active_adapters = "a"
    model.active_head = "a"
    outputs_a = model(**{k: v[:1] for k, v in inputs.items()})
    model.active_adapters = "b"
    model.active_head = "b"
    outputs_b = model(**{k: v[1:] for k, v in inputs.items()})

    model.set_active_adapters(BatchSplit("a", "b", batch_sizes=[1, 1]))
    output = model(**inputs)

    self.assertEqual(2, len(output))
    self.assertTrue(
        torch.allclose(
            output[0]["logits"],
            outputs_a["logits"],
            atol=1e-05,
        )
    )
    self.assertTrue(
        torch.allclose(
            output[1]["logits"],
            outputs_b["logits"],
            atol=1e-05,
        )
    )
def test_parallel_training_single_forward_pass(self):
    model = AutoModelWithHeads.from_config(self.config())
    model.eval()

    a1, a2 = self.create_twin_adapters(model, "a")
    b1, b2 = self.create_twin_adapters(model, "b")

    # the twin adapters should start from identical weights
    state_dict = model.state_dict()
    for k, v in state_dict.items():
        if a1 in k:
            self.assertTrue(torch.equal(v, state_dict[k.replace(a1, a2)]))
        if b1 in k:
            self.assertTrue(torch.equal(v, state_dict[k.replace(b1, b2)]))

    input_data = self.get_input_samples((3, 128), config=model.config)
    if isinstance(model, T5ModelWithHeads):
        input_data["labels"] = torch.randint(0, 2, (3, 128))
    else:
        input_data["labels"] = torch.randint(0, 2, (3, 1))

    # for reference, pass the batch through each adapter separately
    outputs = []
    for adapter in [a1, b1]:
        model.active_head = adapter
        model.set_active_adapters(adapter)
        model.train_adapter(adapter)
        model.eval()
        outputs.append(model(**input_data))

    # a single parallel forward pass should produce the same outputs
    model.set_active_adapters(Parallel(a2, b2))
    model.train_adapter(Parallel(a2, b2))
    model.eval()

    parallel_outputs = model(**input_data)

    for out1, out2 in zip(outputs, parallel_outputs.head_outputs):
        self.assertTrue(torch.allclose(out1["loss"], out2["loss"]))
        self.assertTrue(torch.allclose(out1["logits"], out2["logits"], atol=1e-5))
def test_batch_split_training(self):
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())

    model.add_adapter("mrpc1")
    model.add_adapter("mrpc2")
    self.add_head(model, "mrpc1")
    self.add_head(model, "mrpc2")
    adapter_setup = BatchSplit("mrpc1", "mrpc2", batch_sizes=[1, 1])
    model.active_adapters = adapter_setup
    model.train_adapter(adapter_setup)

    # all weights of both adapters in the batch split should be activated for training
    for k, v in filter_parameters(model, "adapters.mrpc1.").items():
        self.assertTrue(v.requires_grad, k)
    for k, v in filter_parameters(model, "adapters.mrpc2.").items():
        self.assertTrue(v.requires_grad, k)
    # weights of the base model should be frozen (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    state_dict_pre = copy.deepcopy(model.state_dict())

    self.trainings_run(model, tokenizer)

    # adapter and head weights should have changed, everything else should stay the same
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "mrpc" in k1:
            self.assertFalse(torch.equal(v1, v2))
        else:
            self.assertTrue(torch.equal(v1, v2))
def test_parallel_training_equivalent_to_single_adapters(self):
    model = AutoModelWithHeads.from_config(self.config())
    model.eval()

    a1, a2 = self.create_twin_adapters(model, "a")
    b1, b2 = self.create_twin_adapters(model, "b")

    dataset = []
    for i in range(3):
        input_data = self.get_input_samples((3, 128), config=model.config)
        if isinstance(model, T5ModelWithHeads):
            input_data["labels"] = torch.randint(0, 2, (3, 128))
        else:
            input_data["labels"] = torch.randint(0, 2, (3, 1))
        dataset.append(input_data)

    # train the first copy of each adapter separately
    for adapter in [a1, b1]:
        model.active_head = adapter
        model.set_active_adapters(adapter)
        model.train_adapter(adapter)
        model.eval()

        model = self.train_model(model, dataset)

    # train the second copies in a parallel setup
    model.set_active_adapters(Parallel(a2, b2))
    model.train_adapter(Parallel(a2, b2))
    model.eval()

    model = self.train_model(model, dataset)

    # both training schemes should result in (approximately) the same weights
    state_dict = model.state_dict()
    for k, v in state_dict.items():
        if a1 in k:
            self.assertTrue(torch.allclose(v, state_dict[k.replace(a1, a2)], atol=1e-5))
        if b1 in k:
            self.assertTrue(torch.allclose(v, state_dict[k.replace(b1, b2)], atol=1e-5))