def test_nested_parallel(self):
    self.model.set_active_adapters(
        Stack("a", Parallel(Stack("b", "c"), "d")))

    inputs = {}
    inputs["input_ids"] = ids_tensor((1, 128), 1000)
    logits = self.model(**inputs).logits
    # Parallel replicates the input for each of its two branches,
    # so a batch of 1 produces 2 rows of logits.
    self.assertEqual(logits.shape, (2, 2))
def test_stacked_fusion(self):
    self.model.add_fusion(Fuse("b", "d"))

    # fuse the outputs of two stacks
    self.model.set_active_adapters(
        Fuse(Stack("a", "b"), Stack("c", "d")))
    self.training_pass()
def test_training_load_best_model_at_end_adapter(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    data_args = GlueDataTrainingArguments(
        task_name="mrpc",
        data_dir="./tests/fixtures/tests_samples/MRPC",
        overwrite_cache=True,
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    model.add_adapter("adapter")
    model.train_adapter("adapter")

    training_args = TrainingArguments(
        output_dir="./examples",
        do_train=True,
        learning_rate=0.001,
        max_steps=1,
        save_steps=1,
        remove_unused_columns=False,
        load_best_model_at_end=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=2,
    )
    trainer = AdapterTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    with self.assertLogs(logger) as cm:
        trainer.train()
        self.assertTrue(any("Loading best adapter(s) from" in line for line in cm.output))
    self.assertEqual(Stack("adapter"), trainer.model.active_adapters)
def test_mixed_stack(self):
    self.model.add_fusion(Fuse("a", "b"))

    self.model.set_active_adapters(
        Stack("a", Split("c", "d", split_index=64), Fuse("a", "b")))
    self.training_pass()
def test_general(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    data_args = GlueDataTrainingArguments(
        task_name="mrpc",
        data_dir="./tests/fixtures/tests_samples/MRPC",
        overwrite_cache=True,
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")

    model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
    model.add_classification_head("task", num_labels=3)

    # add two adapters; only "task" is activated for training
    model.add_adapter("task")
    model.add_adapter("additional_adapter")
    model.train_adapter("task")

    self.assertEqual("task", model.active_head)
    self.assertEqual(Stack("task"), model.active_adapters)

    with TemporaryDirectory() as tempdir:
        training_args = TrainingArguments(
            output_dir=tempdir,
            do_train=True,
            learning_rate=0.1,
            logging_steps=1,
            max_steps=1,
            save_steps=1,
            remove_unused_columns=False,
        )
        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )

        trainer.train()

        # Check that the adapters are actually saved, but the full model is not
        files_dir_checkpoint = os.listdir(os.path.join(tempdir, "checkpoint-1"))
        self.assertTrue("task" in files_dir_checkpoint)
        self.assertTrue("additional_adapter" in files_dir_checkpoint)
        # Check that the full model weights are not stored
        self.assertFalse("pytorch_model.bin" in files_dir_checkpoint)
        # remove_unused_columns should always be False in the AdapterTrainer
        self.assertFalse(trainer.args.remove_unused_columns)

    self.assertEqual("task", model.active_head)
    self.assertEqual(Stack("task"), model.active_adapters)
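
# A minimal sketch, assuming the checkpoint layout asserted in test_general
# above: each adapter is written to its own subdirectory of checkpoint-1 and
# no pytorch_model.bin is saved. The "output" path is a placeholder for a
# persisted output_dir; load_adapter returns the registered adapter name.
import os

from transformers import AutoModelWithHeads

model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
adapter_name = model.load_adapter(os.path.join("output", "checkpoint-1", "task"))
model.set_active_adapters(adapter_name)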
def test_batch_split_adapter_head(self):
    model = AutoModelWithHeads.from_config(self.config())
    self.add_head(model, "a")
    self.add_head(model, "b")
    model.add_adapter("a")
    model.add_adapter("b")
    model.add_adapter("c")
    model.set_active_adapters(
        BatchSplit(Stack("c", "a"), "b", batch_sizes=[2, 1]))

    in_data = self.get_input_samples((3, 128), config=model.config)
    out = model(**in_data)

    self.assertEqual(2, len(out))
    self.assertTrue(isinstance(model.active_head, BatchSplit))
def test_stacked_split(self):
    # split into two stacks
    self.model.set_active_adapters(
        Split(Stack("a", "b"), Stack("c", "d"), split_index=64))
    self.training_pass()
def test_to_deep(self):
    # nesting composition blocks beyond the supported depth must be rejected
    self.assertRaises(
        ValueError,
        lambda: parse_composition(Stack("a", Fuse("b", Stack(Fuse("c", "d"), "e")))),
    )
def test_parse_lists(self):
    self.assertEqual(Stack("a"), parse_composition("a"))
    self.assertEqual(Stack("a", "b", "c"), parse_composition(["a", "b", "c"]))
    self.assertEqual(Stack("a", Fuse("b", "c")), parse_composition(["a", ["b", "c"]]))
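
# A hedged, minimal sketch of how the composition blocks exercised above are
# used on a model. The adapter names "a"/"b"/"c" are placeholders, and a real
# forward pass through Fuse would also require model.add_fusion(Fuse("b", "c")).
from transformers.adapters.composition import Fuse, Stack, parse_composition

model.add_adapter("a")
model.add_adapter("b")
model.add_adapter("c")

# A nested list is normalized into the same block tree by parse_composition,
# so these two calls activate an identical setup:
model.set_active_adapters(["a", ["b", "c"]])
model.set_active_adapters(Stack("a", Fuse("b", "c")))
assert parse_composition(["a", ["b", "c"]]) == Stack("a", Fuse("b", "c"))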
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}")
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc.) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. "
            "Check out the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement.")

    # Setup adapters
    if adapter_args.train_adapter:
        # new
        if data_args.madx2:
            # MAD-X 2.0: do not add an adapter in the last transformer layer
            leave_out = [len(model.bert.encoder.layer) - 1]
        else:
            leave_out = []
        # new
        # task_name = data_args.dataset_name or "squad"
        task_name = "qa"
        # optionally load a pretrained language adapter
        if adapter_args.load_lang_adapter:
            # resolve the language adapter config
            lang_adapter_config = AdapterConfig.load(
                adapter_args.lang_adapter_config,
                non_linearity=adapter_args.lang_adapter_non_linearity,
                reduction_factor=adapter_args.lang_adapter_reduction_factor,
                leave_out=leave_out,
            )
            # # load the language adapter from the Hub
            # lang_adapter_name = model.load_adapter(
            #     adapter_args.load_lang_adapter,
            #     config=lang_adapter_config,
            #     load_as=adapter_args.language,
            # )
            # new
            # load the language adapter from the path given in load_lang_adapter
            task_mlm_load_as = "mlm"
            lang_adapter_name = model.load_adapter(
                adapter_args.load_lang_adapter,
                config=lang_adapter_config,
                load_as=task_mlm_load_as,
                with_head=False,
            )
        else:
            lang_adapter_name = None
        # check whether the adapter already exists, otherwise add it
        if task_name not in model.config.adapters:
            # # resolve the adapter config
            # adapter_config = AdapterConfig.load(
            #     adapter_args.adapter_config,
            #     non_linearity=adapter_args.adapter_non_linearity,
            #     reduction_factor=adapter_args.adapter_reduction_factor,
            # )
            # new
            # resolve the adapter config, optionally with the MAD-X 2.0 leave_out option
            if adapter_args.adapter_config == "pfeiffer":
                from transformers.adapters.configuration import PfeifferConfig
                adapter_config = PfeifferConfig(
                    non_linearity=adapter_args.adapter_non_linearity,
                    reduction_factor=adapter_args.adapter_reduction_factor,
                    leave_out=leave_out,
                )
            elif adapter_args.adapter_config == "pfeiffer+inv":
                from transformers.adapters.configuration import PfeifferInvConfig
                adapter_config = PfeifferInvConfig(
                    non_linearity=adapter_args.adapter_non_linearity,
                    reduction_factor=adapter_args.adapter_reduction_factor,
                    leave_out=leave_out,
                )
            elif adapter_args.adapter_config == "houlsby":
                from transformers.adapters.configuration import HoulsbyConfig
                adapter_config = HoulsbyConfig(
                    non_linearity=adapter_args.adapter_non_linearity,
                    reduction_factor=adapter_args.adapter_reduction_factor,
                    leave_out=leave_out,
                )
            elif adapter_args.adapter_config == "houlsby+inv":
                from transformers.adapters.configuration import HoulsbyInvConfig
                adapter_config = HoulsbyInvConfig(
                    non_linearity=adapter_args.adapter_non_linearity,
                    reduction_factor=adapter_args.adapter_reduction_factor,
                    leave_out=leave_out,
                )
            else:
                raise ValueError(
                    f"Unsupported adapter config: {adapter_args.adapter_config}")
            # load the adapter from the Hub if specified
            if adapter_args.load_adapter:
                model.load_adapter(
                    adapter_args.load_adapter,
                    config=adapter_config,
                    load_as=task_name,
                )
            else:
                model.add_adapter(task_name, config=adapter_config)
        # Freeze all model weights except those of the task adapter
        model.train_adapter(task_name)
        # Set the adapters to be used in every forward pass
        # (set after train_adapter, since train_adapter also resets the active setup)
        if lang_adapter_name:
            model.active_adapters = Stack(task_mlm_load_as, task_name)
        else:
            model.set_active_adapters(task_name)
    else:
        if adapter_args.load_adapter or adapter_args.load_lang_adapter:
            raise ValueError(
                "Adapters can only be loaded in adapter training mode. "
"Use --train_adapter to enable adapter_training") # new # Put only the adapter after the MHA but not after the FF in the last layer if data_args.houlsby_MHA_lastlayer and adapter_args.train_adapter and not data_args.madx2 and task_name in model.config.adapters and ( adapter_args.adapter_config == "houlsby" or adapter_args.adapter_config == "houlsby+inv"): import torch from torch.nn import ModuleDict model.bert.encoder.layer[len(model.bert.encoder.layer) - 1].output.adapters = ModuleDict() # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. if training_args.do_train: column_names = datasets["train"].column_names elif training_args.do_eval: column_names = datasets["validation"].column_names else: column_names = datasets["test"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if data_args.max_seq_length > tokenizer.model_max_length: logger.warn( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if data_args.pad_to_max_length else False, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. 
if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != (1 if pad_on_right else 0): token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != (1 if pad_on_right else 0): token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) return tokenized_examples if training_args.do_train: if "train" not in datasets: raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: # We will select sample from whole data if agument is specified train_dataset = train_dataset.select( range(data_args.max_train_samples)) # Create train feature from dataset train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples train_dataset = train_dataset.select( range(data_args.max_train_samples)) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if data_args.pad_to_max_length else False, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. 
tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) context_index = 1 if pad_on_right else 0 # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if training_args.do_eval: if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = datasets["validation"] if data_args.max_val_samples is not None: # We will select sample from whole data eval_examples = eval_examples.select( range(data_args.max_val_samples)) # Validation Feature Creation eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_val_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range( data_args.max_val_samples)) if training_args.do_predict: if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") test_examples = datasets["test"] if data_args.max_test_samples is not None: # We will select sample from whole data test_examples = test_examples.select( range(data_args.max_test_samples)) # Test Feature Creation test_dataset = test_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_test_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again test_dataset = test_dataset.select( range(data_args.max_test_samples)) # Data collator # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data # collator. data_collator = (default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding( tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions = postprocess_qa_predictions( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, null_score_diff_threshold=data_args.null_score_diff_threshold, output_dir=training_args.output_dir, is_world_process_zero=trainer.is_world_process_zero(), prefix=stage, ) # Format the result to the format the metric expects. 
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
                for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [
                {"id": k, "prediction_text": v} for k, v in predictions.items()
            ]
        references = [
            {"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples
        ]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")

    def compute_metrics(p: EvalPrediction):
        results = metric.compute(predictions=p.predictions, references=p.label_ids)
        # new (rename the metric "f1" to "eval_f1" to avoid a bug at evaluation time)
        results["eval_f1"] = results.pop("f1")
        return results

    # Initialize our Trainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        eval_examples=eval_examples if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
        do_save_full_model=not adapter_args.train_adapter,
        do_save_adapters=adapter_args.train_adapter,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=data_args.early_stopping_patience)
        ] if training_args.load_best_model_at_end else None,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples
            if data_args.max_train_samples is not None else len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()

        max_val_samples = (
            data_args.max_val_samples
            if data_args.max_val_samples is not None else len(eval_dataset))
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")
        results = trainer.predict(test_dataset, test_examples)
        metrics = results.metrics

        max_test_samples = (
            data_args.max_test_samples
            if data_args.max_test_samples is not None else len(test_dataset))
        metrics["test_samples"] = min(max_test_samples, len(test_dataset))

        trainer.log_metrics("test", metrics)
        trainer.save_metrics("test", metrics)
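
# A standalone sketch of the MAD-X 2.0 leave_out setup used in main() above:
# the last transformer layer receives no adapter. The model name and the
# reduction_factor value are illustrative, not taken from the script.
from transformers import AutoModelForQuestionAnswering
from transformers.adapters.configuration import PfeifferConfig

model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
last_layer = len(model.bert.encoder.layer) - 1  # 11 for a 12-layer BERT-base
adapter_config = PfeifferConfig(reduction_factor=16, leave_out=[last_layer])
model.add_adapter("qa", config=adapter_config)
model.train_adapter("qa")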
def test_nested_batch_split(self):
    self.model.set_active_adapters(
        Stack("a", BatchSplit("b", "c", batch_sizes=[2, 2])))
    self.batched_training_pass()
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}")
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc.) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (
        f"{data_args.task_name}_tags"
        if f"{data_args.task_name}_tags" in column_names else column_names[1])

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Setup adapters
    if adapter_args.train_adapter:
        # new
        if data_args.madx2:
            # MAD-X 2.0: do not add an adapter in the last transformer layer
            leave_out = [len(model.bert.encoder.layer) - 1]
        else:
            leave_out = []
        # new
        # task_name = data_args.dataset_name or "squad"
        task_name = "ner"
        # keep the config name around for the last-layer check below
        adapter_config_name = adapter_args.adapter_config
        # optionally load a pretrained language adapter
        if adapter_args.load_lang_adapter:
            # resolve the language adapter config
            lang_adapter_config = AdapterConfig.load(
                adapter_args.lang_adapter_config,
                non_linearity=adapter_args.lang_adapter_non_linearity,
                reduction_factor=adapter_args.lang_adapter_reduction_factor,
                leave_out=leave_out,
            )
            # # load the language adapter from the Hub
            # lang_adapter_name = model.load_adapter(
            #     adapter_args.load_lang_adapter,
            #     config=lang_adapter_config,
            #     load_as=adapter_args.language,
            # )
            # new
            # load the language adapter from the path given in load_lang_adapter
            task_mlm_load_as = "mlm"
            lang_adapter_name = model.load_adapter(
                adapter_args.load_lang_adapter,
                config=lang_adapter_config,
                load_as=task_mlm_load_as,
                with_head=False,
            )
        else:
            lang_adapter_name = None
        # check whether the adapter already exists, otherwise add it
        if task_name not in model.config.adapters:
            # # resolve the adapter config
            # adapter_config = AdapterConfig.load(
            #     adapter_args.adapter_config,
            #     non_linearity=adapter_args.adapter_non_linearity,
            #     reduction_factor=adapter_args.adapter_reduction_factor,
            # )
            # new
            # resolve the adapter config, optionally with the MAD-X 2.0 leave_out option
            if adapter_args.adapter_config == "pfeiffer":
                from transformers.adapters.configuration import PfeifferConfig
                adapter_config = PfeifferConfig(
                    non_linearity=adapter_args.adapter_non_linearity,
                    reduction_factor=adapter_args.adapter_reduction_factor,
                    leave_out=leave_out,
                )
            elif adapter_args.adapter_config == "pfeiffer+inv":
                from transformers.adapters.configuration import PfeifferInvConfig
                adapter_config = PfeifferInvConfig(
                    non_linearity=adapter_args.adapter_non_linearity,
                    reduction_factor=adapter_args.adapter_reduction_factor,
                    leave_out=leave_out,
                )
            elif adapter_args.adapter_config == "houlsby":
                from transformers.adapters.configuration import HoulsbyConfig
                adapter_config = HoulsbyConfig(
                    non_linearity=adapter_args.adapter_non_linearity,
                    reduction_factor=adapter_args.adapter_reduction_factor,
                    leave_out=leave_out,
                )
            elif adapter_args.adapter_config == "houlsby+inv":
                from transformers.adapters.configuration import HoulsbyInvConfig
                adapter_config = HoulsbyInvConfig(
                    non_linearity=adapter_args.adapter_non_linearity,
                    reduction_factor=adapter_args.adapter_reduction_factor,
                    leave_out=leave_out,
                )
            else:
                raise ValueError(
                    f"Unsupported adapter config: {adapter_args.adapter_config}")
            # load the adapter from the Hub if specified
            if adapter_args.load_adapter:
                model.load_adapter(
                    adapter_args.load_adapter,
                    config=adapter_config,
                    load_as=task_name,
                )
            else:
                model.add_adapter(task_name, config=adapter_config)
        # Freeze all model weights except those of the task adapter
        model.train_adapter(task_name)
        # Set the adapters to be used in every forward pass
        # (set after train_adapter, since train_adapter also resets the active setup)
        if lang_adapter_name:
            model.active_adapters = Stack(task_mlm_load_as, task_name)
        else:
            model.set_active_adapters(task_name)
    else:
        if adapter_args.load_adapter or adapter_args.load_lang_adapter:
            raise ValueError(
                "Adapters can only be loaded in adapter training mode. "
                "Use --train_adapter to enable adapter training.")

    # new
    # Keep only the adapter after the multi-head attention, but not after the
    # feed-forward block, in the last layer (Houlsby adapters only)
    if (data_args.houlsby_MHA_lastlayer and adapter_args.train_adapter and not data_args.madx2
            and task_name in model.config.adapters
            and adapter_config_name in ("houlsby", "houlsby+inv")):
        from torch.nn import ModuleDict
        # resetting output.adapters removes the adapter after the feed-forward block
        model.bert.encoder.layer[len(model.bert.encoder.layer) - 1].output.adapters = ModuleDict()

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. "
            "Check out the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement.")

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
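                # (Illustration: a word split into wordpieces such as
                # ["wash", "##ing"] shares one word id, so only the first
                # subtoken keeps its label; the rest get -100 unless the
                # label_all_tokens flag is set.)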
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
        eval_dataset = eval_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_predict:
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        test_dataset = datasets["test"]
        if data_args.max_test_samples is not None:
            test_dataset = test_dataset.select(range(data_args.max_test_samples))
        test_dataset = test_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    data_collator = DataCollatorForTokenClassification(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            # new ("eval_" prefix so the keys match what the Trainer expects)
            return {
                "eval_precision": results["overall_precision"],
                "eval_recall": results["overall_recall"],
                "eval_f1": results["overall_f1"],
                "eval_accuracy": results["overall_accuracy"],
            }

    # new (callbacks)
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        do_save_full_model=not adapter_args.train_adapter,
        do_save_adapters=adapter_args.train_adapter,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=data_args.early_stopping_patience)
        ] if training_args.load_best_model_at_end else None,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload

        max_train_samples = (
            data_args.max_train_samples
            if data_args.max_train_samples is not None else len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()

        max_val_samples = (
            data_args.max_val_samples
            if data_args.max_val_samples is not None else len(eval_dataset))
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        trainer.log_metrics("test", metrics)
        trainer.save_metrics("test", metrics)

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")
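
# A hedged inference sketch for the setup trained above: reload the saved
# language ("mlm") and task ("ner") adapters and stack them, mirroring
# model.active_adapters = Stack(task_mlm_load_as, task_name) in main().
# The paths are placeholders for directories produced by trainer.save_model(),
# and num_labels must match the value used during training (9 is a placeholder).
from transformers import AutoModelForTokenClassification
from transformers.adapters.composition import Stack

model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)
mlm = model.load_adapter("/path/to/output_dir/mlm", with_head=False)
ner = model.load_adapter("/path/to/output_dir/ner")
model.active_adapters = Stack(mlm, ner)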