def run(model, description): run_id = str(uuid4()) model.run(run_id) K.clear_session() model_instance = evaluate.load(os.path.join( config.MODEL_DIR, "{}-{}.h5".format(run_id, model.MODEL_NAME), )) train, validation, test_data = data() train_data_stats = characterize_data(train) validation_data_stats = characterize_data(validation) test_data_stats = characterize_data(test_data) results = test(model_instance, train, validation, test_data) result = Result( model.MODEL_NAME, run_id, train_data_stats, validation_data_stats, test_data_stats, description, **results ) db.session.add(result) db.session.commit()
def predict(image): ''' Write your code for prediction here. ''' import evaluate answer = 'xyzabc' # sample needs to be modified img = image model = evaluate.load('./merge4.h5') test_data, test_labels = evaluate.load_data() test = test_data #train_l=train_labels test_l = test_labels test_data, test_labels = evaluate.preprocessing(test_data, test_labels) prediction = evaluate.predictor(model, test_data, 250) accuracy = evaluate.test_accuracy(prediction, test_l) best_acc, character = evaluate.best_accuracy(prediction, test_l) print(best_acc) thresh, im = evaluate.thresholding(image) gsblur = evaluate.cleaning(thresh) answer = evaluate.extract(model, gsblur, im, character) print(answer) return answer
def run(model, description, input_form, label_form="shrunk", split_id=None, loaded_data=None, hyperparameters=dict()): run_id = uuid4() if split_id is None: split_id = run_id history = model.run(run_id, input_form=input_form, loaded_data=loaded_data, label_form=label_form, hyperparameters=hyperparameters) K.clear_session() model_instance = evaluate.load( os.path.join( config.MODEL_DIR, "{}-{}.h5".format(str(run_id), model.MODEL_NAME), )) if loaded_data is None: train, validation, test = data(split_id, input_form=input_form, label_form=label_form, balanced_train=True) else: train, validation, test = loaded_data train.reset() validation.reset() test.reset() train_data_stats = characterize_data(train) validation_data_stats = characterize_data(validation) test_data_stats = characterize_data(test) results = test_model(model_instance, train, validation, test) training.reset() validation.reset() test.reset() result = Result(model.MODEL_NAME, str(run_id), str(split_id), train_data_stats, validation_data_stats, test_data_stats, description, input_form, label=label_form, hyperparameters=hyperparameters, history=history, **results) db.session.add(result) db.session.commit()
def extract_tuples_with_simplification(source_path, save_path): documents = load(source_path) documents_ = [] with open(save_path, "w", encoding="utf-8") as fo: for document in documents: doc = [sent for sent, _ in document] sentence_tuples = defaultdict(list) for tuple_ in process_document(doc, True): sentence_tuples[tuple_.predicate.sentence].append(tuple_) document_ = [] for sent in doc: tuples = sentence_tuples.get(sent) if tuples is None: tuples = [] document_.append((sent, tuples)) documents_.append(document_) with open(save_path, "wb") as fo: pickle.dump(documents_, fo)
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_summarization", model_args, data_args, framework="tensorflow") # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO) datasets.utils.logging.set_verbosity(logging.INFO) transformers.utils.logging.set_verbosity(logging.INFO) # Log on each process the small summary: logger.info(f"Training/evaluation parameters {training_args}") # endregion # region T5 special-casing if data_args.source_prefix is None and model_args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with " "`--source_prefix 'summarize: ' `" ) # endregion # region Detecting last checkpoint last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # endregion # Set seed before initializing model. set_seed(training_args.seed) # region Load datasets # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files this script will use the first column for the full texts and the second column for the # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion # region Load model config and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # endregion # region Dataset preprocessing # We need to tokenize inputs and targets. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, and/or `do_eval`.") return # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if text_column not in column_names: raise ValueError( f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.summary_column is None: summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False def preprocess_function(examples): inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Tokenize targets with the `text_target` keyword argument labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] ] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first(desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) else: train_dataset = None if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) with training_args.main_process_first(desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) else: eval_dataset = None # endregion # region Text preprocessing def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels # endregion with training_args.strategy.scope(): # region Prepare model model = TFAutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model.resize_token_embeddings(len(tokenizer)) # endregion # region Prepare TF Dataset objects if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas tf_train_dataset = dataset_to_tf( train_dataset, model, tokenizer, total_batch_size=total_train_batch_size, num_epochs=training_args.num_train_epochs, shuffle=True, ) tf_eval_dataset = dataset_to_tf( eval_dataset, model, tokenizer, total_eval_batch_size, num_epochs=1, shuffle=False, ) # endregion # region Optimizer, loss and LR scheduling # Scheduler and math around the number of training steps. num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0 ) def masked_sparse_categorical_crossentropy(y_true, y_pred): # We clip the negative labels to 0 to avoid NaNs appearing in the output and # fouling up everything that comes afterwards. The loss values corresponding to clipped values # will be masked later anyway, but even masked NaNs seem to cause overflows for some reason. # 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely # event that you have more than 1 million tokens in your vocabulary, consider increasing this value. # More pragmatically, consider redesigning your tokenizer. losses = tf.keras.losses.sparse_categorical_crossentropy( tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True ) # Compute the per-sample loss only over the unmasked tokens losses = tf.ragged.boolean_mask(losses, y_true != -100) losses = tf.reduce_mean(losses, axis=-1) return losses # endregion # region Metric metric = evaluate.load("rouge") # endregion # region Training model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer) if training_args.do_train: logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") logger.info(f" Total train batch size = {total_train_batch_size}") logger.info(f" Total optimization steps = {num_train_steps}") model.fit( tf_train_dataset, epochs=int(training_args.num_train_epochs), steps_per_epoch=num_update_steps_per_epoch, ) # endregion # region Validation if data_args.val_max_target_length is None: data_args.val_max_target_length = data_args.max_target_length gen_kwargs = { "max_length": data_args.val_max_target_length if data_args is not None else config.max_length, "num_beams": data_args.num_beams, } if training_args.do_eval: logger.info("Evaluation...") for batch, labels in tqdm( tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size ): batch.update(gen_kwargs) generated_tokens = model.generate(**batch) if isinstance(generated_tokens, tuple): generated_tokens = generated_tokens[0] decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) metric.add_batch(predictions=decoded_preds, references=decoded_labels) result = metric.compute(use_stemmer=True) result = {k: round(v * 100, 4) for k, v in result.items()} logger.info(result) # endregion if training_args.output_dir is not None: model.save_pretrained(training_args.output_dir)
def xrun(fold, loaded_data, model, description, input_form, label_form="outcome", split_id=None, hyperparameters=dict()): # gen random id for this specific parameter run run_id = uuid4() if split_id is None: split_id = run_id # calling run function on the cross validation v1 model history = model.run(run_id, mode='cross', loaded_data=loaded_data, input_form=input_form, label_form=label_form, hyperparameters=hyperparameters) # clearing the training session because we will be running multiple models K.clear_session() # load the model instance - this takes awhile model_instance = evaluate.load( os.path.join( config.MODEL_DIR, "{}-{}.h5".format(str(run_id), model.MODEL_NAME), )) # fold_train, fold_validation, fold_test, fold_holdout_test = loaded_data fold_train, fold_validation, fold_test = loaded_data fold_train.reset() fold_validation.reset() fold_test.reset() # fold_holdout_test.reset() train_data_stats = characterize_data(fold_train) validation_data_stats = characterize_data(fold_validation) test_data_stats = characterize_data(fold_test) # holdout_test_data_stats = characterize_data(fold_holdout_test) # testing of the data # results = test_model(model_instance, fold_train, fold_validation, fold_test, fold_holdout_test) results = test_model(model_instance, fold_train, fold_validation, fold_test) fold_train.reset() fold_validation.reset() fold_test.reset() # fold_holdout_test.reset() holdout_test_data_stats = 'na' result = XResult(fold, config.NUMBER_OF_FOLDS, model.MODEL_NAME, str(run_id), str(split_id), train_data_stats, validation_data_stats, test_data_stats, holdout_test_data_stats, description, input_form, label_form, hyperparameters=hyperparameters, history=history, **results) db.session.add(result) db.session.commit()
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_seq2seq_qa", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model.resize_token_embeddings(len(tokenizer)) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) # Preprocessing the datasets. # We need to generate and tokenize inputs and targets. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names elif training_args.do_predict: column_names = raw_datasets["test"].column_names else: logger.info( "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`." ) return # Get the column names for input/target. dataset_columns = question_answering_column_name_mapping.get( data_args.dataset_name, None) if data_args.question_column is None: question_column = dataset_columns[ 0] if dataset_columns is not None else column_names[0] else: question_column = data_args.question_column if question_column not in column_names: raise ValueError( f"--question_column' value '{data_args.question_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.context_column is None: context_column = dataset_columns[ 1] if dataset_columns is not None else column_names[1] else: context_column = data_args.context_column if context_column not in column_names: raise ValueError( f"--context_column' value '{data_args.context_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.answer_column is None: answer_column = dataset_columns[ 2] if dataset_columns is not None else column_names[2] else: answer_column = data_args.answer_column if answer_column not in column_names: raise ValueError( f"--answer_column' value '{data_args.answer_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_answer_length for training. max_answer_length = data_args.max_answer_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr( model, "prepare_decoder_input_ids_from_labels"): logger.warning( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_squad_batch( examples, question_column: str, context_column: str, answer_column: str, ) -> Tuple[List[str], List[str]]: questions = examples[question_column] contexts = examples[context_column] answers = examples[answer_column] def generate_input(_question, _context): return " ".join([ "question:", _question.lstrip(), "context:", _context.lstrip() ]) inputs = [ generate_input(question, context) for question, context in zip(questions, contexts) ] targets = [ answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers ] return inputs, targets def preprocess_function(examples): inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True) # Tokenize targets with text_target=... labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs # Validation preprocessing def preprocess_validation_function(examples): inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) model_inputs = tokenizer( inputs, max_length=max_seq_length, padding=padding, truncation=True, return_overflowing_tokens=True, return_offsets_mapping=True, ) # Tokenize targets with the `text_target` keyword argument labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = model_inputs.pop("overflow_to_sample_mapping") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. model_inputs["example_id"] = [] for i in range(len(model_inputs["input_ids"])): # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] model_inputs["example_id"].append(examples["id"][sample_index]) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: # We will select sample from whole data if agument is specified max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) # Create train feature from dataset with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if data_args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) if training_args.do_eval: if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if data_args.max_eval_samples is not None: # We will select sample from whole data max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) eval_examples = eval_examples.select(range(max_eval_samples)) # Validation Feature Creation with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_examples.map( preprocess_validation_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if data_args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) if training_args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if data_args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select( range(data_args.max_predict_samples)) # Predict Feature Creation with training_args.main_process_first( desc="prediction dataset map pre-processing"): predict_dataset = predict_examples.map( preprocess_validation_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select( range(max_predict_samples)) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) metric = evaluate.load( "squad_v2" if data_args.version_2_with_negative else "squad") def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # Post-processing: def post_processing_function(examples: datasets.Dataset, features: datasets.Dataset, outputs: EvalLoopOutput, stage="eval"): # Decode the predicted tokens. preds = outputs.predictions if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) # Build a map example to its corresponding features. example_id_to_index = {k: i for i, k in enumerate(examples["id"])} feature_per_example = { example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features) } predictions = {} # Let's loop over all the examples! for example_index, example in enumerate(examples): # This is the index of the feature associated to the current example. feature_index = feature_per_example[example_index] predictions[example["id"]] = decoded_preds[feature_index] # Format the result to the format the metric expects. if data_args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": 0.0 } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column] } for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) # Initialize our Trainer trainer = QuestionAnsweringSeq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, eval_examples=eval_examples if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, post_process_function=post_processing_function, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} max_length = (training_args.generation_max_length if training_args.generation_max_length is not None else data_args.val_max_answer_length) num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Prediction if training_args.do_predict: logger.info("*** Predict ***") results = trainer.predict(predict_dataset, predict_examples) metrics = results.metrics max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) if training_args.push_to_hub: kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "question-answering" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name trainer.push_to_hub(**kwargs)
def main(): # Parse the arguments args = parse_args() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_translation_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment accelerator = ( Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() ) # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if args.config_name: config = AutoConfig.from_pretrained(args.config_name) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if args.model_name_or_path: model = AutoModelForSeq2SeqLM.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForSeq2SeqLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): assert ( args.target_lang is not None and args.source_lang is not None ), "mBart requires --target_lang and --source_lang" if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[args.target_lang] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(args.target_lang) if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") prefix = args.source_prefix if args.source_prefix is not None else "" # Preprocessing the datasets. # First we tokenize all the texts. column_names = raw_datasets["train"].column_names # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): if args.source_lang is not None: tokenizer.src_lang = args.source_lang if args.target_lang is not None: tokenizer.tgt_lang = args.target_lang # Get the language codes for input/target. source_lang = args.source_lang.split("_")[0] target_lang = args.target_lang.split("_")[0] padding = "max_length" if args.pad_to_max_length else False # Temporarily set max_target_length for training. max_target_length = args.max_target_length padding = "max_length" if args.pad_to_max_length else False def preprocess_function(examples): inputs = [ex[source_lang] for ex in examples["translation"]] targets = [ex[target_lang] for ex in examples["translation"]] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) # Tokenize targets with the `text_target` keyword argument labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and args.ignore_pad_token_for_loss: labels["input_ids"] = [ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] ] model_inputs["labels"] = labels["input_ids"] return model_inputs with accelerator.main_process_first(): processed_datasets = raw_datasets.map( preprocess_function, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if accelerator.use_fp16 else None, ) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size ) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, { "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader, lr_scheduler ) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): checkpointing_steps = args.checkpointing_steps if args.checkpointing_steps.isdigit(): checkpointing_steps = int(args.checkpointing_steps) else: checkpointing_steps = None # We need to initialize the trackers we use, and also store our configuration. # We initialize the trackers only on main process because `accelerator.log` # only logs on main process and we don't want empty logs/runs on other processes. if args.with_tracking: if accelerator.is_main_process: experiment_config = vars(args) # TensorBoard cannot log Enums, need the raw value experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value accelerator.init_trackers("translation_no_trainer", experiment_config) metric = evaluate.load("sacrebleu") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [[label.strip()] for label in labels] return preds, labels # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last # Extract `epoch_{i}` or `step_{i}` training_difference = os.path.splitext(path)[0] if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None else: # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) # update the progress_bar if load from checkpoint progress_bar.update(starting_epoch * num_update_steps_per_epoch) completed_steps = starting_epoch * num_update_steps_per_epoch for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == starting_epoch: if resume_step is not None and step < resume_step: if step % args.gradient_accumulation_steps == 0: progress_bar.update(1) completed_steps += 1 continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if completed_steps >= args.max_train_steps: break model.eval() if args.val_max_target_length is None: args.val_max_target_length = args.max_target_length gen_kwargs = { "max_length": args.val_max_target_length if args is not None else config.max_length, "num_beams": args.num_beams, } samples_seen = 0 for step, batch in enumerate(eval_dataloader): with torch.no_grad(): generated_tokens = accelerator.unwrap_model(model).generate( batch["input_ids"], attention_mask=batch["attention_mask"], **gen_kwargs, ) generated_tokens = accelerator.pad_across_processes( generated_tokens, dim=1, pad_index=tokenizer.pad_token_id ) labels = batch["labels"] if not args.pad_to_max_length: # If we did not pad to max length, we need to pad the labels too labels = accelerator.pad_across_processes(batch["labels"], dim=1, pad_index=tokenizer.pad_token_id) generated_tokens = accelerator.gather(generated_tokens).cpu().numpy() labels = accelerator.gather(labels).cpu().numpy() if args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) # If we are in a multiprocess environment, the last batch has duplicates if accelerator.num_processes > 1: if step == len(eval_dataloader) - 1: decoded_preds = decoded_preds[: len(eval_dataloader.dataset) - samples_seen] decoded_labels = decoded_labels[: len(eval_dataloader.dataset) - samples_seen] else: samples_seen += len(decoded_labels) metric.add_batch(predictions=decoded_preds, references=decoded_labels) eval_metric = metric.compute() logger.info({"bleu": eval_metric["score"]}) if args.with_tracking: accelerator.log( { "bleu": eval_metric["score"], "train_loss": total_loss.item() / len(train_dataloader), "epoch": epoch, "step": completed_steps, }, step=completed_steps, ) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save ) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) if args.checkpointing_steps == "epoch": output_dir = f"epoch_{epoch}" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save ) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: json.dump({"eval_bleu": eval_metric["score"]}, f)
def main(): # region Argument Parsing parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_ner", model_args, data_args, framework="tensorflow") # endregion # region Setup logging # we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel(logging.INFO) datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() # If passed along, set the training seed now. if training_args.seed is not None: set_seed(training_args.seed) # endregion # region Loading datasets # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called # 'tokens' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. if raw_datasets["train"] is not None: column_names = raw_datasets["train"].column_names features = raw_datasets["train"].features else: column_names = raw_datasets["validation"].column_names features = raw_datasets["validation"].features if data_args.text_column_name is not None: text_column_name = data_args.text_column_name elif "tokens" in column_names: text_column_name = "tokens" else: text_column_name = column_names[0] if data_args.label_column_name is not None: label_column_name = data_args.label_column_name elif f"{data_args.task_name}_tags" in column_names: label_column_name = f"{data_args.task_name}_tags" else: label_column_name = column_names[1] # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the # unique labels. def get_label_list(labels): unique_labels = set() for label in labels: unique_labels = unique_labels | set(label) label_list = list(unique_labels) label_list.sort() return label_list if isinstance(features[label_column_name].feature, ClassLabel): label_list = features[label_column_name].feature.names # No need to convert the labels since they are already ints. label_to_id = {i: i for i in range(len(label_list))} else: label_list = get_label_list(raw_datasets["train"][label_column_name]) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) # endregion # region Load config and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, num_labels=num_labels) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, num_labels=num_labels) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path if not tokenizer_name_or_path: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if config.model_type in {"gpt2", "roberta"}: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, add_prefix_space=True) else: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True) # endregion # region Preprocessing the raw datasets # First we tokenize all the texts. padding = "max_length" if data_args.pad_to_max_length else False # Tokenize all texts and align the labels with them. def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples[text_column_name], max_length=data_args.max_length, padding=padding, truncation=True, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, ) labels = [] for i, label in enumerate(examples[label_column_name]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: label_ids.append(-100) # We set the label for the first token of each word. elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: label_ids.append(label_to_id[label[word_idx]] if data_args. label_all_tokens else -100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs processed_raw_datasets = raw_datasets.map( tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_raw_datasets["train"] eval_dataset = processed_raw_datasets["validation"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # endregion with training_args.strategy.scope(): # region Initialize model if model_args.model_name_or_path: model = TFAutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, config=config, ) else: logger.info("Training new model from scratch") model = TFAutoModelForTokenClassification.from_config(config) model.resize_token_embeddings(len(tokenizer)) # endregion # region Create TF datasets # We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as # well as inputs. collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas dataset_options = tf.data.Options() dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names # yourself if you use this method, whereas they are automatically inferred from the model input names when # using model.prepare_tf_dataset() # For more info see the docs: # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset tf_train_dataset = model.prepare_tf_dataset( train_dataset, collate_fn=collate_fn, batch_size=total_train_batch_size, shuffle=True, ).with_options(dataset_options) total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas tf_eval_dataset = model.prepare_tf_dataset( eval_dataset, collate_fn=collate_fn, batch_size=total_eval_batch_size, shuffle=False, ).with_options(dataset_options) # endregion # region Optimizer, loss and compilation num_train_steps = int( len(tf_train_dataset) * training_args.num_train_epochs) if training_args.warmup_steps > 0: num_warmup_steps = training_args.warmup_steps elif training_args.warmup_ratio > 0: num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) else: num_warmup_steps = 0 optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, adam_global_clipnorm=training_args.max_grad_norm, ) model.compile(optimizer=optimizer, jit_compile=training_args.xla) # endregion # Metrics metric = evaluate.load("seqeval") def get_labels(y_pred, y_true): # Transform predictions and references tensos to numpy arrays # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(pred, gold_label) if l != -100 ] for pred, gold_label in zip(y_pred, y_true)] true_labels = [[ label_list[l] for (p, l) in zip(pred, gold_label) if l != -100 ] for pred, gold_label in zip(y_pred, y_true)] return true_predictions, true_labels def compute_metrics(): results = metric.compute() if data_args.return_entity_level_metrics: # Unpack nested dictionaries final_results = {} for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): final_results[f"{key}_{n}"] = v else: final_results[key] = value return final_results else: return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } # endregion # region Preparing push_to_hub and model card push_to_hub_model_id = training_args.push_to_hub_model_id model_name = model_args.model_name_or_path.split("/")[-1] if not push_to_hub_model_id: if data_args.dataset_name is not None: push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" else: push_to_hub_model_id = f"{model_name}-finetuned-token-classification" model_card_kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "token-classification" } if data_args.dataset_name is not None: model_card_kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: model_card_kwargs[ "dataset_args"] = data_args.dataset_config_name model_card_kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: model_card_kwargs["dataset"] = data_args.dataset_name if training_args.push_to_hub: callbacks = [ PushToHubCallback( output_dir=training_args.output_dir, model_id=push_to_hub_model_id, organization=training_args.push_to_hub_organization, token=training_args.push_to_hub_token, tokenizer=tokenizer, **model_card_kwargs, ) ] else: callbacks = [] # endregion # region Training logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}" ) logger.info(f" Total train batch size = {total_train_batch_size}") # Only show the progress bar once on each machine. model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) # endregion # region Predictions # If you have variable batch sizes (i.e. not using pad_to_max_length), then # this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying seq # length from predict(). try: predictions = model.predict( tf_eval_dataset, batch_size=training_args.per_device_eval_batch_size)["logits"] except tf.python.framework.errors_impl.InvalidArgumentError: raise ValueError( "Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older " "then you will need to use --pad_to_max_length to generate predictions, as older " "versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor." ) if isinstance(predictions, tf.RaggedTensor): predictions = predictions.to_tensor(default_value=-100) predictions = tf.math.argmax(predictions, axis=-1).numpy() if "label" in eval_dataset: labels = eval_dataset.with_format("tf")["label"] else: labels = eval_dataset.with_format("tf")["labels"] if isinstance(labels, tf.RaggedTensor): labels = labels.to_tensor(default_value=-100) labels = labels.numpy() attention_mask = eval_dataset.with_format("tf")["attention_mask"] if isinstance(attention_mask, tf.RaggedTensor): attention_mask = attention_mask.to_tensor(default_value=-100) attention_mask = attention_mask.numpy() labels[attention_mask == 0] = -100 preds, refs = get_labels(predictions, labels) metric.add_batch( predictions=preds, references=refs, ) eval_metric = compute_metrics() logger.info("Evaluation metrics:") for key, val in eval_metric.items(): logger.info(f"{key}: {val:.4f}") if training_args.output_dir is not None: output_eval_file = os.path.join(training_args.output_dir, "all_results.json") with open(output_eval_file, "w") as writer: writer.write(json.dumps(eval_metric)) # endregion if training_args.output_dir is not None and not training_args.push_to_hub: # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_qa_beam_search", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = XLNetConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = XLNetTokenizerFast.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = XLNetForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names else: column_names = raw_datasets["test"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Some of the questions have lots of whitespace on the left, which is not useful and will make the # truncation of the context fail (the tokenized question will take a lots of space). So we remove that # left whitespace examples[question_column_name] = [ q.lstrip() for q in examples[question_column_name] ] # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] tokenized_examples["is_impossible"] = [] tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0. # The cls token gets 1.0 too (for predictions of empty answers). tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != context_idx: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != context_idx: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) tokenized_examples["is_impossible"].append(0.0) return tokenized_examples if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: # Select samples from Dataset, This will help to decrease processing time max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) # Create Training Features with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if data_args.max_train_samples is not None: # Select samples from dataset again since Feature Creation might increase number of features max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, input_ids in enumerate(tokenized_examples["input_ids"]): # Find the CLS token in the input ids. cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_idx else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if training_args.do_eval: if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if data_args.max_eval_samples is not None: # Selecting Eval Samples from Dataset max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) eval_examples = eval_examples.select(range(max_eval_samples)) # Create Features from Eval Dataset with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if data_args.max_eval_samples is not None: # Selecting Samples from Dataset again since Feature Creation might increase samples size max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) if training_args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if data_args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select( range(data_args.max_predict_samples)) # Test Feature Creation with training_args.main_process_first( desc="prediction dataset map pre-processing"): predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select( range(max_predict_samples)) # Data collator # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data # collator. data_collator = (default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding( tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, start_n_top=model.config.start_n_top, end_n_top=model.config.end_n_top, output_dir=training_args.output_dir, log_level=log_level, prefix=stage, ) # Format the result to the format the metric expects. if data_args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k] } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column_name] } for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = evaluate.load( "squad_v2" if data_args.version_2_with_negative else "squad") def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # Initialize our Trainer trainer = QuestionAnsweringTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, eval_examples=eval_examples if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Prediction if training_args.do_predict: logger.info("*** Predict ***") results = trainer.predict(predict_dataset, predict_examples) metrics = results.metrics max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "question-answering" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_ner", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. if training_args.do_train: column_names = raw_datasets["train"].column_names features = raw_datasets["train"].features else: column_names = raw_datasets["validation"].column_names features = raw_datasets["validation"].features if data_args.text_column_name is not None: text_column_name = data_args.text_column_name elif "tokens" in column_names: text_column_name = "tokens" else: text_column_name = column_names[0] if data_args.label_column_name is not None: label_column_name = data_args.label_column_name elif f"{data_args.task_name}_tags" in column_names: label_column_name = f"{data_args.task_name}_tags" else: label_column_name = column_names[1] # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the # unique labels. def get_label_list(labels): unique_labels = set() for label in labels: unique_labels = unique_labels | set(label) label_list = list(unique_labels) label_list.sort() return label_list # If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere. # Otherwise, we have to get the list of labels manually. labels_are_int = isinstance(features[label_column_name].feature, ClassLabel) if labels_are_int: label_list = features[label_column_name].feature.names label_to_id = {i: i for i in range(len(label_list))} else: label_list = get_label_list(raw_datasets["train"][label_column_name]) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path if config.model_type in {"gpt2", "roberta"}: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, add_prefix_space=True, ) else: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( "This example script only works for models that have a fast tokenizer. Checkout the big table of models at" " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" " this requirement") # Model has labels -> use them. if model.config.label2id != PretrainedConfig( num_labels=num_labels).label2id: if list(sorted(model.config.label2id.keys())) == list( sorted(label_list)): # Reorganize `label_list` to match the ordering of the model. if labels_are_int: label_to_id = { i: int(model.config.label2id[l]) for i, l in enumerate(label_list) } label_list = [ model.config.id2label[i] for i in range(num_labels) ] else: label_list = [ model.config.id2label[i] for i in range(num_labels) ] label_to_id = {l: i for i, l in enumerate(label_list)} else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(model.config.label2id.keys()))}, dataset labels:" f" {list(sorted(label_list))}.\nIgnoring the model labels as a result.", ) # Set the correspondences label/ID inside the model config model.config.label2id = {l: i for i, l in enumerate(label_list)} model.config.id2label = {i: l for i, l in enumerate(label_list)} # Map that sends B-Xxx label to its I-Xxx counterpart b_to_i_label = [] for idx, label in enumerate(label_list): if label.startswith("B-") and label.replace("B-", "I-") in label_list: b_to_i_label.append(label_list.index(label.replace("B-", "I-"))) else: b_to_i_label.append(idx) # Preprocessing the dataset # Padding strategy padding = "max_length" if data_args.pad_to_max_length else False # Tokenize all texts and align the labels with them. def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples[text_column_name], padding=padding, truncation=True, max_length=data_args.max_seq_length, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, ) labels = [] for i, label in enumerate(examples[label_column_name]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: label_ids.append(-100) # We set the label for the first token of each word. elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: if data_args.label_all_tokens: label_ids.append( b_to_i_label[label_to_id[label[word_idx]]]) else: label_ids.append(-100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( tokenize_and_align_labels, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if training_args.do_eval: if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( tokenize_and_align_labels, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if training_args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select( range(max_predict_samples)) with training_args.main_process_first( desc="prediction dataset map pre-processing"): predict_dataset = predict_dataset.map( tokenize_and_align_labels, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) # Data collator data_collator = DataCollatorForTokenClassification( tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) # Metrics metric = evaluate.load("seqeval") def compute_metrics(p): predictions, labels = p predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] true_labels = [[ label_list[l] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] results = metric.compute(predictions=true_predictions, references=true_labels) if data_args.return_entity_level_metrics: # Unpack nested dictionaries final_results = {} for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): final_results[f"{key}_{n}"] = v else: final_results[key] = value return final_results else: return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Predict if training_args.do_predict: logger.info("*** Predict ***") predictions, labels, metrics = trainer.predict( predict_dataset, metric_key_prefix="predict") predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) # Save predictions output_predictions_file = os.path.join(training_args.output_dir, "predictions.txt") if trainer.is_world_process_zero(): with open(output_predictions_file, "w") as writer: for prediction in true_predictions: writer.write(" ".join(prediction) + "\n") kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "token-classification" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_speech_recognition_ctc", model_args, data_args) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.audio_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." " Make sure to set `--audio_column_name` to the correct audio column - one of" f" {', '.join(raw_datasets['train'].column_names)}.") if data_args.text_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) text_column_name = data_args.text_column_name def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[text_column_name]).lower() + " " else: batch["target_text"] = batch[text_column_name].lower() + " " return batch with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[text_column_name], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk_token = data_args.unk_token pad_token = data_args.pad_token # 3. Next, let's load the config as we might need it to create # the tokenizer # load config config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # 4. Next, if no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kwargs = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.output_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_output_dir and os.path.isfile( vocab_file): try: os.remove(vocab_file) except OSError: # in shared file-systems it might be the case that # two processes try to delete the vocab file at the some time pass with training_args.main_process_first( desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk_token=unk_token, pad_token=pad_token, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, } # 5. Now we can instantiate the feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load feature_extractor and tokenizer tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kwargs, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # adapt config config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "pad_token_id": tokenizer.pad_token_id, "vocab_size": len(tokenizer), "activation_dropout": model_args.activation_dropout, }) # create model model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = next(iter(raw_datasets.values())).features[ data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate)) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # encode targets additional_kwargs = {} if phoneme_language is not None: additional_kwargs["phonemizer_lang"] = phoneme_language batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # 7. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metrics = { metric: evaluate.load(metric) for metric in data_args.eval_metrics } # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metrics = { k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items() } return metrics # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) try: processor = AutoProcessor.from_pretrained(training_args.output_dir) except (OSError, KeyError): warnings.warn( "Loading a processor from a feature extractor config that does not" " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following " " attribute to your `preprocessor_config.json` file to suppress this warning: " " `'processor_class': 'Wav2Vec2Processor'`", FutureWarning, ) processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir) # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding(processor=processor) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na" kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": (f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:" f" {data_args.eval_split_name}"), "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", } if "common_voice" in data_args.dataset_name: kwargs["language"] = config_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_summarization", model_args, data_args, framework="flax") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome.") # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) # Setup logging, we only want one process per machine to log things on the screen. logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR) if jax.process_index() == 0: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # Set the verbosity to info of the Transformers logger (on main process only): logger.info(f"Training/evaluation parameters {training_args}") # Handle the repository creation if training_args.push_to_hub: if training_args.hub_model_id is None: repo_name = get_full_repo_name(Path( training_args.output_dir).absolute().name, token=training_args.hub_token) else: repo_name = training_args.hub_model_id repo = Repository(training_args.output_dir, clone_from=repo_name) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files this script will use the first column for the full texts and the second column for the # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). # if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. dataset = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] dataset = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer if model_args.config_name: config = AutoConfig.from_pretrained( model_args.config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, use_auth_token=True if model_args.use_auth_token else None, ) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = FlaxAutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype), use_auth_token=True if model_args.use_auth_token else None, ) else: model = FlaxAutoModelForSeq2SeqLM.from_config( config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype), ) if training_args.gradient_checkpointing: model.enable_gradient_checkpointing() if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = dataset["train"].column_names elif training_args.do_eval: column_names = dataset["validation"].column_names elif training_args.do_predict: column_names = dataset["test"].column_names else: logger.info( "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`." ) return # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[ 0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if text_column not in column_names: raise ValueError( f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.summary_column is None: summary_column = dataset_columns[ 1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length # In Flax, for seq2seq models we need to pass `decoder_input_ids` # as the Flax models don't accept `labels`, we need to prepare the decoder_input_ids here # for that dynamically import the `shift_tokens_right` function from the model file model_module = __import__(model.__module__, fromlist=["shift_tokens_tight"]) shift_tokens_right_fn = getattr(model_module, "shift_tokens_right") # Setting padding="max_length" as we need fixed length inputs for jitted functions def preprocess_function(examples): inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding="max_length", truncation=True, return_tensors="np") # Setup the tokenizer for targets labels = tokenizer( text_target=targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np", ) model_inputs["labels"] = labels["input_ids"] decoder_input_ids = shift_tokens_right_fn( labels["input_ids"], config.pad_token_id, config.decoder_start_token_id) model_inputs["decoder_input_ids"] = np.asarray(decoder_input_ids) # We need decoder_attention_mask so we can ignore pad tokens from loss model_inputs["decoder_attention_mask"] = labels["attention_mask"] return model_inputs if training_args.do_train: if "train" not in dataset: raise ValueError("--do_train requires a train dataset") train_dataset = dataset["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in dataset: raise ValueError("--do_eval requires a validation dataset") eval_dataset = dataset["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in dataset: raise ValueError("--do_predict requires a test dataset") predict_dataset = dataset["test"] if data_args.max_predict_samples is not None: max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select( range(max_predict_samples)) predict_dataset = predict_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) # Metric metric = evaluate.load("rouge") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels def compute_metrics(preds, labels): decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) result = {k: round(v * 100, 4) for k, v in result.items()} prediction_lens = [ np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds ] result["gen_len"] = np.mean(prediction_lens) return result # Enable tensorboard only on the master node has_tensorboard = is_tensorboard_available() if has_tensorboard and jax.process_index() == 0: try: from flax.metrics.tensorboard import SummaryWriter summary_writer = SummaryWriter( log_dir=Path(training_args.output_dir)) except ImportError as ie: has_tensorboard = False logger.warning( f"Unable to display metrics through TensorBoard because some package are not installed: {ie}" ) else: logger.warning( "Unable to display metrics through TensorBoard because the package is not installed: " "Please run pip install tensorboard to enable.") # Initialize our training rng = jax.random.PRNGKey(training_args.seed) rng, dropout_rng = jax.random.split(rng) # Store some constant num_epochs = int(training_args.num_train_epochs) train_batch_size = int( training_args.per_device_train_batch_size) * jax.device_count() per_device_eval_batch_size = int(training_args.per_device_eval_batch_size) eval_batch_size = per_device_eval_batch_size * jax.device_count() steps_per_epoch = len(train_dataset) // train_batch_size total_train_steps = steps_per_epoch * num_epochs # Create learning rate schedule linear_decay_lr_schedule_fn = create_learning_rate_fn( len(train_dataset), train_batch_size, training_args.num_train_epochs, training_args.warmup_steps, training_args.learning_rate, ) # We use Optax's "masking" functionality to not apply weight decay # to bias and LayerNorm scale parameters. decay_mask_fn returns a # mask boolean with the same structure as the parameters. # The mask is True for parameters that should be decayed. def decay_mask_fn(params): flat_params = traverse_util.flatten_dict(params) # find out all LayerNorm parameters layer_norm_candidates = ["layernorm", "layer_norm", "ln"] layer_norm_named_params = set([ layer[-2:] for layer_norm_name in layer_norm_candidates for layer in flat_params.keys() if layer_norm_name in "".join(layer).lower() ]) flat_mask = { path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params } return traverse_util.unflatten_dict(flat_mask) # create adam optimizer adamw = optax.adamw( learning_rate=linear_decay_lr_schedule_fn, b1=training_args.adam_beta1, b2=training_args.adam_beta2, eps=training_args.adam_epsilon, weight_decay=training_args.weight_decay, mask=decay_mask_fn, ) # Setup train state state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng) # label smoothed cross entropy def loss_fn(logits, labels, padding_mask, label_smoothing_factor=0.0): """ The label smoothing implementation is adapted from Flax's official example: https://github.com/google/flax/blob/87a211135c6a377c8f29048a1cac3840e38b9da4/examples/wmt/train.py#L104 """ vocab_size = logits.shape[-1] confidence = 1.0 - label_smoothing_factor low_confidence = (1.0 - confidence) / (vocab_size - 1) normalizing_constant = -(confidence * jnp.log(confidence) + (vocab_size - 1) * low_confidence * jnp.log(low_confidence + 1e-20)) soft_labels = onehot(labels, vocab_size, on_value=confidence, off_value=low_confidence) loss = optax.softmax_cross_entropy(logits, soft_labels) loss = loss - normalizing_constant # ignore padded tokens from loss loss = loss * padding_mask loss = loss.sum() / padding_mask.sum() return loss # Define gradient update step fn def train_step(state, batch, label_smoothing_factor=0.0): dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng) def compute_loss(params): labels = batch.pop("labels") logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0] loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor) return loss grad_fn = jax.value_and_grad(compute_loss) loss, grad = grad_fn(state.params) grad = jax.lax.pmean(grad, "batch") new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng) metrics = { "loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step) } metrics = jax.lax.pmean(metrics, axis_name="batch") return new_state, metrics # Define eval fn def eval_step(params, batch, label_smoothing_factor=0.0): labels = batch.pop("labels") logits = model(**batch, params=params, train=False)[0] loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor) # summarize metrics metrics = {"loss": loss} metrics = jax.lax.pmean(metrics, axis_name="batch") return metrics # Define generation function max_length = (data_args.val_max_target_length if data_args.val_max_target_length is not None else model.config.max_length) num_beams = data_args.num_beams if data_args.num_beams is not None else model.config.num_beams gen_kwargs = {"max_length": max_length, "num_beams": num_beams} def generate_step(params, batch): model.params = params output_ids = model.generate(batch["input_ids"], attention_mask=batch["attention_mask"], **gen_kwargs) return output_ids.sequences # Create parallel version of the train and eval step p_train_step = jax.pmap(partial( train_step, label_smoothing_factor=training_args.label_smoothing_factor), "batch", donate_argnums=(0, )) p_eval_step = jax.pmap( partial(eval_step, label_smoothing_factor=training_args.label_smoothing_factor), "batch") p_generate_step = jax.pmap(generate_step, "batch") # Replicate the train state on each device state = state.replicate() logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {num_epochs}") logger.info( f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel & distributed) = {train_batch_size}" ) logger.info(f" Total optimization steps = {total_train_steps}") train_time = 0 epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0) for epoch in epochs: # ======================== Training ================================ train_start = time.time() # Create sampling rng rng, input_rng = jax.random.split(rng) train_metrics = [] # Generate an epoch by shuffling sampling indices from the train dataset train_loader = data_loader(input_rng, train_dataset, train_batch_size, shuffle=True) steps_per_epoch = len(train_dataset) // train_batch_size # train for _ in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False): batch = next(train_loader) batch = shard(batch) state, train_metric = p_train_step(state, batch) train_metrics.append(train_metric) train_time += time.time() - train_start train_metric = unreplicate(train_metric) epochs.write( f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate:" f" {train_metric['learning_rate']})") # ======================== Evaluating ============================== eval_metrics = [] eval_preds = [] eval_labels = [] eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size, drop_last=False) eval_steps = math.ceil(len(eval_dataset) / eval_batch_size) for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False): # Model forward batch = next(eval_loader) labels = batch["labels"] metrics = pad_shard_unpad(p_eval_step, static_return=True)( state.params, batch, min_device_batch=per_device_eval_batch_size) eval_metrics.append(metrics) # generation if data_args.predict_with_generate: generated_ids = pad_shard_unpad(p_generate_step)(state.params, batch) eval_preds.extend( jax.device_get( generated_ids.reshape(-1, gen_kwargs["max_length"]))) eval_labels.extend(labels) # normalize eval metrics eval_metrics = get_metrics(eval_metrics) eval_metrics = jax.tree_map(jnp.mean, eval_metrics) # compute ROUGE metrics rouge_desc = "" if data_args.predict_with_generate: rouge_metrics = compute_metrics(eval_preds, eval_labels) eval_metrics.update(rouge_metrics) rouge_desc = " ".join([ f"Eval {key}: {value} |" for key, value in rouge_metrics.items() ]) # Print metrics and update progress bar desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']} | {rouge_desc})" epochs.write(desc) epochs.desc = desc # Save metrics if has_tensorboard and jax.process_index() == 0: cur_step = epoch * (len(train_dataset) // train_batch_size) write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step) # save checkpoint after each epoch and push checkpoint to the hub if jax.process_index() == 0: params = jax.device_get(jax.tree_map(lambda x: x[0], state.params)) model.save_pretrained(training_args.output_dir, params=params) tokenizer.save_pretrained(training_args.output_dir) if training_args.push_to_hub: repo.push_to_hub( commit_message=f"Saving weights and logs of epoch {epoch}", blocking=False) # ======================== Prediction loop ============================== if training_args.do_predict: logger.info("*** Predict ***") pred_metrics = [] pred_generations = [] pred_labels = [] pred_loader = data_loader(input_rng, predict_dataset, eval_batch_size, drop_last=False) pred_steps = math.ceil(len(predict_dataset) / eval_batch_size) for _ in tqdm(range(pred_steps), desc="Predicting...", position=2, leave=False): # Model forward batch = next(pred_loader) labels = batch["labels"] metrics = pad_shard_unpad(p_eval_step, static_return=True)( state.params, batch, min_device_batch=per_device_eval_batch_size) pred_metrics.append(metrics) # generation if data_args.predict_with_generate: generated_ids = pad_shard_unpad(p_generate_step)(state.params, batch) pred_generations.extend( jax.device_get( generated_ids.reshape(-1, gen_kwargs["max_length"]))) pred_labels.extend(labels) # normalize prediction metrics pred_metrics = get_metrics(pred_metrics) pred_metrics = jax.tree_map(jnp.mean, pred_metrics) # compute ROUGE metrics rouge_desc = "" if data_args.predict_with_generate: rouge_metrics = compute_metrics(pred_generations, pred_labels) pred_metrics.update(rouge_metrics) rouge_desc = " ".join([ f"Predict {key}: {value} |" for key, value in rouge_metrics.items() ]) # Print metrics desc = f"Predict Loss: {pred_metrics['loss']} | {rouge_desc})" logger.info(desc) # save final metrics in json if jax.process_index() == 0: rouge_metrics = { f"test_{metric_name}": value for metric_name, value in rouge_metrics.items() } path = os.path.join(training_args.output_dir, "test_results.json") with open(path, "w") as f: json.dump(rouge_metrics, f, indent=4, sort_keys=True)
def main(): # region Argument Parsing parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_ner", model_args, data_args, framework="tensorflow") # endregion # region Setup logging # we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel(logging.INFO) datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() # If passed along, set the training seed now. if training_args.seed is not None: set_seed(training_args.seed) # endregion # region Loading datasets # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called # 'tokens' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. if raw_datasets["train"] is not None: column_names = raw_datasets["train"].column_names features = raw_datasets["train"].features else: column_names = raw_datasets["validation"].column_names features = raw_datasets["validation"].features if data_args.text_column_name is not None: text_column_name = data_args.text_column_name elif "tokens" in column_names: text_column_name = "tokens" else: text_column_name = column_names[0] if data_args.label_column_name is not None: label_column_name = data_args.label_column_name elif f"{data_args.task_name}_tags" in column_names: label_column_name = f"{data_args.task_name}_tags" else: label_column_name = column_names[1] # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the # unique labels. def get_label_list(labels): unique_labels = set() for label in labels: unique_labels = unique_labels | set(label) label_list = list(unique_labels) label_list.sort() return label_list if isinstance(features[label_column_name].feature, ClassLabel): label_list = features[label_column_name].feature.names # No need to convert the labels since they are already ints. label_to_id = {i: i for i in range(len(label_list))} else: label_list = get_label_list(raw_datasets["train"][label_column_name]) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) # endregion # region Load config and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, num_labels=num_labels) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, num_labels=num_labels) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path if not tokenizer_name_or_path: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if config.model_type in {"gpt2", "roberta"}: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, add_prefix_space=True) else: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True) # endregion # region Preprocessing the raw datasets # First we tokenize all the texts. padding = "max_length" if data_args.pad_to_max_length else False # Tokenize all texts and align the labels with them. def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples[text_column_name], max_length=data_args.max_length, padding=padding, truncation=True, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, ) labels = [] for i, label in enumerate(examples[label_column_name]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: label_ids.append(-100) # We set the label for the first token of each word. elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: label_ids.append(label_to_id[label[word_idx]] if data_args. label_all_tokens else -100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs processed_raw_datasets = raw_datasets.map( tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_raw_datasets["train"] eval_dataset = processed_raw_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # endregion with training_args.strategy.scope(): # region Initialize model if model_args.model_name_or_path: model = TFAutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, config=config, ) else: logger.info("Training new model from scratch") model = TFAutoModelForTokenClassification.from_config(config) model.resize_token_embeddings(len(tokenizer)) # endregion # region Create TF datasets num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas train_batches_per_epoch = len(train_dataset) // total_train_batch_size tf_train_dataset = dataset_to_tf( train_dataset, tokenizer, total_batch_size=total_train_batch_size, num_epochs=training_args.num_train_epochs, shuffle=True, ) total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas eval_batches_per_epoch = len(eval_dataset) // total_eval_batch_size tf_eval_dataset = dataset_to_tf( eval_dataset, tokenizer, total_batch_size=total_eval_batch_size, num_epochs=training_args.num_train_epochs, shuffle=False, ) # endregion # region Optimizer, loss and compilation optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=int(training_args.num_train_epochs * train_batches_per_epoch), num_warmup_steps=training_args.warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, ) def dummy_loss(y_true, y_pred): return tf.reduce_mean(y_pred) model.compile(loss={"loss": dummy_loss}, optimizer=optimizer) # endregion # Metrics metric = evaluate.load("seqeval") def get_labels(y_pred, y_true): # Transform predictions and references tensos to numpy arrays # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(pred, gold_label) if l != -100 ] for pred, gold_label in zip(y_pred, y_true)] true_labels = [[ label_list[l] for (p, l) in zip(pred, gold_label) if l != -100 ] for pred, gold_label in zip(y_pred, y_true)] return true_predictions, true_labels def compute_metrics(): results = metric.compute() if data_args.return_entity_level_metrics: # Unpack nested dictionaries final_results = {} for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): final_results[f"{key}_{n}"] = v else: final_results[key] = value return final_results else: return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } # endregion # region Training logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}" ) logger.info(f" Total train batch size = {total_train_batch_size}") # Only show the progress bar once on each machine. model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), steps_per_epoch=train_batches_per_epoch, validation_steps=eval_batches_per_epoch, ) # endregion # region Predictions # For predictions, we preload the entire validation set - note that if you have a really giant validation # set, you might need to change this! eval_inputs = { key: tf.ragged.constant(eval_dataset[key]).to_tensor() for key in eval_dataset.features } predictions = model.predict( eval_inputs, batch_size=training_args.per_device_eval_batch_size)["logits"] predictions = tf.math.argmax(predictions, axis=-1) labels = np.array(eval_inputs["labels"]) labels[np.array(eval_inputs["attention_mask"]) == 0] = -100 preds, refs = get_labels(predictions, labels) metric.add_batch( predictions=preds, references=refs, ) eval_metric = compute_metrics() logger.info("Evaluation metrics:") for key, val in eval_metric.items(): logger.info(f"{key}: {val:.4f}") # endregion # We don't do predictions in the strategy scope because there are some issues in there right now. # They'll get fixed eventually, promise! if training_args.output_dir is not None: model.save_pretrained(training_args.output_dir)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_ner", model_args, data_args, framework="flax") # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) # Setup logging, we only want one process per machine to log things on the screen. logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR) if jax.process_index() == 0: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # Handle the repository creation if training_args.push_to_hub: if training_args.hub_model_id is None: repo_name = get_full_repo_name(Path( training_args.output_dir).absolute().name, token=training_args.hub_token) else: repo_name = training_args.hub_model_id repo = Repository(training_args.output_dir, clone_from=repo_name) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called # 'tokens' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading the dataset from local csv or json file. data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = (data_args.train_file if data_args.train_file is not None else data_args.valid_file).split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. if raw_datasets["train"] is not None: column_names = raw_datasets["train"].column_names features = raw_datasets["train"].features else: column_names = raw_datasets["validation"].column_names features = raw_datasets["validation"].features if data_args.text_column_name is not None: text_column_name = data_args.text_column_name elif "tokens" in column_names: text_column_name = "tokens" else: text_column_name = column_names[0] if data_args.label_column_name is not None: label_column_name = data_args.label_column_name elif f"{data_args.task_name}_tags" in column_names: label_column_name = f"{data_args.task_name}_tags" else: label_column_name = column_names[1] # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the # unique labels. def get_label_list(labels): unique_labels = set() for label in labels: unique_labels = unique_labels | set(label) label_list = list(unique_labels) label_list.sort() return label_list if isinstance(features[label_column_name].feature, ClassLabel): label_list = features[label_column_name].feature.names # No need to convert the labels since they are already ints. label_to_id = {i: i for i in range(len(label_list))} else: label_list = get_label_list(raw_datasets["train"][label_column_name]) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) # Load pretrained model and tokenizer config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, label2id=label_to_id, id2label={i: l for l, i in label_to_id.items()}, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path if config.model_type in {"gpt2", "roberta"}: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, add_prefix_space=True, ) else: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = FlaxAutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets # Tokenize all texts and align the labels with them. def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples[text_column_name], max_length=data_args.max_seq_length, padding="max_length", truncation=True, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, ) labels = [] for i, label in enumerate(examples[label_column_name]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: label_ids.append(-100) # We set the label for the first token of each word. elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: label_ids.append(label_to_id[label[word_idx]] if data_args. label_all_tokens else -100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs processed_raw_datasets = raw_datasets.map( tokenize_and_align_labels, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_raw_datasets["train"] eval_dataset = processed_raw_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # Define a summary writer has_tensorboard = is_tensorboard_available() if has_tensorboard and jax.process_index() == 0: try: from flax.metrics.tensorboard import SummaryWriter summary_writer = SummaryWriter(training_args.output_dir) summary_writer.hparams({ **training_args.to_dict(), **vars(model_args), **vars(data_args) }) except ImportError as ie: has_tensorboard = False logger.warning( f"Unable to display metrics through TensorBoard because some package are not installed: {ie}" ) else: logger.warning( "Unable to display metrics through TensorBoard because the package is not installed: " "Please run pip install tensorboard to enable.") def write_train_metric(summary_writer, train_metrics, train_time, step): summary_writer.scalar("train_time", train_time, step) train_metrics = get_metrics(train_metrics) for key, vals in train_metrics.items(): tag = f"train_{key}" for i, val in enumerate(vals): summary_writer.scalar(tag, val, step - len(vals) + i + 1) def write_eval_metric(summary_writer, eval_metrics, step): for metric_name, value in eval_metrics.items(): summary_writer.scalar(f"eval_{metric_name}", value, step) num_epochs = int(training_args.num_train_epochs) rng = jax.random.PRNGKey(training_args.seed) dropout_rngs = jax.random.split(rng, jax.local_device_count()) train_batch_size = training_args.per_device_train_batch_size * jax.local_device_count( ) per_device_eval_batch_size = int(training_args.per_device_eval_batch_size) eval_batch_size = training_args.per_device_eval_batch_size * jax.local_device_count( ) learning_rate_fn = create_learning_rate_fn( len(train_dataset), train_batch_size, training_args.num_train_epochs, training_args.warmup_steps, training_args.learning_rate, ) state = create_train_state(model, learning_rate_fn, num_labels=num_labels, training_args=training_args) # define step functions def train_step( state: train_state.TrainState, batch: Dict[str, Array], dropout_rng: PRNGKey) -> Tuple[train_state.TrainState, float]: """Trains model with an optimizer (both in `state`) on `batch`, returning a pair `(new_state, loss)`.""" dropout_rng, new_dropout_rng = jax.random.split(dropout_rng) targets = batch.pop("labels") def loss_fn(params): logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0] loss = state.loss_fn(logits, targets) return loss grad_fn = jax.value_and_grad(loss_fn) loss, grad = grad_fn(state.params) grad = jax.lax.pmean(grad, "batch") new_state = state.apply_gradients(grads=grad) metrics = jax.lax.pmean( { "loss": loss, "learning_rate": learning_rate_fn(state.step) }, axis_name="batch") return new_state, metrics, new_dropout_rng p_train_step = jax.pmap(train_step, axis_name="batch", donate_argnums=(0, )) def eval_step(state, batch): logits = state.apply_fn(**batch, params=state.params, train=False)[0] return state.logits_fn(logits) p_eval_step = jax.pmap(eval_step, axis_name="batch") metric = evaluate.load("seqeval") def get_labels(y_pred, y_true): # Transform predictions and references tensos to numpy arrays # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(pred, gold_label) if l != -100 ] for pred, gold_label in zip(y_pred, y_true)] true_labels = [[ label_list[l] for (p, l) in zip(pred, gold_label) if l != -100 ] for pred, gold_label in zip(y_pred, y_true)] return true_predictions, true_labels def compute_metrics(): results = metric.compute() if data_args.return_entity_level_metrics: # Unpack nested dictionaries final_results = {} for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): final_results[f"{key}_{n}"] = v else: final_results[key] = value return final_results else: return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } logger.info(f"===== Starting training ({num_epochs} epochs) =====") train_time = 0 # make sure weights are replicated on each device state = replicate(state) train_time = 0 step_per_epoch = len(train_dataset) // train_batch_size total_steps = step_per_epoch * num_epochs epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0) for epoch in epochs: train_start = time.time() train_metrics = [] # Create sampling rng rng, input_rng = jax.random.split(rng) # train for step, batch in enumerate( tqdm( train_data_collator(input_rng, train_dataset, train_batch_size), total=step_per_epoch, desc="Training...", position=1, )): state, train_metric, dropout_rngs = p_train_step( state, batch, dropout_rngs) train_metrics.append(train_metric) cur_step = (epoch * step_per_epoch) + (step + 1) if cur_step % training_args.logging_steps == 0 and cur_step > 0: # Save metrics train_metric = unreplicate(train_metric) train_time += time.time() - train_start if has_tensorboard and jax.process_index() == 0: write_train_metric(summary_writer, train_metrics, train_time, cur_step) epochs.write( f"Step... ({cur_step}/{total_steps} | Training Loss: {train_metric['loss']}, Learning Rate:" f" {train_metric['learning_rate']})") train_metrics = [] if cur_step % training_args.eval_steps == 0 and cur_step > 0: eval_metrics = {} # evaluate for batch in tqdm( eval_data_collator(eval_dataset, eval_batch_size), total=math.ceil(len(eval_dataset) / eval_batch_size), desc="Evaluating ...", position=2, ): labels = batch.pop("labels") predictions = pad_shard_unpad(p_eval_step)( state, batch, min_device_batch=per_device_eval_batch_size) predictions = np.array(predictions) labels[np.array(chain( *batch["attention_mask"])) == 0] = -100 preds, refs = get_labels(predictions, labels) metric.add_batch( predictions=preds, references=refs, ) eval_metrics = compute_metrics() if data_args.return_entity_level_metrics: logger.info( f"Step... ({cur_step}/{total_steps} | Validation metrics: {eval_metrics}" ) else: logger.info( f"Step... ({cur_step}/{total_steps} | Validation f1: {eval_metrics['f1']}, Validation Acc:" f" {eval_metrics['accuracy']})") if has_tensorboard and jax.process_index() == 0: write_eval_metric(summary_writer, eval_metrics, cur_step) if (cur_step % training_args.save_steps == 0 and cur_step > 0) or (cur_step == total_steps): # save checkpoint after each epoch and push checkpoint to the hub if jax.process_index() == 0: params = jax.device_get(unreplicate(state.params)) model.save_pretrained(training_args.output_dir, params=params) tokenizer.save_pretrained(training_args.output_dir) if training_args.push_to_hub: repo.push_to_hub( commit_message= f"Saving weights and logs of step {cur_step}", blocking=False) epochs.desc = f"Epoch ... {epoch + 1}/{num_epochs}" # Eval after training if training_args.do_eval: eval_metrics = {} eval_loader = eval_data_collator(eval_dataset, eval_batch_size) for batch in tqdm(eval_loader, total=len(eval_dataset) // eval_batch_size, desc="Evaluating ...", position=2): labels = batch.pop("labels") predictions = pad_shard_unpad(p_eval_step)( state, batch, min_device_batch=per_device_eval_batch_size) predictions = np.array(predictions) labels[np.array(chain(*batch["attention_mask"])) == 0] = -100 preds, refs = get_labels(predictions, labels) metric.add_batch(predictions=preds, references=refs) eval_metrics = compute_metrics() if jax.process_index() == 0: eval_metrics = { f"eval_{metric_name}": value for metric_name, value in eval_metrics.items() } path = os.path.join(training_args.output_dir, "eval_results.json") with open(path, "w") as f: json.dump(eval_metrics, f, indent=4, sort_keys=True)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_translation", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") if data_args.source_prefix is None and model_args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with " "`--source_prefix 'translate English to German: ' `") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For translation, only JSON files are supported, with one field named "translation" containing two keys for the # source and target languages (unless you adapt what follows). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model.resize_token_embeddings(len(tokenizer)) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance( tokenizer, (MBartTokenizer, MBartTokenizerFast)): if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.target_lang] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids( data_args.target_lang) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names elif training_args.do_predict: column_names = raw_datasets["test"].column_names else: logger.info( "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`." ) return # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): assert data_args.target_lang is not None and data_args.source_lang is not None, ( f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and " "--target_lang arguments.") tokenizer.src_lang = data_args.source_lang tokenizer.tgt_lang = data_args.target_lang # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument. forced_bos_token_id = ( tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None) model.config.forced_bos_token_id = forced_bos_token_id # Get the language codes for input/target. source_lang = data_args.source_lang.split("_")[0] target_lang = data_args.target_lang.split("_")[0] # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr( model, "prepare_decoder_input_ids_from_labels"): logger.warning( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) def preprocess_function(examples): inputs = [ex[source_lang] for ex in examples["translation"]] targets = [ex[target_lang] for ex in examples["translation"]] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Tokenize targets with the `text_target` keyword argument labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select( range(max_predict_samples)) with training_args.main_process_first( desc="prediction dataset map pre-processing"): predict_dataset = predict_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id if data_args.pad_to_max_length: data_collator = default_data_collator else: data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) # Metric metric = evaluate.load("sacrebleu") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [[label.strip()] for label in labels] return preds, labels def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) result = metric.compute(predictions=decoded_preds, references=decoded_labels) result = {"bleu": result["score"]} prediction_lens = [ np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds ] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} max_length = (training_args.generation_max_length if training_args.generation_max_length is not None else data_args.val_max_target_length) num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Predict ***") predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams) metrics = predict_results.metrics max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) if trainer.is_world_process_zero(): if training_args.predict_with_generate: predictions = tokenizer.batch_decode( predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join( training_args.output_dir, "generated_predictions.txt") with open(output_prediction_file, "w", encoding="utf-8") as writer: writer.write("\n".join(predictions)) kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "translation" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name languages = [ l for l in [data_args.source_lang, data_args.target_lang] if l is not None ] if len(languages) > 0: kwargs["language"] = languages if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_clm", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} dataset_args = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = (data_args.train_file.split(".")[-1] if data_args.train_file is not None else data_args.validation_file.split(".")[-1]) if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = data_args.keep_linebreaks raw_datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( extension, data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.config_overrides is not None: logger.info(f"Overriding config: {model_args.config_overrides}") config.update_from_string(model_args.config_overrides) logger.info(f"New config: {config}") tokenizer_kwargs = { "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) else: model = AutoModelForCausalLM.from_config(config) n_params = sum( dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) logger.info( f"Training new model from scratch - Total size={n_params/2**20:.2f}M params" ) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = raw_datasets["train"].column_names else: column_names = raw_datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function tok_logger = transformers.utils.logging.get_logger( "transformers.tokenization_utils_base") def tokenize_function(examples): with CaptureLogger(tok_logger) as cl: output = tokenizer(examples[text_column_name]) # clm input could be much much longer than block_size if "Token indices sequence length is longer than the" in cl.out: tok_logger.warning( "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" " before being passed to the model.") return output with training_args.main_process_first(desc="dataset map tokenization"): tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", ) if data_args.block_size is None: block_size = tokenizer.model_max_length if block_size > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --block_size xxx." ) block_size = 1024 else: if data_args.block_size > tokenizer.model_max_length: logger.warning( f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. concatenated_examples = { k: list(chain(*examples[k])) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. if total_length >= block_size: total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i:i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map with training_args.main_process_first(desc="grouping texts together"): lm_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, desc=f"Grouping texts in chunks of {block_size}", ) if training_args.do_train: if "train" not in tokenized_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = lm_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) if training_args.do_eval: if "validation" not in tokenized_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = lm_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) def preprocess_logits_for_metrics(logits, labels): if isinstance(logits, tuple): # Depending on the model and config, logits may contain extra tensors, # like past_key_values, but logits always come first logits = logits[0] return logits.argmax(dim=-1) metric = evaluate.load("accuracy") def compute_metrics(eval_preds): preds, labels = eval_preds # preds have the same shape as the labels, after the argmax(-1) has been calculated # by preprocess_logits_for_metrics but we need to shift the labels labels = labels[:, 1:].reshape(-1) preds = preds[:, :-1].reshape(-1) return metric.compute(predictions=preds, references=labels) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it. data_collator=default_data_collator, compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available() else None, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) try: perplexity = math.exp(metrics["eval_loss"]) except OverflowError: perplexity = float("inf") metrics["perplexity"] = perplexity trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "text-generation" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_image_classification", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Initialize our dataset and prepare it for the 'image-classification' task. if data_args.dataset_name is not None: dataset = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, task="image-classification", use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_dir is not None: data_files["train"] = os.path.join(data_args.train_dir, "**") if data_args.validation_dir is not None: data_files["validation"] = os.path.join(data_args.validation_dir, "**") dataset = load_dataset( "imagefolder", data_files=data_files, cache_dir=model_args.cache_dir, task="image-classification", ) # If we don't have a validation split, split off a percentage of train as validation. data_args.train_val_split = None if "validation" in dataset.keys( ) else data_args.train_val_split if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: split = dataset["train"].train_test_split(data_args.train_val_split) dataset["train"] = split["train"] dataset["validation"] = split["test"] # Prepare label mappings. # We'll include these in the model's config to get human readable labels in the Inference API. labels = dataset["train"].features["labels"].names label2id, id2label = dict(), dict() for i, label in enumerate(labels): label2id[label] = str(i) id2label[str(i)] = label # Load the accuracy metric from the datasets package metric = evaluate.load("accuracy") # Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p): """Computes accuracy on a batch of predictions""" return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids) config = AutoConfig.from_pretrained( model_args.config_name or model_args.model_name_or_path, num_labels=len(labels), label2id=label2id, id2label=id2label, finetuning_task="image-classification", cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForImageClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.feature_extractor_name or model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Define torchvision transforms to be applied to each image. normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) _train_transforms = Compose([ RandomResizedCrop(feature_extractor.size), RandomHorizontalFlip(), ToTensor(), normalize, ]) _val_transforms = Compose([ Resize(feature_extractor.size), CenterCrop(feature_extractor.size), ToTensor(), normalize, ]) def train_transforms(example_batch): """Apply _train_transforms across a batch.""" example_batch["pixel_values"] = [ _train_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"] ] return example_batch def val_transforms(example_batch): """Apply _val_transforms across a batch.""" example_batch["pixel_values"] = [ _val_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"] ] return example_batch if training_args.do_train: if "train" not in dataset: raise ValueError("--do_train requires a train dataset") if data_args.max_train_samples is not None: dataset["train"] = (dataset["train"].shuffle( seed=training_args.seed).select( range(data_args.max_train_samples))) # Set the training transforms dataset["train"].set_transform(train_transforms) if training_args.do_eval: if "validation" not in dataset: raise ValueError("--do_eval requires a validation dataset") if data_args.max_eval_samples is not None: dataset["validation"] = (dataset["validation"].shuffle( seed=training_args.seed).select( range(data_args.max_eval_samples))) # Set the validation transforms dataset["validation"].set_transform(val_transforms) # Initalize our trainer trainer = Trainer( model=model, args=training_args, train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=feature_extractor, data_collator=collate_fn, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) trainer.save_state() # Evaluation if training_args.do_eval: metrics = trainer.evaluate() trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "image-classification", "dataset": data_args.dataset_name, "tags": ["image-classification", "vision"], } if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
def main(): args = parse_args() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_glue_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment accelerator = ( Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() ) # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.task_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset("glue", args.task_name) else: # Loading the dataset from local csv or json file. data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = (args.train_file if args.train_file is not None else args.validation_file).split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if args.task_name is not None: is_regression = args.task_name == "stsb" if not is_regression: label_list = raw_datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = raw_datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ignore_mismatched_sizes=args.ignore_mismatched_sizes, ) # Preprocessing the datasets if args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if ( model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and args.task_name is not None and not is_regression ): # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): logger.info( f"The configuration of the model provided the following label correspondence: {label_name_to_id}. " "Using it!" ) label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)} else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) elif args.task_name is None and not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} if label_to_id is not None: model.config.label2id = label_to_id model.config.id2label = {id: label for label, id in config.label2id.items()} elif args.task_name is not None and not is_regression: model.config.label2id = {l: i for i, l in enumerate(label_list)} model.config.id2label = {id: label for label, id in config.label2id.items()} padding = "max_length" if args.pad_to_max_length else False def preprocess_function(examples): # Tokenize the texts texts = ( (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) ) result = tokenizer(*texts, padding=padding, max_length=args.max_length, truncation=True) if "label" in examples: if label_to_id is not None: # Map labels to IDs (not necessary for GLUE tasks) result["labels"] = [label_to_id[l] for l in examples["label"]] else: # In all cases, rename the column to labels because the model will expect that. result["labels"] = examples["label"] return result with accelerator.main_process_first(): processed_datasets = raw_datasets.map( preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size ) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, { "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader, lr_scheduler ) # We need to recalculate our total training steps as the size of the training dataloader may have changed num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): checkpointing_steps = args.checkpointing_steps if args.checkpointing_steps.isdigit(): checkpointing_steps = int(args.checkpointing_steps) else: checkpointing_steps = None # We need to initialize the trackers we use, and also store our configuration. # We initialize the trackers only on main process because `accelerator.log` # only logs on main process and we don't want empty logs/runs on other processes. if args.with_tracking: if accelerator.is_main_process: experiment_config = vars(args) # TensorBoard cannot log Enums, need the raw value experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value accelerator.init_trackers("glue_no_trainer", experiment_config) # Get the metric function if args.task_name is not None: metric = evaluate.load("glue", args.task_name) else: metric = evaluate.load("accuracy") # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last # Extract `epoch_{i}` or `step_{i}` training_difference = os.path.splitext(path)[0] if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == starting_epoch: if resume_step is not None and step < resume_step: completed_steps += 1 continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if completed_steps >= args.max_train_steps: break model.eval() samples_seen = 0 for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) predictions = outputs.logits.argmax(dim=-1) if not is_regression else outputs.logits.squeeze() predictions, references = accelerator.gather((predictions, batch["labels"])) # If we are in a multiprocess environment, the last batch has duplicates if accelerator.num_processes > 1: if step == len(eval_dataloader) - 1: predictions = predictions[: len(eval_dataloader.dataset) - samples_seen] references = references[: len(eval_dataloader.dataset) - samples_seen] else: samples_seen += references.shape[0] metric.add_batch( predictions=predictions, references=references, ) eval_metric = metric.compute() logger.info(f"epoch {epoch}: {eval_metric}") if args.with_tracking: accelerator.log( { "accuracy" if args.task_name is not None else "glue": eval_metric, "train_loss": total_loss.item() / len(train_dataloader), "epoch": epoch, "step": completed_steps, }, step=completed_steps, ) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save ) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) if args.checkpointing_steps == "epoch": output_dir = f"epoch_{epoch}" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save ) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) if args.task_name == "mnli": # Final evaluation on mismatched validation set eval_dataset = processed_datasets["validation_mismatched"] eval_dataloader = DataLoader( eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size ) eval_dataloader = accelerator.prepare(eval_dataloader) model.eval() for step, batch in enumerate(eval_dataloader): outputs = model(**batch) predictions = outputs.logits.argmax(dim=-1) metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]), ) eval_metric = metric.compute() logger.info(f"mnli-mm: {eval_metric}") if args.output_dir is not None: with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: json.dump({"eval_accuracy": eval_metric["accuracy"]}, f)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_glue", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( "glue", data_args.task_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) elif data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading a dataset from your local files. # CSV/JSON training and evaluation files are needed. data_files = {"train": data_args.train_file, "validation": data_args.validation_file} # Get the test dataset: you can provide your own CSV/JSON test file (see below) # when you use `do_predict` without specifying a GLUE benchmark task. if training_args.do_predict: if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] assert ( test_extension == train_extension ), "`test_file` should have the same extension (csv or json) as `train_file`." data_files["test"] = data_args.test_file else: raise ValueError("Need either a GLUE task or a test file for `do_predict`.") for key in data_files.keys(): logger.info(f"load a local file for {key}: {data_files[key]}") if data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files raw_datasets = load_dataset( "csv", data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading a dataset from local json files raw_datasets = load_dataset( "json", data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if data_args.task_name is not None: is_regression = data_args.task_name == "stsb" if not is_regression: label_list = raw_datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = raw_datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) # Preprocessing the raw_datasets if data_args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[data_args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if ( model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and not is_regression ): # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) elif data_args.task_name is None and not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} if label_to_id is not None: model.config.label2id = label_to_id model.config.id2label = {id: label for label, id in config.label2id.items()} elif data_args.task_name is not None and not is_regression: model.config.label2id = {l: i for i, l in enumerate(label_list)} model.config.id2label = {id: label for label, id in config.label2id.items()} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ( (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) ) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) # Map labels to IDs (not necessary for GLUE tasks) if label_to_id is not None and "label" in examples: result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] return result with training_args.main_process_first(desc="dataset map pre-processing"): raw_datasets = raw_datasets.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", ) if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) if training_args.do_eval: if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: if "test" not in raw_datasets and "test_matched" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"] if data_args.max_predict_samples is not None: max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select(range(max_predict_samples)) # Log a few random samples from the training set: if training_args.do_train: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # Get the metric function if data_args.task_name is not None: metric = evaluate.load("glue", data_args.task_name) else: metric = evaluate.load("accuracy") # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) if data_args.task_name is not None: result = metric.compute(predictions=preds, references=p.label_ids) if len(result) > 1: result["combined_score"] = np.mean(list(result.values())).item() return result elif is_regression: return {"mse": ((preds - p.label_ids) ** 2).mean().item()} else: return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if # we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] eval_datasets = [eval_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") eval_datasets.append(raw_datasets["validation_mismatched"]) combined = {} for eval_dataset, task in zip(eval_datasets, tasks): metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = ( data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) ) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) if task == "mnli-mm": metrics = {k + "_mm": v for k, v in metrics.items()} if task is not None and "mnli" in task: combined.update(metrics) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", combined if task is not None and "mnli" in task else metrics) if training_args.do_predict: logger.info("*** Predict ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] predict_datasets = [predict_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") predict_datasets.append(raw_datasets["test_mismatched"]) for predict_dataset, task in zip(predict_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. predict_dataset = predict_dataset.remove_columns("label") predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_predict_file, "w") as writer: logger.info(f"***** Predict results {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = label_list[item] writer.write(f"{index}\t{item}\n") kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} if data_args.task_name is not None: kwargs["language"] = "en" kwargs["dataset_tags"] = "glue" kwargs["dataset_args"] = data_args.task_name kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}" if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
def main(): parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_glue", model_args, data_args, framework="flax") # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) # Setup logging, we only want one process per machine to log things on the screen. logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR) if jax.process_index() == 0: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # Handle the repository creation if training_args.push_to_hub: if training_args.hub_model_id is None: repo_name = get_full_repo_name( Path(training_args.output_dir).absolute().name, token=training_args.hub_token ) else: repo_name = training_args.hub_model_id repo = Repository(training_args.output_dir, clone_from=repo_name) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( "glue", data_args.task_name, use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading the dataset from local csv or json file. data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = (data_args.train_file if data_args.train_file is not None else data_args.valid_file).split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if data_args.task_name is not None: is_regression = data_args.task_name == "stsb" if not is_regression: label_list = raw_datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = raw_datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer config = AutoConfig.from_pretrained( model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, use_fast=not model_args.use_slow_tokenizer, use_auth_token=True if model_args.use_auth_token else None, ) model = FlaxAutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, config=config, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets if data_args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[data_args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if ( model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and not is_regression ): # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): logger.info( f"The configuration of the model provided the following label correspondence: {label_name_to_id}. " "Using it!" ) label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)} else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) elif data_args.task_name is None: label_to_id = {v: i for i, v in enumerate(label_list)} def preprocess_function(examples): # Tokenize the texts texts = ( (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) ) result = tokenizer(*texts, padding="max_length", max_length=data_args.max_seq_length, truncation=True) if "label" in examples: if label_to_id is not None: # Map labels to IDs (not necessary for GLUE tasks) result["labels"] = [label_to_id[l] for l in examples["label"]] else: # In all cases, rename the column to labels because the model will expect that. result["labels"] = examples["label"] return result processed_datasets = raw_datasets.map( preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names ) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # Define a summary writer has_tensorboard = is_tensorboard_available() if has_tensorboard and jax.process_index() == 0: try: from flax.metrics.tensorboard import SummaryWriter summary_writer = SummaryWriter(training_args.output_dir) summary_writer.hparams({**training_args.to_dict(), **vars(model_args), **vars(data_args)}) except ImportError as ie: has_tensorboard = False logger.warning( f"Unable to display metrics through TensorBoard because some package are not installed: {ie}" ) else: logger.warning( "Unable to display metrics through TensorBoard because the package is not installed: " "Please run pip install tensorboard to enable." ) def write_train_metric(summary_writer, train_metrics, train_time, step): summary_writer.scalar("train_time", train_time, step) train_metrics = get_metrics(train_metrics) for key, vals in train_metrics.items(): tag = f"train_{key}" for i, val in enumerate(vals): summary_writer.scalar(tag, val, step - len(vals) + i + 1) def write_eval_metric(summary_writer, eval_metrics, step): for metric_name, value in eval_metrics.items(): summary_writer.scalar(f"eval_{metric_name}", value, step) num_epochs = int(training_args.num_train_epochs) rng = jax.random.PRNGKey(training_args.seed) dropout_rngs = jax.random.split(rng, jax.local_device_count()) train_batch_size = int(training_args.per_device_train_batch_size) * jax.local_device_count() per_device_eval_batch_size = int(training_args.per_device_eval_batch_size) eval_batch_size = per_device_eval_batch_size * jax.device_count() learning_rate_fn = create_learning_rate_fn( len(train_dataset), train_batch_size, training_args.num_train_epochs, training_args.warmup_steps, training_args.learning_rate, ) state = create_train_state( model, learning_rate_fn, is_regression, num_labels=num_labels, weight_decay=training_args.weight_decay ) # define step functions def train_step( state: train_state.TrainState, batch: Dict[str, Array], dropout_rng: PRNGKey ) -> Tuple[train_state.TrainState, float]: """Trains model with an optimizer (both in `state`) on `batch`, returning a pair `(new_state, loss)`.""" dropout_rng, new_dropout_rng = jax.random.split(dropout_rng) targets = batch.pop("labels") def loss_fn(params): logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0] loss = state.loss_fn(logits, targets) return loss grad_fn = jax.value_and_grad(loss_fn) loss, grad = grad_fn(state.params) grad = jax.lax.pmean(grad, "batch") new_state = state.apply_gradients(grads=grad) metrics = jax.lax.pmean({"loss": loss, "learning_rate": learning_rate_fn(state.step)}, axis_name="batch") return new_state, metrics, new_dropout_rng p_train_step = jax.pmap(train_step, axis_name="batch", donate_argnums=(0,)) def eval_step(state, batch): logits = state.apply_fn(**batch, params=state.params, train=False)[0] return state.logits_fn(logits) p_eval_step = jax.pmap(eval_step, axis_name="batch") if data_args.task_name is not None: metric = evaluate.load("glue", data_args.task_name) else: metric = evaluate.load("accuracy") logger.info(f"===== Starting training ({num_epochs} epochs) =====") train_time = 0 # make sure weights are replicated on each device state = replicate(state) steps_per_epoch = len(train_dataset) // train_batch_size total_steps = steps_per_epoch * num_epochs epochs = tqdm(range(num_epochs), desc=f"Epoch ... (0/{num_epochs})", position=0) for epoch in epochs: train_start = time.time() train_metrics = [] # Create sampling rng rng, input_rng = jax.random.split(rng) # train train_loader = glue_train_data_collator(input_rng, train_dataset, train_batch_size) for step, batch in enumerate( tqdm( train_loader, total=steps_per_epoch, desc="Training...", position=1, ), ): state, train_metric, dropout_rngs = p_train_step(state, batch, dropout_rngs) train_metrics.append(train_metric) cur_step = (epoch * steps_per_epoch) + (step + 1) if cur_step % training_args.logging_steps == 0 and cur_step > 0: # Save metrics train_metric = unreplicate(train_metric) train_time += time.time() - train_start if has_tensorboard and jax.process_index() == 0: write_train_metric(summary_writer, train_metrics, train_time, cur_step) epochs.write( f"Step... ({cur_step}/{total_steps} | Training Loss: {train_metric['loss']}, Learning Rate:" f" {train_metric['learning_rate']})" ) train_metrics = [] if (cur_step % training_args.eval_steps == 0 or cur_step % steps_per_epoch == 0) and cur_step > 0: # evaluate eval_loader = glue_eval_data_collator(eval_dataset, eval_batch_size) for batch in tqdm( eval_loader, total=math.ceil(len(eval_dataset) / eval_batch_size), desc="Evaluating ...", position=2, ): labels = batch.pop("labels") predictions = pad_shard_unpad(p_eval_step)( state, batch, min_device_batch=per_device_eval_batch_size ) metric.add_batch(predictions=np.array(predictions), references=labels) eval_metric = metric.compute() logger.info(f"Step... ({cur_step}/{total_steps} | Eval metrics: {eval_metric})") if has_tensorboard and jax.process_index() == 0: write_eval_metric(summary_writer, eval_metric, cur_step) if (cur_step % training_args.save_steps == 0 and cur_step > 0) or (cur_step == total_steps): # save checkpoint after each epoch and push checkpoint to the hub if jax.process_index() == 0: params = jax.device_get(unreplicate(state.params)) model.save_pretrained(training_args.output_dir, params=params) tokenizer.save_pretrained(training_args.output_dir) if training_args.push_to_hub: repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False) epochs.desc = f"Epoch ... {epoch + 1}/{num_epochs}" # save the eval metrics in json if jax.process_index() == 0: eval_metric = {f"eval_{metric_name}": value for metric_name, value in eval_metric.items()} path = os.path.join(training_args.output_dir, "eval_results.json") with open(path, "w") as f: json.dump(eval_metric, f, indent=4, sort_keys=True)
def main(): args = parse_args() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_qa_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment accelerator_log_kwargs = {} if args.with_tracking: accelerator_log_kwargs["log_with"] = args.report_to accelerator_log_kwargs["logging_dir"] = args.output_dir accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file if args.test_file is not None: data_files["test"] = args.test_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, field="data") # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if args.config_name: config = AutoConfig.from_pretrained(args.config_name) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if args.model_name_or_path: model = AutoModelForQuestionAnswering.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForQuestionAnswering.from_config(config) # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. column_names = raw_datasets["train"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Some of the questions have lots of whitespace on the left, which is not useful and will make the # truncation of the context fail (the tokenized question will take a lots of space). So we remove that # left whitespace examples[question_column_name] = [ q.lstrip() for q in examples[question_column_name] ] # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if args.pad_to_max_length else False, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != (1 if pad_on_right else 0): token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != (1 if pad_on_right else 0): token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) return tokenized_examples if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if args.max_train_samples is not None: # We will select sample from whole data if agument is specified train_dataset = train_dataset.select(range(args.max_train_samples)) # Create train feature from dataset with accelerator.main_process_first(): train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on train dataset", ) if args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples train_dataset = train_dataset.select(range(args.max_train_samples)) # Validation preprocessing def prepare_validation_features(examples): # Some of the questions have lots of whitespace on the left, which is not useful and will make the # truncation of the context fail (the tokenized question will take a lots of space). So we remove that # left whitespace examples[question_column_name] = [ q.lstrip() for q in examples[question_column_name] ] # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if args.pad_to_max_length else False, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) context_index = 1 if pad_on_right else 0 # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if args.max_eval_samples is not None: # We will select sample from whole data eval_examples = eval_examples.select(range(args.max_eval_samples)) # Validation Feature Creation with accelerator.main_process_first(): eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(args.max_eval_samples)) if args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select( range(args.max_predict_samples)) # Predict Feature Creation with accelerator.main_process_first(): predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select( range(args.max_predict_samples)) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorWithPadding( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataset_for_model = eval_dataset.remove_columns( ["example_id", "offset_mapping"]) eval_dataloader = DataLoader(eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) if args.do_predict: predict_dataset_for_model = predict_dataset.remove_columns( ["example_id", "offset_mapping"]) predict_dataloader = DataLoader( predict_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions = postprocess_qa_predictions( examples=examples, features=features, predictions=predictions, version_2_with_negative=args.version_2_with_negative, n_best_size=args.n_best_size, max_answer_length=args.max_answer_length, null_score_diff_threshold=args.null_score_diff_threshold, output_dir=args.output_dir, prefix=stage, ) # Format the result to the format the metric expects. if args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": 0.0 } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column_name] } for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = evaluate.load( "squad_v2" if args.version_2_with_negative else "squad") # Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor def create_and_fill_np_array(start_or_end_logits, dataset, max_len): """ Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor Args: start_or_end_logits(:obj:`tensor`): This is the output predictions of the model. We can only enter either start or end logits. eval_dataset: Evaluation dataset max_len(:obj:`int`): The maximum length of the output tensor. ( See the model.eval() part for more details ) """ step = 0 # create a numpy array and fill it with -100. logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64) # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather_for_metrics for i, output_logit in enumerate( start_or_end_logits): # populate columns # We have to fill it such that we have to take the whole tensor and replace it on the newly created array # And after every iteration we have to change the step batch_size = output_logit.shape[0] cols = output_logit.shape[1] if step + batch_size < len(dataset): logits_concat[step:step + batch_size, :cols] = output_logit else: logits_concat[step:, :cols] = output_logit[:len(dataset) - step] step += batch_size return logits_concat # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader, lr_scheduler) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): checkpointing_steps = args.checkpointing_steps if args.checkpointing_steps.isdigit(): checkpointing_steps = int(args.checkpointing_steps) else: checkpointing_steps = None # We need to initialize the trackers we use, and also store our configuration. # We initialize the trackers only on main process because `accelerator.log` # only logs on main process and we don't want empty logs/runs on other processes. if args.with_tracking: if accelerator.is_main_process: experiment_config = vars(args) # TensorBoard cannot log Enums, need the raw value experiment_config["lr_scheduler_type"] = experiment_config[ "lr_scheduler_type"].value accelerator.init_trackers("qa_no_trainer", experiment_config) # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print( f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[ -1] # Sorts folders by date modified, most recent checkpoint is the last # Extract `epoch_{i}` or `step_{i}` training_difference = os.path.splitext(path)[0] if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == starting_epoch: if resume_step is not None and step < resume_step: completed_steps += 1 continue with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: total_loss += loss.detach().float() accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if completed_steps >= args.max_train_steps: break if args.checkpointing_steps == "epoch": output_dir = f"epoch_{epoch}" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True) # Evaluation logger.info("***** Running Evaluation *****") logger.info(f" Num examples = {len(eval_dataset)}") logger.info(f" Batch size = {args.per_device_eval_batch_size}") all_start_logits = [] all_end_logits = [] model.eval() for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) start_logits = outputs.start_logits end_logits = outputs.end_logits if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100) end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100) all_start_logits.append( accelerator.gather_for_metrics(start_logits).cpu().numpy()) all_end_logits.append( accelerator.gather_for_metrics(end_logits).cpu().numpy()) max_len = max([x.shape[1] for x in all_start_logits ]) # Get the max_length of the tensor # concatenate the numpy array start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len) end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len) # delete the list of numpy arrays del all_start_logits del all_end_logits outputs_numpy = (start_logits_concat, end_logits_concat) prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy) eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Evaluation metrics: {eval_metric}") # Prediction if args.do_predict: logger.info("***** Running Prediction *****") logger.info(f" Num examples = {len(predict_dataset)}") logger.info(f" Batch size = {args.per_device_eval_batch_size}") all_start_logits = [] all_end_logits = [] model.eval() for step, batch in enumerate(predict_dataloader): with torch.no_grad(): outputs = model(**batch) start_logits = outputs.start_logits end_logits = outputs.end_logits if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered start_logits = accelerator.pad_across_processes( start_logits, dim=1, pad_index=-100) end_logits = accelerator.pad_across_processes( end_logits, dim=1, pad_index=-100) all_start_logits.append( accelerator.gather_for_metrics(start_logits).cpu().numpy()) all_end_logits.append( accelerator.gather_for_metrics(end_logits).cpu().numpy()) max_len = max([x.shape[1] for x in all_start_logits ]) # Get the max_length of the tensor # concatenate the numpy array start_logits_concat = create_and_fill_np_array(all_start_logits, predict_dataset, max_len) end_logits_concat = create_and_fill_np_array(all_end_logits, predict_dataset, max_len) # delete the list of numpy arrays del all_start_logits del all_end_logits outputs_numpy = (start_logits_concat, end_logits_concat) prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy) predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Predict metrics: {predict_metric}") if args.with_tracking: log = { "squad_v2" if args.version_2_with_negative else "squad": eval_metric, "train_loss": total_loss.item() / len(train_dataloader), "epoch": epoch, "step": completed_steps, } if args.do_predict: log["squad_v2_predict" if args. version_2_with_negative else "squad_predict"] = predict_metric accelerator.log(log, step=completed_steps) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) logger.info(json.dumps(eval_metric, indent=4)) save_prefixed_metrics(eval_metric, args.output_dir)
def main(): # 1. Parse input arguments # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_speech_recognition_seq2seq", model_args, data_args) # 2. Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # 3. Detecting last checkpoint and eventualy continue from last checkpoint last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # 4. Load dataset raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name, use_auth_token=True if model_args.use_auth_token else None, ) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name, use_auth_token=True if model_args.use_auth_token else None, ) if data_args.audio_column_name not in next(iter( raw_datasets.values())).column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}.") if data_args.text_column_name not in next(iter( raw_datasets.values())).column_names: raise ValueError( f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(next(iter(raw_datasets.values())).column_names)}.") # 5. Load pretrained model, tokenizer, and feature extractor # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSpeechSeq2Seq.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Resample speech dataset if necassary dataset_sampling_rate = next(iter(raw_datasets.values())).features[ data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate)) # 7. Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers text_column_name = data_args.text_column_name model_input_name = feature_extractor.model_input_names[0] do_lower_case = data_args.do_lower_case if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) def prepare_dataset(batch): # process audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) # process audio length batch[model_input_name] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # process targets input_str = batch[text_column_name].lower( ) if do_lower_case else batch[text_column_name] batch["labels"] = tokenizer(input_str).input_ids return batch with training_args.main_process_first(desc="dataset map pre-processing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=data_args.preprocessing_num_workers, desc="preprocess train dataset", ) # filter data that is shorter than min_input_length or longer than # max_input_length def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # for large datasets it is advised to run the preprocessing on a # single machine first with `args.preprocessing_only` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step `args.preprocessing_only` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: cache = {k: v.cache_files for k, v in vectorized_datasets.items()} logger.info(f"Data preprocessing finished. Files cached at {cache}.") return # 8. Load Metric metric = evaluate.load("wer") def compute_metrics(pred): pred_ids = pred.predictions pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True) wer = metric.compute(predictions=pred_str, references=label_str) return {"wer": wer} # 9. Create a single speech processor if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) processor = AutoProcessor.from_pretrained(training_args.output_dir) # 10. Define data collator data_collator = DataCollatorSpeechSeq2SeqWithPadding( processor=processor, decoder_start_token_id=model.config.decoder_start_token_id) # 11. Initialize Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) # 12. Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the feature extractor too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # 13. Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(metric_key_prefix="eval", max_length=model.config.max_length, num_beams=model.config.num_beams) max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # 14. Write Training Stats kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech recognition" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_translation", model_args, data_args, framework="tensorflow") # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO) datasets.utils.logging.set_verbosity(logging.INFO) transformers.utils.logging.set_verbosity(logging.INFO) # Log on each process the small summary: logger.info(f"Training/evaluation parameters {training_args}") # endregion # region Detecting last checkpoint last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # endregion # Set seed before initializing model. set_seed(training_args.seed) # region Load datasets # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files this script will use the first column for the full texts and the second column for the # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion # region Load model config and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # endregion # region Dataset preprocessing # We need to tokenize inputs and targets. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names else: logger.info( "There is nothing to do. Please pass `do_train`, and/or `do_eval`." ) return column_names = raw_datasets["train"].column_names # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): assert data_args.target_lang is not None and data_args.source_lang is not None, ( f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and " "--target_lang arguments.") tokenizer.src_lang = data_args.source_lang tokenizer.tgt_lang = data_args.target_lang forced_bos_token_id = ( tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None) # Get the language codes for input/target. source_lang = data_args.source_lang.split("_")[0] target_lang = data_args.target_lang.split("_")[0] padding = "max_length" if data_args.pad_to_max_length else False # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False def preprocess_function(examples): inputs = [ex[source_lang] for ex in examples["translation"]] targets = [ex[target_lang] for ex in examples["translation"]] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Tokenize targets with the `text_target` keyword argument labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) else: train_dataset = None if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) else: eval_dataset = None # endregion with training_args.strategy.scope(): # region Prepare model model = TFAutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model.resize_token_embeddings(len(tokenizer)) if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): model.config.forced_bos_token_id = forced_bos_token_id # endregion # region Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance( tokenizer, (MBartTokenizer, MBartTokenizerFast)): assert (data_args.target_lang is not None and data_args.source_lang is not None ), "mBart requires --target_lang and --source_lang" if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.target_lang] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids( data_args.target_lang) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) # endregion # region Prepare TF Dataset objects label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of= 64, # Reduce the number of unique shapes for XLA, especially for generation return_tensors="tf", ) num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas dataset_options = tf.data.Options() dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names # yourself if you use this method, whereas they are automatically inferred from the model input names when # using model.prepare_tf_dataset() # For more info see the docs: # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset tf_train_dataset = model.prepare_tf_dataset( train_dataset, collate_fn=data_collator, batch_size=total_train_batch_size, shuffle=True, ).with_options(dataset_options) tf_eval_dataset = model.prepare_tf_dataset( eval_dataset, collate_fn=data_collator, batch_size=total_eval_batch_size, shuffle=False).with_options(dataset_options) # endregion # region Optimizer and LR scheduling num_train_steps = int( len(tf_train_dataset) * training_args.num_train_epochs) if training_args.warmup_steps > 0: num_warmup_steps = training_args.warmup_steps elif training_args.warmup_ratio > 0: num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) else: num_warmup_steps = 0 if training_args.do_train: optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, adam_global_clipnorm=training_args.max_grad_norm, ) else: optimizer = None # endregion # region Metric and postprocessing if training_args.do_eval: metric = evaluate.load("sacrebleu") if data_args.val_max_target_length is None: data_args.val_max_target_length = data_args.max_target_length gen_kwargs = { "max_length": data_args.val_max_target_length, "num_beams": data_args.num_beams, "no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default } def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [[label.strip()] for label in labels] return preds, labels def compute_metrics(preds): predictions, labels = preds if isinstance(predictions, tuple): predictions = predictions[0] decoded_preds = tokenizer.batch_decode( predictions, skip_special_tokens=True) labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode( labels, skip_special_tokens=True) decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) metrics = metric.compute(predictions=decoded_preds, references=decoded_labels) return {"bleu": metrics["score"]} # The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics # to be computed each epoch. Any Python code can be included in the metric_fn. This is especially # useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs. # For more information, see the docs at # https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback metric_callback = KerasMetricCallback( metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, predict_with_generate=True, use_xla_generation=True, generate_kwargs=gen_kwargs, ) callbacks = [metric_callback] else: callbacks = [] # endregion # region Preparing push_to_hub and model card push_to_hub_model_id = training_args.push_to_hub_model_id model_name = model_args.model_name_or_path.split("/")[-1] if not push_to_hub_model_id: push_to_hub_model_id = f"{model_name}-finetuned-{data_args.source_lang}-{data_args.target_lang}" model_card_kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "translation" } if data_args.dataset_name is not None: model_card_kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: model_card_kwargs[ "dataset_args"] = data_args.dataset_config_name model_card_kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: model_card_kwargs["dataset"] = data_args.dataset_name languages = [ l for l in [data_args.source_lang, data_args.target_lang] if l is not None ] if len(languages) > 0: model_card_kwargs["language"] = languages if training_args.push_to_hub: # Because this training can be quite long, we save once per epoch. callbacks.append( PushToHubCallback( output_dir=training_args.output_dir, model_id=push_to_hub_model_id, organization=training_args.push_to_hub_organization, token=training_args.push_to_hub_token, tokenizer=tokenizer, **model_card_kwargs, )) # endregion # region Training eval_metrics = None model.compile(optimizer=optimizer, jit_compile=training_args.xla) if training_args.do_train: logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}" ) logger.info(f" Total train batch size = {total_train_batch_size}") logger.info(f" Total optimization steps = {num_train_steps}") if training_args.xla and not data_args.pad_to_max_length: logger.warning( "XLA training may be slow at first when --pad_to_max_length is not set " "until all possible shapes have been compiled.") history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks) eval_metrics = { key: val[-1] for key, val in history.history.items() } # endregion # region Validation if training_args.do_eval and not training_args.do_train: # Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate @tf.function(jit_compile=True) def generate(**kwargs): return model.generate(**kwargs) if training_args.do_eval: logger.info("Evaluation...") for batch, labels in tf_eval_dataset: batch.update(gen_kwargs) generated_tokens = generate(**batch) if isinstance(generated_tokens, tuple): generated_tokens = generated_tokens[0] decoded_preds = tokenizer.batch_decode( generated_tokens, skip_special_tokens=True) labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode( labels, skip_special_tokens=True) decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) metric.add_batch(predictions=decoded_preds, references=decoded_labels) eval_metrics = metric.compute() logger.info({"bleu": eval_metrics["score"]}) # endregion if training_args.output_dir is not None and eval_metrics is not None: output_eval_file = os.path.join(training_args.output_dir, "all_results.json") with open(output_eval_file, "w") as writer: writer.write(json.dumps(eval_metrics)) if training_args.output_dir is not None and not training_args.push_to_hub: # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir)
def main(): args = parse_args() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_image_classification_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment accelerator_log_kwargs = {} if args.with_tracking: accelerator_log_kwargs["log_with"] = args.report_to accelerator_log_kwargs["logging_dir"] = args.output_dir accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) logger.info(accelerator.state) # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own training and evaluation files (see below) # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. dataset = load_dataset(args.dataset_name, task="image-classification") else: data_files = {} if args.train_dir is not None: data_files["train"] = os.path.join(args.train_dir, "**") if args.validation_dir is not None: data_files["validation"] = os.path.join(args.validation_dir, "**") dataset = load_dataset( "imagefolder", data_files=data_files, cache_dir=args.cache_dir, task="image-classification", ) # See more about loading custom images at # https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder. # If we don't have a validation split, split off a percentage of train as validation. args.train_val_split = None if "validation" in dataset.keys( ) else args.train_val_split if isinstance(args.train_val_split, float) and args.train_val_split > 0.0: split = dataset["train"].train_test_split(args.train_val_split) dataset["train"] = split["train"] dataset["validation"] = split["test"] # Prepare label mappings. # We'll include these in the model's config to get human readable labels in the Inference API. labels = dataset["train"].features["labels"].names label2id = {label: str(i) for i, label in enumerate(labels)} id2label = {str(i): label for i, label in enumerate(labels)} # Load pretrained model and feature extractor # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( args.model_name_or_path, num_labels=len(labels), i2label=id2label, label2id=label2id, finetuning_task="image-classification", ) feature_extractor = AutoFeatureExtractor.from_pretrained( args.model_name_or_path) model = AutoModelForImageClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ignore_mismatched_sizes=args.ignore_mismatched_sizes, ) # Preprocessing the datasets # Define torchvision transforms to be applied to each image. normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) train_transforms = Compose([ RandomResizedCrop(feature_extractor.size), RandomHorizontalFlip(), ToTensor(), normalize, ]) val_transforms = Compose([ Resize(feature_extractor.size), CenterCrop(feature_extractor.size), ToTensor(), normalize, ]) def preprocess_train(example_batch): """Apply _train_transforms across a batch.""" example_batch["pixel_values"] = [ train_transforms(image.convert("RGB")) for image in example_batch["image"] ] return example_batch def preprocess_val(example_batch): """Apply _val_transforms across a batch.""" example_batch["pixel_values"] = [ val_transforms(image.convert("RGB")) for image in example_batch["image"] ] return example_batch with accelerator.main_process_first(): if args.max_train_samples is not None: dataset["train"] = dataset["train"].shuffle(seed=args.seed).select( range(args.max_train_samples)) # Set the training transforms train_dataset = dataset["train"].with_transform(preprocess_train) if args.max_eval_samples is not None: dataset["validation"] = dataset["validation"].shuffle( seed=args.seed).select(range(args.max_eval_samples)) # Set the validation transforms eval_dataset = dataset["validation"].with_transform(preprocess_val) # DataLoaders creation: def collate_fn(examples): pixel_values = torch.stack( [example["pixel_values"] for example in examples]) labels = torch.tensor([example["labels"] for example in examples]) return {"pixel_values": pixel_values, "labels": labels} train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=collate_fn, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader, lr_scheduler) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): checkpointing_steps = args.checkpointing_steps if args.checkpointing_steps.isdigit(): checkpointing_steps = int(args.checkpointing_steps) else: checkpointing_steps = None # We need to initialize the trackers we use, and also store our configuration. # We initialize the trackers only on main process because `accelerator.log` # only logs on main process and we don't want empty logs/runs on other processes. if args.with_tracking: if accelerator.is_main_process: experiment_config = vars(args) # TensorBoard cannot log Enums, need the raw value experiment_config["lr_scheduler_type"] = experiment_config[ "lr_scheduler_type"].value accelerator.init_trackers("image_classification_no_trainer", experiment_config) # Get the metric function metric = evaluate.load("accuracy") # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print( f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[ -1] # Sorts folders by date modified, most recent checkpoint is the last # Extract `epoch_{i}` or `step_{i}` training_difference = os.path.splitext(path)[0] if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == starting_epoch: if resume_step is not None and step < resume_step: completed_steps += 1 continue with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: total_loss += loss.detach().float() accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: progress_bar.update(1) completed_steps += 1 if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save, ) if accelerator.is_main_process: feature_extractor.save_pretrained(args.output_dir) repo.push_to_hub( commit_message= f"Training in progress {completed_steps} steps", blocking=False, auto_lfs_prune=True, ) if completed_steps >= args.max_train_steps: break model.eval() for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) predictions = outputs.logits.argmax(dim=-1) predictions, references = accelerator.gather_for_metrics( (predictions, batch["labels"])) metric.add_batch( predictions=predictions, references=references, ) eval_metric = metric.compute() logger.info(f"epoch {epoch}: {eval_metric}") if args.with_tracking: accelerator.log( { "accuracy": eval_metric, "train_loss": total_loss.item() / len(train_dataloader), "epoch": epoch, "step": completed_steps, }, step=completed_steps, ) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: feature_extractor.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True) if args.checkpointing_steps == "epoch": output_dir = f"epoch_{epoch}" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: feature_extractor.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) if args.output_dir is not None: with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: json.dump({"eval_accuracy": eval_metric["accuracy"]}, f)
def annotate_documents_(source_path, save_path): documents = load(source_path) annotate_documents(documents, save_path)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_semantic_segmentation", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Load dataset # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. # TODO support datasets from local folders dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir) # Rename column names to standardized names (only "image" and "label" need to be present) if "pixel_values" in dataset["train"].column_names: dataset = dataset.rename_columns({"pixel_values": "image"}) if "annotation" in dataset["train"].column_names: dataset = dataset.rename_columns({"annotation": "label"}) # If we don't have a validation split, split off a percentage of train as validation. data_args.train_val_split = None if "validation" in dataset.keys( ) else data_args.train_val_split if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: split = dataset["train"].train_test_split(data_args.train_val_split) dataset["train"] = split["train"] dataset["validation"] = split["test"] # Prepare label mappings. # We'll include these in the model's config to get human readable labels in the Inference API. if data_args.dataset_name == "scene_parse_150": repo_id = "datasets/huggingface/label-files" filename = "ade20k-id2label.json" else: repo_id = f"datasets/{data_args.dataset_name}" filename = "id2label.json" id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) id2label = {int(k): v for k, v in id2label.items()} label2id = {v: str(k) for k, v in id2label.items()} # Load the mean IoU metric from the datasets package metric = evaluate.load("mean_iou") # Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. @torch.no_grad() def compute_metrics(eval_pred): logits, labels = eval_pred logits_tensor = torch.from_numpy(logits) # scale the logits to the size of the label logits_tensor = nn.functional.interpolate( logits_tensor, size=labels.shape[-2:], mode="bilinear", align_corners=False, ).argmax(dim=1) pred_labels = logits_tensor.detach().cpu().numpy() metrics = metric.compute( predictions=pred_labels, references=labels, num_labels=len(id2label), ignore_index=0, reduce_labels=feature_extractor.reduce_labels, ) # add per category metrics as individual key-value pairs per_category_accuracy = metrics.pop("per_category_accuracy").tolist() per_category_iou = metrics.pop("per_category_iou").tolist() metrics.update({ f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy) }) metrics.update( {f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)}) return metrics config = AutoConfig.from_pretrained( model_args.config_name or model_args.model_name_or_path, label2id=label2id, id2label=id2label, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSemanticSegmentation.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.feature_extractor_name or model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Define torchvision transforms to be applied to each image + target. # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9 # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py train_transforms = Compose([ ReduceLabels() if data_args.reduce_labels else Identity(), RandomCrop(size=feature_extractor.size), RandomHorizontalFlip(flip_prob=0.5), PILToTensor(), ConvertImageDtype(torch.float), Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std), ]) # Define torchvision transform to be applied to each image. # jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) val_transforms = Compose([ ReduceLabels() if data_args.reduce_labels else Identity(), Resize(size=(feature_extractor.size, feature_extractor.size)), PILToTensor(), ConvertImageDtype(torch.float), Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std), ]) def preprocess_train(example_batch): pixel_values = [] labels = [] for image, target in zip(example_batch["image"], example_batch["label"]): image, target = train_transforms(image.convert("RGB"), target) pixel_values.append(image) labels.append(target) encoding = dict() encoding["pixel_values"] = torch.stack(pixel_values) encoding["labels"] = torch.stack(labels) return encoding def preprocess_val(example_batch): pixel_values = [] labels = [] for image, target in zip(example_batch["image"], example_batch["label"]): image, target = val_transforms(image.convert("RGB"), target) pixel_values.append(image) labels.append(target) encoding = dict() encoding["pixel_values"] = torch.stack(pixel_values) encoding["labels"] = torch.stack(labels) return encoding if training_args.do_train: if "train" not in dataset: raise ValueError("--do_train requires a train dataset") if data_args.max_train_samples is not None: dataset["train"] = (dataset["train"].shuffle( seed=training_args.seed).select( range(data_args.max_train_samples))) # Set the training transforms dataset["train"].set_transform(preprocess_train) if training_args.do_eval: if "validation" not in dataset: raise ValueError("--do_eval requires a validation dataset") if data_args.max_eval_samples is not None: dataset["validation"] = (dataset["validation"].shuffle( seed=training_args.seed).select( range(data_args.max_eval_samples))) # Set the validation transforms dataset["validation"].set_transform(preprocess_val) # Initalize our trainer trainer = Trainer( model=model, args=training_args, train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=feature_extractor, data_collator=default_data_collator, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) trainer.save_state() # Evaluation if training_args.do_eval: metrics = trainer.evaluate() trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub kwargs = { "finetuned_from": model_args.model_name_or_path, "dataset": data_args.dataset_name, "tags": ["image-segmentation", "vision"], } if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_glue", model_args, data_args, framework="tensorflow") if not (training_args.do_train or training_args.do_eval or training_args.do_predict): exit( "Must specify at least one of --do_train, --do_eval or --do_predict!" ) # endregion # region Checkpoints checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: checkpoint = get_last_checkpoint(training_args.output_dir) if checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # endregion # region Dataset and labels # Set seed before initializing model. set_seed(training_args.seed) # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee # that only one local process can concurrently download the dataset. datasets = load_dataset( "glue", data_args.task_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 if data_args.predict_file is not None: logger.info("Preparing user-supplied file for predictions...") data_files = {"data": data_args.predict_file} for key in data_files.keys(): logger.info(f"Loading a local file for {key}: {data_files[key]}") if data_args.predict_file.endswith(".csv"): # Loading a dataset from local csv files user_dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) else: # Loading a dataset from local json files user_dataset = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) needed_keys = task_to_keys[data_args.task_name] for key in needed_keys: assert key in user_dataset[ "data"].features, f"Your supplied predict_file is missing the {key} key!" datasets["user_data"] = user_dataset["data"] # endregion # region Load model config and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing sentence1_key, sentence2_key = task_to_keys[data_args.task_name] non_label_column_names = [ name for name in datasets["train"].column_names if name != "label" ] # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if config.label2id != PretrainedConfig( num_labels=num_labels).label2id and not is_regression: # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: int(label_name_to_id[label_list[i]]) for i in range(num_labels) } else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) label_to_id = {label: i for i, label in enumerate(label_list)} if label_to_id is not None: config.label2id = label_to_id config.id2label = {id: label for label, id in config.label2id.items()} elif data_args.task_name is not None and not is_regression: config.label2id = {l: i for i, l in enumerate(label_list)} config.id2label = {id: label for label, id in config.label2id.items()} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) if data_args.pad_to_max_length: data_collator = DefaultDataCollator(return_tensors="tf") else: data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") # endregion # region Metric function metric = evaluate.load("glue", data_args.task_name) def compute_metrics(preds, label_ids): preds = preds["logits"] preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) result = metric.compute(predictions=preds, references=label_ids) if len(result) > 1: result["combined_score"] = np.mean(list(result.values())).item() return result # endregion with training_args.strategy.scope(): # region Load pretrained model if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForSequenceClassification.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Optimizer, loss and compilation optimizer = tf.keras.optimizers.Adam( learning_rate=training_args.learning_rate, beta_1=training_args.adam_beta1, beta_2=training_args.adam_beta2, epsilon=training_args.adam_epsilon, clipnorm=training_args.max_grad_norm, ) if is_regression: loss_fn = tf.keras.losses.MeanSquaredError() metrics = [] else: loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True) metrics = ["accuracy"] model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) # endregion # region Convert data to a tf.data.Dataset tf_data = dict() max_samples = { "train": data_args.max_train_samples, "validation": data_args.max_eval_samples, "validation_matched": data_args.max_eval_samples, "validation_mismatched": data_args.max_eval_samples, "test": data_args.max_predict_samples, "test_matched": data_args.max_predict_samples, "test_mismatched": data_args.max_predict_samples, "user_data": None, } for key in datasets.keys(): if key == "train" or key.startswith("validation"): assert "label" in datasets[ key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True batch_size = training_args.per_device_train_batch_size drop_remainder = True # Saves us worrying about scaling gradients for the last batch else: shuffle = False batch_size = training_args.per_device_eval_batch_size drop_remainder = False samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) data = dataset.to_tf_dataset( columns=[ col for col in dataset.column_names if col not in set(non_label_column_names + ["label"]) ], shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator, drop_remainder=drop_remainder, # `label_cols` is needed for user-defined losses, such as in this example label_cols="label" if "label" in dataset.column_names else None, ) tf_data[key] = data # endregion # region Training and validation if training_args.do_train: callbacks = [ SavePretrainedCallback(output_dir=training_args.output_dir) ] if training_args.do_eval and not data_args.task_name == "mnli": # Do both evaluation and training in the Keras fit loop, unless the task is MNLI # because MNLI has two validation sets validation_data = tf_data["validation"] else: validation_data = None model.fit( tf_data["train"], validation_data=validation_data, epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) # endregion # region Evaluation if training_args.do_eval: # We normally do validation as part of the Keras fit loop, but we run it independently # if there was no fit() step (because we didn't train the model) or if the task is MNLI, # because MNLI has a separate validation-mismatched validation set logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) if data_args.task_name == "mnli": tasks = ["mnli", "mnli-mm"] tf_datasets = [ tf_data["validation_matched"], tf_data["validation_mismatched"] ] raw_datasets = [ datasets["validation_matched"], datasets["validation_mismatched"] ] else: tasks = [data_args.task_name] tf_datasets = [tf_data["validation"]] raw_datasets = [datasets["validation"]] for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): eval_predictions = model.predict(tf_dataset) eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"]) print(f"Evaluation metrics ({task}):") print(eval_metrics) # endregion # region Prediction if training_args.do_predict or data_args.predict_file: logger.info("*** Predict ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [] tf_datasets = [] raw_datasets = [] if training_args.do_predict: if data_args.task_name == "mnli": tasks.extend(["mnli", "mnli-mm"]) tf_datasets.extend( [tf_data["test_matched"], tf_data["test_mismatched"]]) raw_datasets.extend([ datasets["test_matched"], datasets["test_mismatched"] ]) else: tasks.append(data_args.task_name) tf_datasets.append(tf_data["test"]) raw_datasets.append(datasets["test"]) if data_args.predict_file: tasks.append("user_data") tf_datasets.append(tf_data["user_data"]) raw_datasets.append(datasets["user_data"]) for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): test_predictions = model.predict(tf_dataset) if "label" in raw_dataset: test_metrics = compute_metrics(test_predictions, raw_dataset["label"]) print(f"Test metrics ({task}):") print(test_metrics) if is_regression: predictions_to_write = np.squeeze( test_predictions["logits"]) else: predictions_to_write = np.argmax( test_predictions["logits"], axis=1) output_predict_file = os.path.join( training_args.output_dir, f"predict_results_{task}.txt") with open(output_predict_file, "w") as writer: logger.info( f"***** Writing prediction results for {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions_to_write): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = model.config.id2label[item] writer.write(f"{index}\t{item}\n")
def main(): args = parse_args() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_semantic_segmentation_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment accelerator = (Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. # We set device_specific to True as we want different data augmentation per device. if args.seed is not None: set_seed(args.seed, device_specific=True) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Load dataset # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. # TODO support datasets from local folders dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir) # Rename column names to standardized names (only "image" and "label" need to be present) if "pixel_values" in dataset["train"].column_names: dataset = dataset.rename_columns({"pixel_values": "image"}) if "annotation" in dataset["train"].column_names: dataset = dataset.rename_columns({"annotation": "label"}) # If we don't have a validation split, split off a percentage of train as validation. args.train_val_split = None if "validation" in dataset.keys( ) else args.train_val_split if isinstance(args.train_val_split, float) and args.train_val_split > 0.0: split = dataset["train"].train_test_split(args.train_val_split) dataset["train"] = split["train"] dataset["validation"] = split["test"] # Prepare label mappings. # We'll include these in the model's config to get human readable labels in the Inference API. if args.dataset_name == "scene_parse_150": repo_id = "datasets/huggingface/label-files" filename = "ade20k-id2label.json" else: repo_id = f"datasets/{args.dataset_name}" filename = "id2label.json" id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) id2label = {int(k): v for k, v in id2label.items()} label2id = {v: k for k, v in id2label.items()} # Load pretrained model and feature extractor config = AutoConfig.from_pretrained(args.model_name_or_path, id2label=id2label, label2id=label2id) feature_extractor = AutoFeatureExtractor.from_pretrained( args.model_name_or_path) model = AutoModelForSemanticSegmentation.from_pretrained( args.model_name_or_path, config=config) # Preprocessing the datasets # Define torchvision transforms to be applied to each image + target. # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9 # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py train_transforms = Compose([ ReduceLabels() if args.reduce_labels else Identity(), RandomCrop(size=feature_extractor.size), RandomHorizontalFlip(flip_prob=0.5), PILToTensor(), ConvertImageDtype(torch.float), Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std), ]) # Define torchvision transform to be applied to each image. # jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) val_transforms = Compose([ ReduceLabels() if args.reduce_labels else Identity(), Resize(size=(feature_extractor.size, feature_extractor.size)), PILToTensor(), ConvertImageDtype(torch.float), Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std), ]) def preprocess_train(example_batch): pixel_values = [] labels = [] for image, target in zip(example_batch["image"], example_batch["label"]): image, target = train_transforms(image.convert("RGB"), target) pixel_values.append(image) labels.append(target) encoding = dict() encoding["pixel_values"] = torch.stack(pixel_values) encoding["labels"] = torch.stack(labels) return encoding def preprocess_val(example_batch): pixel_values = [] labels = [] for image, target in zip(example_batch["image"], example_batch["label"]): image, target = val_transforms(image.convert("RGB"), target) pixel_values.append(image) labels.append(target) encoding = dict() encoding["pixel_values"] = torch.stack(pixel_values) encoding["labels"] = torch.stack(labels) return encoding with accelerator.main_process_first(): train_dataset = dataset["train"].with_transform(preprocess_train) eval_dataset = dataset["validation"].with_transform(preprocess_val) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer optimizer = torch.optim.AdamW( list(model.parameters()), lr=args.learning_rate, betas=[args.adam_beta1, args.adam_beta2], eps=args.adam_epsilon, ) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): checkpointing_steps = args.checkpointing_steps if args.checkpointing_steps.isdigit(): checkpointing_steps = int(args.checkpointing_steps) else: checkpointing_steps = None # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader, lr_scheduler) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Instantiate metric metric = evaluate.load("mean_iou") # We need to initialize the trackers we use, and also store our configuration. # We initialize the trackers only on main process because `accelerator.log` # only logs on main process and we don't want empty logs/runs on other processes. if args.with_tracking: if accelerator.is_main_process: experiment_config = vars(args) # TensorBoard cannot log Enums, need the raw value experiment_config["lr_scheduler_type"] = experiment_config[ "lr_scheduler_type"].value accelerator.init_trackers("semantic_segmentation_no_trainer", experiment_config) # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print( f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[ -1] # Sorts folders by date modified, most recent checkpoint is the last # Extract `epoch_{i}` or `step_{i}` training_difference = os.path.splitext(path)[0] if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) for epoch in range(starting_epoch, args.num_train_epochs): if args.with_tracking: total_loss = 0 model.train() for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == starting_epoch: if resume_step is not None and step < resume_step: completed_steps += 1 continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save, ) if accelerator.is_main_process: feature_extractor.save_pretrained(args.output_dir) repo.push_to_hub( commit_message= f"Training in progress {completed_steps} steps", blocking=False, auto_lfs_prune=True, ) if completed_steps >= args.max_train_steps: break logger.info("***** Running evaluation *****") model.eval() samples_seen = 0 for step, batch in enumerate( tqdm(eval_dataloader, disable=not accelerator.is_local_main_process)): with torch.no_grad(): outputs = model(**batch) upsampled_logits = torch.nn.functional.interpolate( outputs.logits, size=batch["labels"].shape[-2:], mode="bilinear", align_corners=False) predictions = upsampled_logits.argmax(dim=1) predictions, references = accelerator.gather( (predictions, batch["labels"])) # If we are in a multiprocess environment, the last batch has duplicates if accelerator.num_processes > 1: if step == len(eval_dataloader) - 1: predictions = predictions[:len(eval_dataloader.dataset) - samples_seen] references = references[:len(eval_dataloader.dataset) - samples_seen] else: samples_seen += references.shape[0] metric.add_batch( predictions=predictions, references=references, ) eval_metrics = metric.compute( num_labels=len(id2label), ignore_index=255, reduce_labels=False, # we've already reduced the labels before ) logger.info(f"epoch {epoch}: {eval_metrics}") if args.with_tracking: accelerator.log( { "mean_iou": eval_metrics["mean_iou"], "mean_accuracy": eval_metrics["mean_accuracy"], "overall_accuracy": eval_metrics["overall_accuracy"], "train_loss": total_loss.item() / len(train_dataloader), "epoch": epoch, "step": completed_steps, }, step=completed_steps, ) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: feature_extractor.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True) if args.checkpointing_steps == "epoch": output_dir = f"epoch_{epoch}" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: feature_extractor.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: json.dump( { "eval_overall_accuracy": eval_metrics["overall_accuracy"] }, f)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_xnli", model_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. # Downloading and loading xnli dataset from the hub. if training_args.do_train: if model_args.train_language is None: train_dataset = load_dataset( "xnli", model_args.language, split="train", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: train_dataset = load_dataset( "xnli", model_args.train_language, split="train", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) label_list = train_dataset.features["label"].names if training_args.do_eval: eval_dataset = load_dataset( "xnli", model_args.language, split="validation", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) label_list = eval_dataset.features["label"].names if training_args.do_predict: predict_dataset = load_dataset( "xnli", model_args.language, split="test", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) label_list = predict_dataset.features["label"].names # Labels num_labels = len(label_list) # Load pretrained model and tokenizer # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task="xnli", cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, do_lower_case=model_args.do_lower_case, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) # Preprocessing the datasets # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False def preprocess_function(examples): # Tokenize the texts return tokenizer( examples["premise"], examples["hypothesis"], padding=padding, max_length=data_args.max_seq_length, truncation=True, ) if training_args.do_train: if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first(desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") if training_args.do_eval: if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) with training_args.main_process_first(desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if training_args.do_predict: if data_args.max_predict_samples is not None: max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select(range(max_predict_samples)) with training_args.main_process_first(desc="prediction dataset map pre-processing"): predict_dataset = predict_dataset.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) # Get the metric function metric = evaluate.load("xnli") # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.argmax(preds, axis=1) return metric.compute(predictions=preds, references=p.label_ids) # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Prediction if training_args.do_predict: logger.info("*** Predict ***") predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict") max_predict_samples = ( data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) ) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) predictions = np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.output_dir, "predictions.txt") if trainer.is_world_process_zero(): with open(output_predict_file, "w") as writer: writer.write("index\tprediction\n") for index, item in enumerate(predictions): item = label_list[item] writer.write(f"{index}\t{item}\n")