def run_finetuning_single_task(model_args, data_args, training_args,
                               last_checkpoint=None):
    """On a single task train, evaluate, and save results"""

    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning required to add labels and task name to config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)

    # Tokenizing and preprocessing the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer, data_args,
                                                  model, num_labels, label_list,
                                                  is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if data_args.task_name == "mnli" else "validation"
    ]
    test_dataset = None
    if ((data_args.task_name is not None or data_args.test_file is not None)
            and training_args.do_predict):
        test_dataset = tokenized_datasets[
            "test_matched" if data_args.task_name == "mnli" else "test"
        ]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=init_model(model_args, config, tokenizer),
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression,
    )
    if training_args.do_train:
        train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        logging.info("*** Evaluate ***")

        # Handle special case of extra validation dataset for MNLI
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(tokenized_datasets["validation_mismatched"])

        eval_results = evaluate_tasks(trainer, training_args.output_dir,
                                      tasks, eval_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # There is an existing issue on training multiple models in sequence in this code
    # There is a memory leakage on the model, a small amount of GPU memory remains after
    # the run and accumulates over several runs. It fails with OOM after about 20 runs,
    # even when all tensors on GPU are explicitly deleted, garbage is collected and
    # cache is cleared. Tried multiple solutions but this weird little hack is the only
    # thing that worked.
    model.to("cpu")

    return eval_results
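

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): one plausible way to
# drive run_finetuning_single_task from the command line. `ModelArguments` and
# `DataTrainingArguments` are assumed to be dataclasses defined elsewhere in
# this project; `HfArgumentParser` and `TrainingArguments` are the standard
# Hugging Face classes.
# ---------------------------------------------------------------------------
def example_finetune_cli():
    from transformers import HfArgumentParser, TrainingArguments

    # Parse CLI flags such as --model_name_or_path, --task_name and
    # --output_dir into the three argument objects used above.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))  # assumed project dataclasses
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    eval_results = run_finetuning_single_task(
        model_args, data_args, training_args, last_checkpoint=None)
    logging.info(f"Eval results: {eval_results}")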
def run_finetuning_single_task(
    model_args, data_args, training_args, last_checkpoint=None, run_idx=None,
):
    """On a single task train, evaluate, and save results"""

    # TODO
    # accept run# as an argument for finetuning with multiple runs on a single task
    # update the save directory to include run#

    tokenizer, data_collator, train_dataset, eval_dataset, test_dataset, model, \
        is_regression, tokenized_datasets, label_list, config = \
        init_dataset_for_finetuning(
            model_args, data_args, training_args, last_checkpoint
        )

    # Code safety
    check_eval_and_max_steps(training_args, train_dataset)
    training_args = check_best_metric(training_args, data_args.task_name)

    # Update where model is saved for each run
    training_args = update_run_number(training_args, run_idx)

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=model,
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression,
    )
    if training_args.do_train:
        train(trainer, training_args.output_dir, training_args.rm_checkpoints,
              last_checkpoint)

    # Evaluate. Initialize to an empty dict so the return below is defined
    # even when do_eval is False.
    eval_results = {}
    if training_args.do_eval:
        eval_results = evaluate_tasks_handler(trainer, data_args, model_args,
                                              training_args, eval_dataset,
                                              tokenized_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # TODO
    # Remove any unnecessary checkpoints to reduce space demands
    if training_args.load_best_model_at_end:
        pass
        # find best model checkpoint
        # delete the rest

    # There is an existing issue on training multiple models in sequence in this code
    # There is a memory leakage on the model, a small amount of GPU memory remains after
    # the run and accumulates over several runs. It fails with OOM after about 20 runs,
    # even when all tensors on GPU are explicitly deleted, garbage is collected and
    # cache is cleared. Tried multiple solutions but this weird little hack is the only
    # thing that worked.
    model.to("cpu")

    return eval_results
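

# ---------------------------------------------------------------------------
# Hedged sketch (not the project's implementation): `update_run_number` above
# is provided elsewhere in this project. A minimal version might simply nest
# the output directory per run so repeated runs on the same task do not
# overwrite each other. The `example_` prefix avoids shadowing the real helper.
# ---------------------------------------------------------------------------
def example_update_run_number(training_args, run_idx):
    import copy
    import os

    # Leave the arguments untouched when no run index is given.
    if run_idx is None:
        return training_args

    # Copy so the caller's TrainingArguments are not mutated, then point
    # output_dir at a per-run subdirectory such as .../run_3.
    training_args = copy.deepcopy(training_args)
    training_args.output_dir = os.path.join(
        training_args.output_dir, f"run_{run_idx}")
    return training_args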
def run_pretraining(model_args, data_args, training_args, last_checkpoint=None):
    """Pretrain and evaluate a language model"""

    logging.info("Pre-training a masked language model.")

    datasets, tokenized_datasets, dataset_path = init_datasets_mlm(data_args)

    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)

    if tokenized_datasets is None:
        # Tokenizing and preprocessing the datasets for language modeling
        if training_args.do_train:
            column_names = datasets["train"].column_names
        else:
            column_names = datasets["validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        logging.info("Tokenizing datasets for pretraining ...")
        tokenized_datasets = preprocess_datasets_mlm(datasets, tokenizer, data_args,
                                                     column_names, text_column_name)

        # Save only if a dataset_path has been defined in the previous steps
        # that will be True only when loading from dataset hub
        if data_args.save_tokenized_data and dataset_path is not None:
            logging.info(f"Saving tokenized dataset to {dataset_path}")
            tokenized_datasets.save_to_disk(dataset_path)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will take care of randomly masking the tokens.
    # argument defined in experiment config
    assert hasattr(transformers, data_args.data_collator), \
        f"Data collator {data_args.data_collator} not available"
    data_collator = getattr(transformers, data_args.data_collator)(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Run hp search or regular training
    if model_args.hp_num_trials >= 1:
        run_hyperparameter_search(
            model_args=model_args,
            config=config,
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )
    else:
        trainer = init_trainer(
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset if training_args.do_train else None,
            eval_dataset=eval_dataset if training_args.do_eval else None,
            model=init_model(model_args, config, tokenizer),
            trainer_class=model_args.trainer_class,
            trainer_callbacks=model_args.trainer_callbacks or None,
        )
        if training_args.do_train:
            train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate in full eval dataset.
    # if using hp search, load best model before running evaluate
    if training_args.do_eval:
        logging.info("*** Evaluate ***")
        evaluate_language_model(trainer, eval_dataset, training_args.output_dir)
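

# ---------------------------------------------------------------------------
# Hedged illustration (not from the original file): the getattr-based collator
# lookup above resolves a class name from the experiment config. For example,
# with data_collator="DataCollatorForLanguageModeling" and mlm_probability=0.15
# it is equivalent to the direct construction below. The tokenizer name is only
# a placeholder.
# ---------------------------------------------------------------------------
def example_mlm_collator():
    from transformers import AutoTokenizer, DataCollatorForLanguageModeling

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder model
    # The collator randomly masks 15% of the tokens on the fly for MLM.
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=0.15)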
def run_finetuning_single_task_with_hp_search(model_args, data_args, training_args,
                                              last_checkpoint=None):
    """On a single task train, evaluate, and save results"""

    # Init dataset (same as without hp search)
    tokenizer, data_collator, train_dataset, eval_dataset, test_dataset, model, \
        is_regression, tokenized_datasets, label_list, config = \
        init_dataset_for_finetuning(
            model_args, data_args, training_args, last_checkpoint,
        )

    # Defines defaults required for hp search
    training_args.load_best_model_at_end = True
    training_args.disable_tqdm = True  # competes with ray output
    training_args.metric_for_best_model = model_args.hp_compute_objective[1]
    training_args.do_eval = False
    training_args.do_predict = False

    # Code safety run a second time due to training_args being changed above
    check_eval_and_max_steps(training_args, train_dataset)
    training_args = check_best_metric(training_args, data_args.task_name)
    model_args = check_hp_compute_objective(model_args, data_args.task_name)
    check_sparsity_callback(model, model_args)

    # Get fraction of the validation dataset to use in hp search
    if model_args.hp_validation_dataset_pct < 1:
        hp_eval_dataset = eval_dataset.shard(
            index=1, num_shards=int(1 / model_args.hp_validation_dataset_pct))
    else:
        hp_eval_dataset = eval_dataset

    # Specify how to re-init model each training run.
    def model_init():
        model_kwargs = dict(
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        model = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path, **model_kwargs)
        check_sparsity_callback(model, model_args)
        return model

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset,
        eval_dataset=hp_eval_dataset,
        trainer_class=model_args.trainer_class,  # changed
        trainer_callbacks=model_args.trainer_callbacks or None,
        model_init=model_init,  # changed
        task_name=data_args.task_name,  # does it matter?
        is_regression=is_regression,  # does it matter?
        finetuning=True,  # see if it fixes key error issue
    )

    hp_search_kwargs = dict(
        direction=model_args.hp_compute_objective[0],
        backend="ray",
        n_trials=model_args.hp_num_trials,
        hp_space=model_args.hp_space,
        compute_objective=partial(
            compute_objective, objective=model_args.hp_compute_objective[1]),
        local_dir=training_args.output_dir,
        resources_per_trial=model_args.hp_resources_per_trial,
        checkpoint_freq=0,
        keep_checkpoints_num=0,
        checkpoint_at_end=False,
    )

    # TODO
    # Get wandb to log properly
    # trainer.hyperparameter_search calls ray.tune()
    # you can set config or callbacks as a kwarg to trainer.hyperparameter_search
    # which gets passed to ray.tune

    # Update any extra kwargs defined in config
    hp_search_kwargs.update(**model_args.hp_extra_kwargs)

    # Run hp search and save results
    best_run = trainer.hyperparameter_search(**hp_search_kwargs)
    logging.info(f"Best run: {best_run}")

    # Delete all saved models and checkpoints to save space. All we need
    # are the params and scores. Currently this is a hack that is specific
    # to ray.
    rm_prefixed_subdirs(training_args.output_dir, "run-")

    hp_res_file_name = f"best_run_results_{model_args.hp_compute_objective[1]}.txt"
    hp_res_file = os.path.join(training_args.output_dir, hp_res_file_name)
    write_new = check_if_current_hp_best(hp_res_file, model_args, best_run)
    if trainer.is_world_process_zero() and write_new:
        with open(hp_res_file, "w") as writer:
            writer.write("Hyperparameter search best run:\n")
            writer.write(f"run_id = {best_run.run_id}\n")
            writer.write(f"{training_args.metric_for_best_model}")
            writer.write(f"= {best_run.objective}\n")
            writer.write("\nHyperparameters:\n")
            for key, value in sorted(best_run.hyperparameters.items()):
                writer.write(f"{key} = {value}\n")

    # There is an existing issue on training multiple models in sequence in this code
    # There is a memory leakage on the model, a small amount of GPU memory remains after
    # the run and accumulates over several runs. It fails with OOM after about 20 runs,
    # even when all tensors on GPU are explicitly deleted, garbage is collected and
    # cache is cleared. Tried multiple solutions but this weird little hack is the only
    # thing that worked.
    model.to("cpu")

    return {}
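

# ---------------------------------------------------------------------------
# Hedged sketch (not from the original file): plausible shapes for the
# `compute_objective` and `hp_space` pieces wired into hp_search_kwargs above.
# The project's real versions live elsewhere; the hyperparameter ranges below
# are illustrative only.
# ---------------------------------------------------------------------------
def example_compute_objective(metrics, objective="eval_accuracy"):
    # Pull a single scalar out of the metrics dict returned by evaluation;
    # this is the value ray.tune maximizes or minimizes.
    return metrics[objective]


def example_hp_space(trial):
    # With backend="ray", hp_space returns a dict of ray.tune search spaces.
    from ray import tune

    return {
        "learning_rate": tune.loguniform(1e-5, 1e-4),
        "per_device_train_batch_size": tune.choice([16, 32]),
        "num_train_epochs": tune.choice([2, 3, 4]),
    }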