示例#1
0
def run_finetuning_single_task(model_args,
                               data_args,
                               training_args,
                               last_checkpoint=None):
    """
    On a single task train, evaluate, and save results.

    The model is created once with ``finetuning=True``. That same instance is
    used while preprocessing the datasets, is handed to the trainer, and is
    moved back to the CPU at the end of the run.

    :param model_args: model options; ``trainer_callbacks`` is read here and
        the object is forwarded to the ``init_*`` helpers
    :param data_args: data options; ``task_name``, ``test_file`` and
        ``pad_to_max_length`` are read here
    :param training_args: training options; ``do_train``, ``do_eval``,
        ``do_predict``, ``fp16`` and ``output_dir`` are read here
    :param last_checkpoint: forwarded unchanged to ``train`` (presumably a
        checkpoint to resume from -- confirm against ``train``)
    :return: the value returned by ``evaluate_tasks`` when
        ``training_args.do_eval`` is set, otherwise an empty dict
    """

    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning required to add labels and task name to config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)

    # Tokenizing and preprocessing the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer,
                                                  data_args, model, num_labels,
                                                  label_list, is_regression)

    # MNLI ships matched/mismatched variants of its validation and test splits
    is_mnli = data_args.task_name == "mnli"

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if is_mnli else "validation"]
    test_dataset = None
    if ((data_args.task_name is not None or data_args.test_file is not None)
            and training_args.do_predict):
        test_dataset = tokenized_datasets[
            "test_matched" if is_mnli else "test"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Train
    # Reuse the model built above instead of initializing a second one: the
    # previous code passed a fresh `init_model(...)` created without
    # `finetuning=True`, so the trainer never saw the model that was used for
    # preprocessing and that is moved to the CPU at the end of this function.
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=model,
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression)
    if training_args.do_train:
        train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        logging.info("*** Evaluate ***")

        # Handle special case of extra validation dataset for MNLI
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if is_mnli:
            tasks.append("mnli-mm")
            eval_datasets.append(tokenized_datasets["validation_mismatched"])

        eval_results = evaluate_tasks(trainer, training_args.output_dir, tasks,
                                      eval_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if is_mnli:
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # There is an existing issue on training multiple models in sequence in this code
    # There is a memory leakage on the model, a small amount of GPU memory remains after
    # the run and accumulates over several runs. It fails with OOM after about 20 runs,
    # even when all tensors on GPU are explicitly deleted, garbage is collected and
    # cache is cleared. Tried multiple solutions but this weird little hack is the only
    # thing that worked.
    model.to("cpu")

    return eval_results
示例#2
0
def run_finetuning_single_task(
    model_args,
    data_args,
    training_args,
    last_checkpoint=None,
    run_idx=None,
):
    """
    On a single task train, evaluate, and save results.

    :param model_args: model options; ``trainer_callbacks`` is read here and
        the object is forwarded to the helpers
    :param data_args: data options; ``task_name`` is read here
    :param training_args: training options; ``do_train``, ``do_eval``,
        ``do_predict``, ``output_dir``, ``rm_checkpoints`` and
        ``load_best_model_at_end`` are read here. The object is replaced by
        the return values of ``check_best_metric`` and ``update_run_number``
    :param last_checkpoint: forwarded to ``init_dataset_for_finetuning`` and
        ``train``
    :param run_idx: forwarded to ``update_run_number`` so each run can be
        saved separately (exact effect is defined by that helper)
    :return: the value returned by ``evaluate_tasks_handler`` when
        ``training_args.do_eval`` is set, otherwise an empty dict
    """

    tokenizer, data_collator, train_dataset, eval_dataset, test_dataset, model, \
        is_regression, tokenized_datasets, label_list, config = \
        init_dataset_for_finetuning(
            model_args, data_args, training_args, last_checkpoint
        )

    # Code safety
    check_eval_and_max_steps(training_args, train_dataset)
    training_args = check_best_metric(training_args, data_args.task_name)

    # Update where model is saved for each run
    training_args = update_run_number(training_args, run_idx)

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=model,
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression)

    if training_args.do_train:
        train(trainer, training_args.output_dir, training_args.rm_checkpoints,
              last_checkpoint)

    # Evaluate
    # Default to an empty dict so the final `return` is always bound; without
    # it, running with `do_eval=False` raised UnboundLocalError.
    eval_results = {}
    if training_args.do_eval:
        eval_results = evaluate_tasks_handler(trainer, data_args, model_args,
                                              training_args, eval_dataset,
                                              tokenized_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # TODO
    # Remove any unnecessary checkpoints to reduce space demands
    if training_args.load_best_model_at_end:
        pass
        # find best model checkpoint
        # delete the rest

    # There is an existing issue on training multiple models in sequence in this code
    # There is a memory leakage on the model, a small amount of GPU memory remains after
    # the run and accumulates over several runs. It fails with OOM after about 20 runs,
    # even when all tensors on GPU are explicitly deleted, garbage is collected and
    # cache is cleared. Tried multiple solutions but this weird little hack is the only
    # thing that worked.
    model.to("cpu")

    return eval_results
示例#3
0
def run_pretraining(model_args,
                    data_args,
                    training_args,
                    last_checkpoint=None):
    """
    Pretrain and evaluate a language model.

    Runs either a hyperparameter search (``model_args.hp_num_trials >= 1``)
    or a regular training run. Evaluation on the full eval dataset is only
    performed after a regular run, because the hyperparameter search branch
    does not produce a trainer in this function.

    :param model_args: model options; ``hp_num_trials``, ``trainer_class``
        and ``trainer_callbacks`` are read here
    :param data_args: data options; ``save_tokenized_data``,
        ``data_collator`` and ``mlm_probability`` are read here
    :param training_args: training options; ``do_train``, ``do_eval`` and
        ``output_dir`` are read here
    :param last_checkpoint: forwarded unchanged to ``train``
    :return: None
    """

    logging.info("Pre-training a masked language model.")

    datasets, tokenized_datasets, dataset_path = init_datasets_mlm(data_args)

    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)

    if tokenized_datasets is None:
        # Tokenizing and preprocessing the datasets for language modeling
        if training_args.do_train:
            column_names = datasets["train"].column_names
        else:
            column_names = datasets["validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        logging.info("Tokenizing datasets for pretraining ...")
        tokenized_datasets = preprocess_datasets_mlm(datasets, tokenizer,
                                                     data_args, column_names,
                                                     text_column_name)

        # Save only if a dataset_path has been defined in the previous steps
        # that will be True only when loading from dataset hub
        if data_args.save_tokenized_data and dataset_path is not None:
            logging.info(f"Saving tokenized dataset to {dataset_path}")
            tokenized_datasets.save_to_disk(dataset_path)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will take care of randomly masking the tokens.
    # argument defined in experiment config
    assert hasattr(transformers, data_args.data_collator), \
        f"Data collator {data_args.data_collator} not available"
    data_collator = getattr(transformers, data_args.data_collator)(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Stays None on the hp search path, which builds no trainer here
    trainer = None

    # Run hp search or regular training
    if model_args.hp_num_trials >= 1:
        run_hyperparameter_search(
            model_args=model_args,
            config=config,
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )
    else:
        trainer = init_trainer(
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset if training_args.do_train else None,
            eval_dataset=eval_dataset if training_args.do_eval else None,
            model=init_model(model_args, config, tokenizer),
            trainer_class=model_args.trainer_class,
            trainer_callbacks=model_args.trainer_callbacks or None,
        )
        if training_args.do_train:
            train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate in full eval dataset.
    # TODO: if using hp search, load best model before running evaluate
    if training_args.do_eval:
        if trainer is None:
            # Previously this path crashed with a NameError on `trainer`.
            logging.warning(
                "Skipping evaluation: no trainer is available after "
                "hyperparameter search.")
        else:
            logging.info("*** Evaluate ***")
            evaluate_language_model(trainer, eval_dataset,
                                    training_args.output_dir)
示例#4
0
def run_finetuning_single_task_with_hp_search(model_args,
                                              data_args,
                                              training_args,
                                              last_checkpoint=None):
    """
    Run a hyperparameter search for finetuning on a single task.

    Datasets are initialized the same way as for a regular finetuning run.
    ``training_args`` is mutated in place with the defaults the search needs
    (evaluation and prediction are switched off). The search itself is
    delegated to ``trainer.hyperparameter_search`` with the "ray" backend.
    Afterwards the ``run-`` prefixed subdirectories of the output directory
    are removed and, when ``check_if_current_hp_best`` says so, the best run
    is written to a text file in the output directory.

    :param model_args: model options; the ``hp_*`` fields, ``trainer_class``,
        ``trainer_callbacks`` and the fields needed to load the pretrained
        model are read here
    :param data_args: data options; ``task_name`` is read here
    :param training_args: training options; modified in place (see above)
    :param last_checkpoint: forwarded to ``init_dataset_for_finetuning``
    :return: always an empty dict
    """

    # Init dataset (same as without hp search)
    tokenizer, data_collator, train_dataset, eval_dataset, test_dataset, model, \
        is_regression, tokenized_datasets, label_list, config = \
        init_dataset_for_finetuning(
            model_args, data_args, training_args, last_checkpoint,
        )

    # Defines defaults required for hp search
    training_args.load_best_model_at_end = True
    training_args.disable_tqdm = True  # competes with ray output
    training_args.metric_for_best_model = model_args.hp_compute_objective[1]
    training_args.do_eval = False
    training_args.do_predict = False

    # Code safety run a second time due to training_args being changed above
    check_eval_and_max_steps(training_args, train_dataset)
    training_args = check_best_metric(training_args, data_args.task_name)
    model_args = check_hp_compute_objective(model_args, data_args.task_name)
    check_sparsity_callback(model, model_args)

    # Get fraction of the validation dataset to use in hp search
    hp_eval_dataset = eval_dataset
    if model_args.hp_validation_dataset_pct < 1:
        num_shards = int(1 / model_args.hp_validation_dataset_pct)
        # A pct in (0.5, 1) truncates to a single shard, for which shard
        # index 1 does not exist (assuming a Hugging Face `Dataset`, whose
        # `shard` requires index < num_shards). Keep the full validation set
        # in that case instead of failing.
        if num_shards > 1:
            hp_eval_dataset = eval_dataset.shard(
                index=1, num_shards=num_shards)

    # Specify how to re-init model each training run.
    def model_init():
        model_kwargs = dict(
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        fresh_model = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path, **model_kwargs)

        check_sparsity_callback(fresh_model, model_args)
        return fresh_model

    # Train
    # No `model` is passed: the trainer gets `model_init` instead.
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset,
        eval_dataset=hp_eval_dataset,
        trainer_class=model_args.trainer_class,
        trainer_callbacks=model_args.trainer_callbacks or None,
        model_init=model_init,
        task_name=data_args.task_name,
        is_regression=is_regression,
        finetuning=True,
    )

    hp_search_kwargs = dict(
        direction=model_args.hp_compute_objective[0],
        backend="ray",
        n_trials=model_args.hp_num_trials,
        hp_space=model_args.hp_space,
        compute_objective=partial(
            compute_objective, objective=model_args.hp_compute_objective[1]),
        local_dir=training_args.output_dir,
        resources_per_trial=model_args.hp_resources_per_trial,
        checkpoint_freq=0,
        keep_checkpoints_num=0,
        checkpoint_at_end=False,
    )

    # TODO
    # Get wandb to log properly
    # trainer.hyperparameter_search calls ray.tune()
    # you can set config or callbacks as a kwarg to trainer.hyperparameter_search
    # which gets passed to ray.tune

    # Update any extra kwargs defined in config
    hp_search_kwargs.update(**model_args.hp_extra_kwargs)

    # Run hp search and save results
    best_run = trainer.hyperparameter_search(**hp_search_kwargs)
    logging.info(f"Best run: {best_run}")

    # Delete all saved models and checkpoints to save space. All we need
    # are the params and scores. Currently this is a hack that is specific
    # to ray.
    rm_prefixed_subdirs(training_args.output_dir, "run-")

    hp_res_file_name = f"best_run_results_{model_args.hp_compute_objective[1]}.txt"
    hp_res_file = os.path.join(training_args.output_dir, hp_res_file_name)
    write_new = check_if_current_hp_best(hp_res_file, model_args, best_run)

    # The exact text layout is kept unchanged in case the results file is
    # read back elsewhere (e.g. by check_if_current_hp_best).
    if trainer.is_world_process_zero() and write_new:
        with open(hp_res_file, "w") as writer:
            writer.write("Hyperparameter search best run:\n")
            writer.write(f"run_id = {best_run.run_id}\n")
            writer.write(f"{training_args.metric_for_best_model}")
            writer.write(f"= {best_run.objective}\n")
            writer.write("\nHyperparameters:\n")
            for key, value in sorted(best_run.hyperparameters.items()):
                writer.write(f"{key} = {value}\n")

    # There is an existing issue on training multiple models in sequence in this code
    # There is a memory leakage on the model, a small amount of GPU memory remains after
    # the run and accumulates over several runs. It fails with OOM after about 20 runs,
    # even when all tensors on GPU are explicitly deleted, garbage is collected and
    # cache is cleared. Tried multiple solutions but this weird little hack is the only
    # thing that worked.
    model.to("cpu")

    return {}