Example #1
def init_dataset_for_finetuning(model_args,
                                data_args,
                                training_args,
                                last_checkpoint=None):
    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning we need to add the labels and task name to the config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)
    check_sparsity_callback(model, model_args)

    # Tokenizing and preprocessing the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info(f"Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer,
                                                  data_args, model, num_labels,
                                                  label_list, is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if data_args.task_name == "mnli"
        else "validation"]

    test_dataset = None
    if (data_args.task_name is not None or data_args.test_file is not None):
        if training_args.do_predict:
            test_dataset = tokenized_datasets[
                "test_matched" if data_args.task_name == "mnli" else "test"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    return (tokenizer, data_collator, train_dataset, eval_dataset,
            test_dataset, model, is_regression, tokenized_datasets, label_list,
            config)
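
A minimal sketch of how a caller might consume the tuple returned above; the
Trainer construction mirrors what Example #2 does through init_trainer, but the
plain transformers.Trainer shown here (and the surrounding argument objects) are
assumptions for illustration, not part of the original code.

# Hypothetical caller (sketch): unpack the returned tuple and build a Trainer.
# `model_args`, `data_args` and `training_args` are assumed to come from the
# project's argument dataclasses.
from transformers import Trainer

(tokenizer, data_collator, train_dataset, eval_dataset, test_dataset,
 model, is_regression, tokenized_datasets, label_list, config) = \
    init_dataset_for_finetuning(model_args, data_args, training_args)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,  # None falls back to dynamic padding
)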
Example #2
def run_finetuning_single_task(model_args,
                               data_args,
                               training_args,
                               last_checkpoint=None):
    """On a single task train, evaluate, and save results"""

    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning we need to add the labels and task name to the config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)

    # Tokenizing and preprocessing the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info(f"Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer,
                                                  data_args, model, num_labels,
                                                  label_list, is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if data_args.task_name == "mnli"
        else "validation"]
    test_dataset = None
    if ((data_args.task_name is not None or data_args.test_file is not None)
            and training_args.do_predict):
        test_dataset = tokenized_datasets[
            "test_matched" if data_args.task_name == "mnli" else "test"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=init_model(model_args, config, tokenizer),
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression)
    if training_args.do_train:
        train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        logging.info("*** Evaluate ***")

        # Handle special case of extra validation dataset for MNLI
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(tokenized_datasets["validation_mismatched"])

        eval_results = evaluate_tasks(trainer, training_args.output_dir, tasks,
                                      eval_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # There is a known issue when training multiple models in sequence with this
    # code: a small amount of GPU memory leaks on the model and remains allocated
    # after each run, accumulating across runs until it fails with OOM after
    # about 20 runs, even when all GPU tensors are explicitly deleted, garbage is
    # collected and the cache is cleared. Multiple solutions were tried, but this
    # odd little hack is the only thing that worked.
    model.to("cpu")

    return eval_results
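
A possible entry point for the function above, sketched under the assumption that
the project defines ModelArguments and DataTrainingArguments dataclasses
compatible with transformers.HfArgumentParser; those two names are illustrative,
and only TrainingArguments comes from transformers itself.

# Hypothetical entry point (sketch).
# `ModelArguments` and `DataTrainingArguments` are assumed project dataclasses.
import logging

from transformers import HfArgumentParser, TrainingArguments

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    eval_results = run_finetuning_single_task(model_args, data_args,
                                              training_args)
    logging.info(f"Eval results: {eval_results}")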
Example #3
def init_dataset_for_finetuning(model_args,
                                data_args,
                                training_args,
                                last_checkpoint=None):

    # TODO
    # edit multi_eval_sets so you can gather not just multiple eval sets
    # for a single task, but eval sets from multiple tasks
    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning we need to add the labels and task name to the config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)
    check_sparsity_callback(model, model_args)
    check_mnli(model_args, data_args.task_name)
    # Tokenizing and preprocessing the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info(f"Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer,
                                                  data_args, model, num_labels,
                                                  label_list, is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]

    # Allow multiple eval sets. For now, assume mnli is the only case
    eval_dataset = []
    if data_args.task_name == "mnli":
        if "eval_sets" in training_args.trainer_mixin_args:
            for eval_set in training_args.trainer_mixin_args["eval_sets"]:
                eval_dataset.append(tokenized_datasets[eval_set])
        else:
            eval_dataset.append(tokenized_datasets["validation_matched"])
    else:
        eval_dataset.append(tokenized_datasets["validation"])

    # If only one eval set, no need for a list
    if len(eval_dataset) == 1:
        eval_dataset = eval_dataset[0]

    test_dataset = None
    if (data_args.task_name is not None or data_args.test_file is not None):
        if training_args.do_predict:
            test_dataset = tokenized_datasets[
                "test_matched" if data_args.task_name == "mnli" else "test"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    return (tokenizer, data_collator, train_dataset, eval_dataset,
            test_dataset, model, is_regression, tokenized_datasets, label_list,
            config)
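
To exercise the multi-eval-set branch above for MNLI, the trainer mixin args
might be configured roughly as below; where exactly trainer_mixin_args lives on
training_args is project-specific, so treat this as an assumption rather than
the documented interface.

# Hypothetical configuration (sketch): evaluate on both MNLI validation splits.
# Assumes `training_args.trainer_mixin_args` is a plain dict, since the code
# above indexes it with "eval_sets".
training_args.trainer_mixin_args = {
    "eval_sets": ["validation_matched", "validation_mismatched"],
}

# init_dataset_for_finetuning will then return `eval_dataset` as a list with
# one tokenized dataset per entry in "eval_sets".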