Exemplo n.º 1
0
def init_dataset_for_squad(model_args,
                           data_args,
                           training_args,
                           last_checkpoint=None):
    """Build everything needed to run a SQuAD experiment.

    Loads the raw SQuAD datasets, creates the config, tokenizer and model
    (with ``finetuning=True`` and ``squad=True``), runs the sparsity check on
    the model, tokenizes the datasets and selects a data collator.

    Args:
        model_args: Model options, forwarded to the ``init_*`` helpers.
        data_args: Data options; ``pad_to_max_length`` selects the collator.
        training_args: Training options; ``fp16`` controls padding multiples.
        last_checkpoint: Accepted for interface parity; not used here.

    Returns:
        Tuple ``(tokenizer, data_collator, train_dataset, eval_dataset,
        eval_examples, model, answer_column_name)``.
    """
    raw_datasets = init_datasets_squad(data_args, model_args)

    # No extra config overrides are required yet; the empty dict is a
    # placeholder.
    config = init_config(model_args, extra_config_kwargs={})
    tokenizer = init_tokenizer(model_args)
    model = init_model(
        model_args, config, tokenizer, finetuning=True, squad=True
    )
    check_sparsity_callback(model, model_args)

    logging.info("Tokenizing datasets for squad ...")
    preprocessed = preprocess_datasets_squad(
        raw_datasets, tokenizer, training_args, data_args
    )
    train_dataset, eval_dataset, eval_examples, answer_column_name = preprocessed

    if not data_args.pad_to_max_length:
        # Pad dynamically per batch; under fp16 pad to a multiple of 8.
        multiple = 8 if training_args.fp16 else None
        data_collator = DataCollatorWithPadding(
            tokenizer, pad_to_multiple_of=multiple
        )
    else:
        # Inputs are already padded, so the default collator is enough.
        data_collator = default_data_collator

    return (
        tokenizer,
        data_collator,
        train_dataset,
        eval_dataset,
        eval_examples,
        model,
        answer_column_name,
    )
Exemplo n.º 2
0
def init_dataset_for_finetuning(model_args,
                                data_args,
                                training_args,
                                last_checkpoint=None):
    """Prepare model, tokenizer, datasets and collator for a finetuning task.

    Args:
        model_args: Model options, forwarded to the ``init_*`` helpers.
        data_args: Data options (``task_name``, ``test_file``,
            ``pad_to_max_length``).
        training_args: Training options (``do_predict``, ``fp16``).
        last_checkpoint: Accepted for interface parity; not used here.

    Returns:
        Tuple ``(tokenizer, data_collator, train_dataset, eval_dataset,
        test_dataset, model, is_regression, tokenized_datasets, label_list,
        config)``. ``test_dataset`` is ``None`` unless prediction is
        requested; ``data_collator`` is ``None`` when neither padding to max
        length nor fp16 is enabled.
    """
    raw_datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(raw_datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # The config needs the label count and the task name for finetuning.
    config = init_config(
        model_args,
        extra_config_kwargs={
            "num_labels": num_labels,
            "finetuning_task": data_args.task_name,
        },
    )
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)
    check_sparsity_callback(model, model_args)

    # Tokenize and preprocess the datasets for the downstream task.
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(
        raw_datasets, tokenizer, data_args, model, num_labels, label_list,
        is_regression,
    )

    # MNLI uses "*_matched" split names instead of the plain ones.
    train_dataset = tokenized_datasets["train"]
    is_mnli = data_args.task_name == "mnli"
    eval_dataset = tokenized_datasets[
        "validation_matched" if is_mnli else "validation"
    ]

    test_dataset = None
    has_test_source = (
        data_args.task_name is not None or data_args.test_file is not None
    )
    if has_test_source and training_args.do_predict:
        test_dataset = tokenized_datasets["test_matched" if is_mnli else "test"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Leaving the collator as None defers to the default
    # (DataCollatorWithPadding); override only when padding was already done
    # or fp16 needs padding to a multiple of 8.
    data_collator = None
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    return (
        tokenizer,
        data_collator,
        train_dataset,
        eval_dataset,
        test_dataset,
        model,
        is_regression,
        tokenized_datasets,
        label_list,
        config,
    )
Exemplo n.º 3
0
def run_finetuning_single_task(model_args,
                               data_args,
                               training_args,
                               last_checkpoint=None):
    """On a single task train, evaluate, and save results.

    Args:
        model_args: Model options, forwarded to the ``init_*`` helpers;
            ``trainer_callbacks`` is passed to the trainer.
        data_args: Data options (``task_name``, ``test_file``,
            ``pad_to_max_length``).
        training_args: Training options (``do_train``, ``do_eval``,
            ``do_predict``, ``fp16``, ``output_dir``).
        last_checkpoint: Checkpoint forwarded to ``train`` when training.

    Returns:
        The result of ``evaluate_tasks`` when ``training_args.do_eval`` is
        set, otherwise an empty dict.
    """

    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning required to add labels and task name to config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)

    # Tokenizing and preprocessing the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer,
                                                  data_args, model, num_labels,
                                                  label_list, is_regression)

    # Separate into train, eval and test. MNLI uses "*_matched" split names.
    is_mnli = data_args.task_name == "mnli"
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if is_mnli else "validation"
    ]
    test_dataset = None
    if ((data_args.task_name is not None or data_args.test_file is not None)
            and training_args.do_predict):
        test_dataset = tokenized_datasets["test_matched" if is_mnli else "test"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Train
    # Reuse the model built above with finetuning=True instead of creating a
    # second one, so the model that is trained is the same one used for
    # preprocessing and the same one moved to CPU at the end of the run.
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=model,
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression)
    if training_args.do_train:
        train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        logging.info("*** Evaluate ***")

        # Handle special case of extra validation dataset for MNLI
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if is_mnli:
            tasks.append("mnli-mm")
            eval_datasets.append(tokenized_datasets["validation_mismatched"])

        eval_results = evaluate_tasks(trainer, training_args.output_dir, tasks,
                                      eval_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if is_mnli:
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # There is an existing issue on training multiple models in sequence in this code
    # There is a memory leakage on the model, a small amount of GPU memory remains after
    # the run and accumulates over several runs. It fails with OOM after about 20 runs,
    # even when all tensors on GPU are explicitly deleted, garbage is collected and
    # cache is cleared. Tried multiple solutions but this weird little hack is the only
    # thing that worked.
    model.to("cpu")

    return eval_results
Exemplo n.º 4
0
def run_pretraining(model_args,
                    data_args,
                    training_args,
                    last_checkpoint=None):
    """Pretrain and evaluate a language model.

    Either runs a hyperparameter search (when
    ``model_args.hp_num_trials >= 1``) or a regular training run, then
    evaluates on the validation split if requested.

    Args:
        model_args: Model options (``hp_num_trials``, ``trainer_class``,
            ``trainer_callbacks``), also forwarded to the ``init_*`` helpers.
        data_args: Data options (``data_collator``, ``mlm_probability``,
            ``save_tokenized_data``).
        training_args: Training options (``do_train``, ``do_eval``,
            ``output_dir``).
        last_checkpoint: Checkpoint forwarded to ``train`` when training.

    Raises:
        AssertionError: If ``data_args.data_collator`` is not an attribute of
            the ``transformers`` module.
    """

    logging.info("Pre-training a masked language model.")

    datasets, tokenized_datasets, dataset_path = init_datasets_mlm(data_args)

    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)

    if tokenized_datasets is None:
        # Tokenizing and preprocessing the datasets for language modeling
        if training_args.do_train:
            column_names = datasets["train"].column_names
        else:
            column_names = datasets["validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        logging.info("Tokenizing datasets for pretraining ...")
        tokenized_datasets = preprocess_datasets_mlm(datasets, tokenizer,
                                                     data_args, column_names,
                                                     text_column_name)

        # Save only if a dataset_path has been defined in the previous steps
        # that will be True only when loading from dataset hub
        if data_args.save_tokenized_data and dataset_path is not None:
            logging.info(f"Saving tokenized dataset to {dataset_path}")
            tokenized_datasets.save_to_disk(dataset_path)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will take care of randomly masking the tokens.
    # argument defined in experiment config
    assert hasattr(transformers, data_args.data_collator), \
        f"Data collator {data_args.data_collator} not available"
    data_collator = getattr(transformers, data_args.data_collator)(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Run hp search or regular training.
    # The hp search branch does not produce a trainer, so start from None
    # and let the evaluation step below detect that case.
    trainer = None
    if model_args.hp_num_trials >= 1:
        run_hyperparameter_search(
            model_args=model_args,
            config=config,
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )
    else:
        trainer = init_trainer(
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset if training_args.do_train else None,
            eval_dataset=eval_dataset if training_args.do_eval else None,
            model=init_model(model_args, config, tokenizer),
            trainer_class=model_args.trainer_class,
            trainer_callbacks=model_args.trainer_callbacks or None,
        )
        if training_args.do_train:
            train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate in full eval dataset.
    # TODO: if using hp search, load best model before running evaluate
    if training_args.do_eval:
        if trainer is None:
            # Without this guard the call below would fail on an unbound
            # ``trainer`` after a hyperparameter search.
            logging.warning(
                "Skipping evaluation: no trainer is available after "
                "hyperparameter search."
            )
        else:
            logging.info("*** Evaluate ***")
            evaluate_language_model(trainer, eval_dataset,
                                    training_args.output_dir)
Exemplo n.º 5
0
def init_dataset_for_finetuning(model_args,
                                data_args,
                                training_args,
                                last_checkpoint=None):
    """Prepare model, tokenizer, datasets and collator for a finetuning task.

    This variant supports several evaluation sets for MNLI, listed under the
    ``"eval_sets"`` key of ``training_args.trainer_mixin_args``.

    Args:
        model_args: Model options, forwarded to the ``init_*`` helpers.
        data_args: Data options (``task_name``, ``test_file``,
            ``pad_to_max_length``).
        training_args: Training options (``trainer_mixin_args``,
            ``do_predict``, ``fp16``).
        last_checkpoint: Accepted for interface parity; not used here.

    Returns:
        Tuple ``(tokenizer, data_collator, train_dataset, eval_dataset,
        test_dataset, model, is_regression, tokenized_datasets, label_list,
        config)``. ``eval_dataset`` is a single dataset when exactly one eval
        set was selected, otherwise a list of datasets.
    """
    # TODO
    # edit multi_eval_sets so you can gather not just multiple eval sets
    # for a single task, but eval sets from multiple tasks
    raw_datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(raw_datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # The config needs the label count and the task name for finetuning.
    config = init_config(
        model_args,
        extra_config_kwargs={
            "num_labels": num_labels,
            "finetuning_task": data_args.task_name,
        },
    )
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)
    check_sparsity_callback(model, model_args)
    check_mnli(model_args, data_args.task_name)

    # Tokenize and preprocess the datasets for the downstream task.
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(
        raw_datasets, tokenizer, data_args, model, num_labels, label_list,
        is_regression,
    )

    train_dataset = tokenized_datasets["train"]

    # Gather the eval sets. For now MNLI is the only task that may have
    # more than one.
    if data_args.task_name != "mnli":
        eval_sets = [tokenized_datasets["validation"]]
    elif "eval_sets" in training_args.trainer_mixin_args:
        eval_sets = [
            tokenized_datasets[split_name]
            for split_name in training_args.trainer_mixin_args["eval_sets"]
        ]
    else:
        eval_sets = [tokenized_datasets["validation_matched"]]

    # A single eval set is returned bare rather than wrapped in a list.
    eval_dataset = eval_sets[0] if len(eval_sets) == 1 else eval_sets

    test_dataset = None
    has_test_source = (
        data_args.task_name is not None or data_args.test_file is not None
    )
    if has_test_source and training_args.do_predict:
        test_split = "test_matched" if data_args.task_name == "mnli" else "test"
        test_dataset = tokenized_datasets[test_split]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Leaving the collator as None defers to the default
    # (DataCollatorWithPadding); override only when padding was already done
    # or fp16 needs padding to a multiple of 8.
    data_collator = None
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    return (
        tokenizer,
        data_collator,
        train_dataset,
        eval_dataset,
        test_dataset,
        model,
        is_regression,
        tokenized_datasets,
        label_list,
        config,
    )