Example #1
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if data_args.use_tfds:
        if data_args.version_2_with_negative:
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD. Switching to version 1 automatically."
            )

        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            )

        tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
        train_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=False) if training_args.do_train else None)
        eval_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=True) if training_args.do_eval else None)
    else:
        processor = SquadV2Processor(
        ) if data_args.version_2_with_negative else SquadV1Processor()
        train_examples = processor.get_train_examples(
            data_args.data_dir) if training_args.do_train else None
        eval_examples = processor.get_dev_examples(
            data_args.data_dir) if training_args.do_eval else None

    train_dataset = (squad_convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=True,
        return_dataset="tf",
    ) if training_args.do_train else None)

    if training_args.do_train:
        train_dataset = train_dataset.apply(
            tf.data.experimental.assert_cardinality(len(train_examples)))

    eval_dataset = (squad_convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=False,
        return_dataset="tf",
    ) if training_args.do_eval else None)

    if training_args.do_eval:
        eval_dataset = eval_dataset.apply(
            tf.data.experimental.assert_cardinality(len(eval_examples)))

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
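Example #1 relies on ModelArguments and DataTrainingArguments dataclasses defined elsewhere in the same script. Below is a minimal sketch of what they could look like, inferred only from the attributes the code above accesses; the field defaults are illustrative, not the script's actual values.

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelArguments:
    # Fields inferred from the model_args.* attributes used above.
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"})
    config_name: Optional[str] = field(default=None)
    tokenizer_name: Optional[str] = field(default=None)
    use_fast: bool = field(default=False)
    cache_dir: Optional[str] = field(default=None)


@dataclass
class DataTrainingArguments:
    # Fields inferred from the data_args.* attributes used above.
    data_dir: Optional[str] = field(default=None)
    use_tfds: bool = field(default=True)
    max_seq_length: int = field(default=384)
    doc_stride: int = field(default=128)
    max_query_length: int = field(default=64)
    version_2_with_negative: bool = field(default=False)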
Example #2
def run_squad_and_get_results(
    run_name: str,
    fsx_prefix: str,
    pre_layer_norm: bool,
    model_size: str,
    load_from: Union[str, tf.keras.Model],
    load_step: int,
    batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
    config: Optional[PretrainedConfig] = None,
) -> Dict:
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000

    if isinstance(load_from, tf.keras.Model):
        config = load_from.config
    assert config is not None, "config may not be None"

    # Instantiate QuestionAnswering model
    if isinstance(load_from, TFPreTrainedModel):
        model = load_qa_from_pretrained(model=load_from)
    elif load_from == "scratch":
        model = TFAutoModelForQuestionAnswering.from_config(config)
    elif load_from == "huggingface":
        model = load_qa_from_pretrained(name=f"albert-{model_size}-v2")
    else:
        raise ValueError(
            f"'load_from' is '{load_from}'; must be in ['scratch', 'huggingface', 'amazon']"
        )

    tokenizer = get_tokenizer()

    schedule = LinearWarmupLinearDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        optimizer, loss_scale="dynamic")

    model.call = wrap_tf_function_idempotent(model.call)

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        assert False, "--dataset must be one of ['squadv1', 'squadv2', 'debug']"

    data_dir = f"{fsx_prefix}/squad_data"

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        batch_size=batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        print("Starting finetuning")
        pbar = tqdm.tqdm(total_steps)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            batch_size=batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    wrapped_train_step = tf.function(train_step)
    for step, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = wrapped_train_step(
            model=model, optimizer=optimizer, batch=batch)

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = (step % checkpoint_frequency == 0) or is_final_step
            do_validate = (step % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                print("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}")
                print(description)
                print("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model,
                        data_dir=data_dir,
                        filename=val_filename,
                        batch_size=32,
                    )
                print_eval_metrics(results=results, step=step)

            if do_checkpoint:
                checkpoint_path = (
                    f"{fsx_prefix}/checkpoints/albert-squad/{run_name}-step{step}.ckpt"
                )
                print(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert-squad/{run_name}")
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision",
                                      val_precision,
                                      step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                    # And the eval metrics
                    tensorboard_eval_metrics(summary_writer=summary_writer,
                                             results=results,
                                             step=step)

        if is_final_step:
            break

    # Can we return a value only on a single rank?
    if hvd.rank() == 0:
        pbar.close()
        print(f"Finished finetuning, job name {run_name}")
        return results
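A hypothetical invocation of run_squad_and_get_results() is sketched below; every path and hyperparameter is illustrative, and Horovod must already be initialized because the function calls hvd.rank() internally.

import horovod.tensorflow as hvd
from transformers import AutoConfig

hvd.init()
config = AutoConfig.from_pretrained("albert-base-v2")  # required unless load_from is a tf.keras.Model
results = run_squad_and_get_results(
    run_name="albert-base-squadv2",       # illustrative run name
    fsx_prefix="/fsx",                    # illustrative storage prefix
    pre_layer_norm=False,
    model_size="base",
    load_from="huggingface",
    load_step=0,
    batch_size=8,
    checkpoint_frequency=None,            # falls back to the function's 1_000_000 default
    validate_frequency=None,
    learning_rate=3e-5,
    warmup_steps=500,
    total_steps=8000,
    dataset="squadv2",
    config=config,
)
print(results["exact"], results["f1"])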
Example #3
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        "cached_distillation_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)

        try:
            features, dataset, examples = (
                features_and_dataset["features"],
                features_and_dataset["dataset"],
                features_and_dataset["examples"],
            )
        except KeyError:
            raise DeprecationWarning(
                "You seem to be loading features from an older version of this script please delete the "
                "file %s in order for it to be created again" %
                cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        processor = SquadV2Processor(
        ) if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir,
                                                  filename=args.predict_file)
        else:
            examples = processor.get_train_examples(args.data_dir,
                                                    filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
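For context, a short sketch of how the returned TensorDataset is usually consumed; the sampler and batch size are assumptions, and a distributed run would use a DistributedSampler instead.

from torch.utils.data import DataLoader, RandomSampler

# `args` and `tokenizer` are assumed to come from the surrounding training script.
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)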
Example #4
def predict(model_prefix, probes_dir, preds_dir, data_dir, data_file, layers,
            batch_size, hidden_dim, max_seq_length, device):

    # Extract examples
    tokenizer = AutoTokenizer.from_pretrained(model_prefix)
    processor = SquadV2Processor()
    dev_examples = processor.get_dev_examples(data_dir=data_dir,
                                              filename=data_file)

    # Extract dev features
    print("Loading dev features")
    dev_features, dev_dataset = squad_convert_examples_to_features(
        examples=dev_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1)

    # Initialize config and model
    config = AutoConfig.from_pretrained(model_prefix,
                                        output_hidden_states=True)
    model = AutoModelForQuestionAnswering.from_pretrained(model_prefix,
                                                          config=config)

    # multi-gpu evaluate
    model = torch.nn.DataParallel(model)

    # Load probe for each layer
    print("Loading probes")
    probes = []
    for i in range(layers):
        p = Probe(hidden_dim)
        p.load(probes_dir, i + 1, device)
        probes.append(p)

    # Extract IDs
    print("Extracting dev IDs")
    n = len(dev_examples)
    q_ids = []
    for i in range(n):
        q_ids.append(dev_examples[i].qas_id)

    # Initialize dev data loader
    eval_sampler = SequentialSampler(dev_dataset)
    eval_dataloader = DataLoader(dev_dataset,
                                 sampler=eval_sampler,
                                 batch_size=batch_size)

    # Initialize predictions
    predictions = []
    for i in range(layers):
        pred = pd.DataFrame()
        pred['Id'] = q_ids
        pred['Predicted'] = [""] * len(dev_examples)
        pred['Question'] = [""] * len(dev_examples)
        pred['Score'] = [0] * len(dev_examples)
        predictions.append(pred)

    # Track how many unique questions we've seen in each dataframe; questions with contexts
    # longer than max_seq_length get split into multiple features based on doc_stride.
    # A good alternative we may implement later: record all features, then reduce with groupby/max,
    # e.g. df.sort_values('Score', ascending=False).drop_duplicates(['Question'])
    question_ids = [0] * layers

    # Evaluation batches
    print("Predicting on dev set")
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # DistilBERT models do not use token type ids
            if "distil" in model_prefix:
                inputs.pop('token_type_ids')

            # ALBERT/BERT/DistilBERT forward pass
            idx = batch[3]
            outputs = model(**inputs)
            attention_hidden_states = outputs[2][1:]

            # Compute prediction on eval indices
            for j, index in enumerate(idx):
                index = int(index.item())

                # Extract tokens for the current batch
                tokens = tokenizer.convert_ids_to_tokens(batch[0][j])

                # Find where context starts and ends, since we want to predict in context
                context_start = int(max_seq_length - torch.argmax(
                    torch.flip(batch[2][j], [0])).item()) - 1
                context_end = int(torch.argmax(batch[2][j]).item())

                # Find the question, starting right after [CLS] and subtracting 1 to chop off the [SEP] token
                question_start = 1
                question_end = context_start
                question = tokenizer.convert_tokens_to_string(
                    tokens[question_start:question_end - 1])

                # For each layer ...
                for i, p in enumerate(probes):

                    # Extract predicted indices
                    score, start_idx, end_idx = p.predict(
                        attention_hidden_states[i][j].unsqueeze(0),
                        device,
                        threshold=0,
                        context_start=context_start,
                        context_end=context_end)
                    start_idx = int(start_idx[0])
                    end_idx = int(end_idx[0])

                    # Extract predicted answer, converting start tokens to empty strings (no answer)
                    answer = tokenizer.convert_tokens_to_string(
                        tokens[start_idx:end_idx + 1])
                    if answer == '[CLS]':
                        answer = ''

                    # Check whether the question is the same as the previous one; if so, go back to the
                    # previous question id and keep the higher score. If the question is not already in
                    # the dataframe, assign it. The first question is stored directly since it cannot be a duplicate.
                    if question_ids[i] == 0:
                        predictions[i].loc[question_ids[i],
                                           'Question'] = question
                        predictions[i].loc[question_ids[i],
                                           'Predicted'] = answer
                        predictions[i].loc[question_ids[i], 'Score'] = score

                    elif (predictions[i].loc[int(question_ids[i] - 1),
                                             'Question'] == question):
                        question_ids[i] -= 1
                        old_score = predictions[i].loc[question_ids[i],
                                                       'Score']
                        if score > old_score:
                            predictions[i].loc[question_ids[i],
                                               'Predicted'] = answer
                            predictions[i].loc[question_ids[i],
                                               'Score'] = score
                    else:
                        predictions[i].loc[question_ids[i],
                                           'Question'] = question
                        predictions[i].loc[question_ids[i],
                                           'Predicted'] = answer
                        predictions[i].loc[question_ids[i], 'Score'] = score

                    # Increment to new question id (note, for duplicate answers this gets us back to where we were)
                    question_ids[i] += 1

    # Save predictions for each layer
    print("Saving predictions")
    if not os.path.exists(preds_dir):
        os.mkdir(preds_dir)

    for i, pred in enumerate(predictions):
        pred[['Id',
              'Predicted']].to_csv(preds_dir + "/layer_" + str(i + 1) + ".csv",
                                   index=False)
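A hypothetical call to predict() might look like the following; the model name, probe/prediction directories, and dimensions are placeholders for whatever the probing setup actually uses.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
predict(
    model_prefix="albert-base-v2",   # any SQuAD-style QA checkpoint; hidden_dim/layers must match it
    probes_dir="probes",
    preds_dir="preds",
    data_dir="squad_data",
    data_file="dev-v2.0.json",
    layers=12,
    batch_size=8,
    hidden_dim=768,
    max_seq_length=384,
    device=device,
)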
Example #5
def load_and_cache_examples(data_dir: Path, tokenizer, task, max_seq_length, doc_stride, max_query_length, evaluate=False, model_name=None):
    if (task == "SQuAD1.1"):
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
        train_file = "train-v1.1.json"
        validation_file = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif (task == "SQuAD2.0"):
        train_url = "https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/squad/v2.0/train-v2.0-short.json"
        validation_url = "https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/squad/v2.0/dev-v2.0-short.json"
        train_file = "train-v2.0.json"
        validation_file = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        raise ValueError("Incompatible dataset detected")

    if not data_dir.exists():
        data_dir.mkdir(parents=True)
    if evaluate:
        # TODO: Cache instead of always downloading
        with urllib.request.urlopen(validation_url) as url:
            val_path = data_dir / validation_file
            with val_path.open('w') as f:
                f.write(url.read().decode())

    else:
        with urllib.request.urlopen(train_url) as url:
            train_path = data_dir / train_file
            with train_path.open('w') as f:
                f.write(url.read().decode())

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        str(data_dir.absolute()),
        "cache_{}_{}".format(
            "dev" if evaluate else "train",
            model_name,
        ),
    )

    # Init features and dataset from cache if it exists
    overwrite_cache = False  # Set to True to do a cache wipe (TODO: Make cache wipe configurable)
    if os.path.exists(cached_features_file) and not overwrite_cache:
        print("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        if evaluate:
            examples = processor.get_dev_examples(data_dir, filename=validation_file)
        else:
            examples = processor.get_train_examples(data_dir, filename=train_file)
        features, dataset = squad_convert_examples_to_features(
                examples=examples,
                tokenizer=tokenizer,
                max_seq_length=max_seq_length,
                doc_stride=doc_stride,
                max_query_length=max_query_length,
                is_training=not evaluate,
                return_dataset="pt",
        )
        print("Saving features into cached file %s", cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
    return dataset, examples, features
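An illustrative call to this loader; the tokenizer choice and sequence-length settings are assumptions, not values from the original code.

from pathlib import Path
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
dataset, examples, features = load_and_cache_examples(
    data_dir=Path("squad_data"),
    tokenizer=tokenizer,
    task="SQuAD2.0",
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    evaluate=True,
    model_name="bert-base-uncased",
)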
Example #6
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            
            # TyDi-specific
            if args.leave_out_languages is not None:
                logger.info("Creating temporary training file at %s", args.data_dir)
                leave_languages = args.leave_out_languages.split(',')
                with open(os.path.join(args.data_dir, args.train_file), "r", encoding="utf-8") as reader:
                    input_data = json.load(reader)
                tmp_data = {}
                tmp_data['data'] = []
                for k in input_data.keys():
                    if k != 'data':
                        tmp_data[k] = input_data[k]
                left_out_count = 0
                for entry in input_data['data']:
                    paragraph = entry["paragraphs"][0]   #only one paragraph per entry
                    qa = paragraph["qas"][0]             #single question is sufficient to determine the language 
                    lang = qa['id'].split('-')[0]
                    if lang not in leave_languages:
                        tmp_data['data'].append(entry)
                    else:
                        left_out_count += 1
                logger.info("No. of training examples left out %d", left_out_count)

                tmp_filename = args.train_file[:-5]
                for lang in leave_languages:
                    tmp_filename += '-'+lang
                tmp_filename += '.json'
                with open(os.path.join(args.data_dir, tmp_filename), 'w', encoding='utf-8') as writer:
                    json.dump(tmp_data, writer)

            if args.train_on_languages is not None:
                logger.info("Creating temporary training file at %s", args.data_dir)
                keep_languages = args.train_on_languages.split(',')
                with open(os.path.join(args.data_dir, args.train_file), "r", encoding="utf-8") as reader:
                    input_data = json.load(reader)
                tmp_data = {}
                tmp_data['data'] = []
                for k in input_data.keys():
                    if k != 'data':
                        tmp_data[k] = input_data[k]
                left_out_count = 0
                keep_count = 0
                for entry in input_data['data']:
                    paragraph = entry["paragraphs"][0]   #only one paragraph per entry
                    qa = paragraph["qas"][0]             #single question is sufficient to determine the language 
                    lang = qa['id'].split('-')[0]
                    if lang in keep_languages:
                        tmp_data['data'].append(entry)
                        keep_count += 1
                    else:
                        left_out_count += 1
                logger.info("No. of training examples left out %d", left_out_count)
                logger.info("No. of training examples kept %d", keep_count)

                tmp_filename = args.train_file[:-5]
                for lang in keep_languages:
                    tmp_filename += '-keep-'+lang
                tmp_filename += '.json'
                with open(os.path.join(args.data_dir, tmp_filename), 'w', encoding='utf-8') as writer:
                    json.dump(tmp_data, writer) 

            if evaluate:
                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
            else:
                if args.leave_out_languages is not None:
                    args.train_file = tmp_filename
                if args.train_on_languages is not None:
                    args.train_file = tmp_filename
                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
Example #7
def data_save(args, tokenizer, evaluate=False):

    data_config = DataConfig(
        endpoint="127.0.0.1:9000",
        access_key="minio",
        secret_key="miniosecretkey",
        dataset_name="SQuAD1.1"
        if not args.version_2_with_negative else "SQuAD2.0",
        additional={
            "mode": "train" if not evaluate else "test",
            "framework": "pytorch",
            "version": 1.1 if not args.version_2_with_negative else 2.0,
            "model_name": args.tokenizer_name,
            "doc_stride": args.doc_stride,
            "max_seq_length": args.max_seq_length,
            "max_query_length": args.max_query_length,
        },
        attributes=train_attributes if not evaluate else eval_attributes,
    )

    processor = SquadV2Processor(
    ) if args.version_2_with_negative else SquadV1Processor()
    if evaluate:
        examples = processor.get_dev_examples(None, filename=args.predict_file)
    else:
        examples = processor.get_train_examples(None, filename=args.train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )

    data_saver = DataSaver(config=data_config)
    dataloader = DataLoader(dataset, batch_size=8, num_workers=args.threads)

    if not evaluate:
        for batch in tqdm(dataloader):
            inputs = {
                "all_input_ids": batch[0],
                "all_attention_masks": batch[1],
                "all_token_type_ids": batch[2],
                "all_start_positions": batch[3],
                "all_end_positions": batch[4],
                "all_cls_index": batch[5],
                "all_p_mask": batch[6],
                "all_is_impossible": batch[7]
            }
            data_saver(inputs)
    else:
        for batch in tqdm(dataloader):
            inputs = {
                "all_input_ids": batch[0],
                "all_attention_masks": batch[1],
                "all_token_type_ids": batch[2],
                "all_feature_index": batch[3],
                "all_cls_index": batch[4],
                "all_p_mask": batch[5],
            }
            data_saver(inputs)
        _features, _examples = tempfile.mktemp("features"), tempfile.mktemp(
            "examples")
        torch.save(features, _features)
        torch.save(examples, _examples)
        data_saver({
            "features": _features,
            "examples": _examples
        },
                   filetype=True)

    data_saver.disconnect()
Example #8
def load_combined_examples(args, evaluate=False):
    """
    Deprecated sadly
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if not args.data_dir and ((evaluate and not args.predict_file) or
                              (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            )

        if args.version_2_with_negative:
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD.")
        logger.warning("Something went wrong!")
        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor(
        ) if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir,
                                                  filename=args.predict_file)
            # Sanity check for loading the correct example
            assert examples[
                0].question_text == 'In what country is Normandy located?', 'Invalid dev file!'
        else:
            # Normal get train examples
            examples = processor.get_train_examples(args.data_dir,
                                                    filename=args.train_file)
            # Sanity check for loading the correct example
            assert examples[
                0].question_text == 'When did Beyonce start becoming popular?', 'Invalid train file!'

    assert args.saved_processed_data_dir, 'args.saved_processed_data_dir not defined!'
    ensemble_dir = args.saved_processed_data_dir

    if evaluate:
        with open(os.path.join(ensemble_dir, 'saved_data_dev.pkl'), 'rb') as f:
            saved_data = pickle.load(f)
    else:
        with open(os.path.join(ensemble_dir, 'saved_data_train.pkl'),
                  'rb') as f:
            saved_data = pickle.load(f)
    # saved_data: [features, all_results, tokenizer]
    features, combined_all_results, tokenizer = saved_data
    assert np.array_equal(
        [f.start_position for f in features[0]],
        [f.start_position for f in features[1]],
    ), "Expected identical features for models from the same family"

    # Same family same feature and tokenizer, so we pick the first one
    features = features[0]
    tokenizer = tokenizer[0]
    all_predict_start_logits = []
    all_predict_end_logits = []
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)

    for all_results in combined_all_results:
        all_predict_start_logits.append(
            torch.tensor([s.start_logits for s in all_results],
                         dtype=torch.float))
        all_predict_end_logits.append(
            torch.tensor([s.end_logits for s in all_results],
                         dtype=torch.float))

    if evaluate:
        all_example_indices = torch.arange(all_input_ids.size(0),
                                           dtype=torch.long)

    all_predict_start_logits = torch.stack(all_predict_start_logits).permute(
        1, 0, 2)
    all_predict_end_logits = torch.stack(all_predict_end_logits).permute(
        1, 0, 2)

    # print(f'all_input_ids: {all_input_ids.shape}, all_predict_start_logits{all_predict_start_logits.shape}, all_predict_end_logits:{all_predict_end_logits.shape}')

    if evaluate:
        dataset = TensorDataset(all_predict_start_logits,
                                all_predict_end_logits, all_example_indices)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        # print(all_start_positions.shape, all_end_positions.shape)
        dataset = TensorDataset(all_predict_start_logits,
                                all_predict_end_logits, all_start_positions,
                                all_end_positions)
    if evaluate:
        assert len(examples) == 6078
    else:
        assert len(examples) == 130319

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()
    return examples, features, dataset, tokenizer, len(combined_all_results)
Example #9
    def __init__(
        self,
        args,
        tokenizer: AutoTokenizer,
        mode: Optional[str] = "train",
        is_language_sensitive: Optional[bool] = False,
        cache_dir: Optional[str] = None,
        dataset_format: Optional[str] = "pt",
        #  threads: Optional[int] = 1,
        threads: Optional[int] = 8,
        debug: Optional[bool] = False,
    ):

        self.args = args
        self.tokenizer = tokenizer
        self.is_language_sensitive = is_language_sensitive
        self.processor = SquadV2Processor(
        ) if args.version_2_with_negative else SquadV1Processor()
        self.mode = mode
        self.debug = debug
        self.threads = threads

        self.max_seq_length = self.args.max_seq_length
        self.doc_stride = self.args.doc_stride
        self.max_query_length = self.args.max_query_length

        # dataset format configurations
        self.column_names = ["id", "title", "context", "question", "answers"]
        self.question_column_name = "question" if "question" in self.column_names else self.column_names[
            0]
        self.context_column_name = "context" if "context" in self.column_names else self.column_names[
            1]
        self.answer_column_name = "answers" if "answers" in self.column_names else self.column_names[
            2]

        # Padding side determines if we do (question|context) or (context|question).
        self.pad_on_right = tokenizer.padding_side == "right"

        # load data features from cache or dataset file
        version_tag = "v2" if args.version_2_with_negative else "v1"
        # print(args.data_dir)
        # print(tokenizer.__class__.__name__)
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else args.data_dir,
            "cached_{}_{}_{}_{}".format(mode, tokenizer.__class__.__name__,
                                        str(args.max_seq_length), version_tag))

        self.cached_data_file = cached_features_file

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            self.old_features = torch.load(cached_features_file)

            # legacy cache files have only features,
            # which new cache files will have dataset and examples also.
            self.features = self.old_features["features"]
            self.dataset = self.old_features.get("dataset", None)
            self.examples = self.old_features.get("examples", None)

            if self.dataset is None or self.examples is None:
                raise ValueError(
                    "Legacy cache file is missing 'dataset' or 'examples'; "
                    "delete it or set overwrite_cache to rebuild the cache.")
        else:
            if self.mode == "dev":
                self.examples = self.processor.get_dev_examples(args.data_dir)
            else:
                self.examples = self.processor.get_train_examples(
                    args.data_dir)

            if self.debug:
                print(f"DEBUG INFO -> already load {self.mode} data ...")
                print(f"DEBUG INFO -> show 2 EXAMPLES ...")
                for idx, data_examples in enumerate(self.examples):
                    # data_examples should be an object of transformers.data.processors.squad.SquadExample
                    if idx <= 2:
                        print(f"DEBUG INFO -> {idx}, {data_examples}")
                        print(f"{idx} qas_id -> {data_examples.qas_id}")
                        print(
                            f"{idx} question_text -> {data_examples.question_text}"
                        )
                        print(
                            f"{idx} context_text -> {data_examples.context_text}"
                        )
                        print(
                            f"{idx} answer_text -> {data_examples.answer_text}"
                        )
                        print("-*-" * 10)

            self.features, self.dataset = squad_convert_examples_to_features(
                examples=self.examples,
                tokenizer=tokenizer,
                max_seq_length=self.max_seq_length,
                doc_stride=self.doc_stride,
                max_query_length=self.max_query_length,
                is_training=mode == "train",
                threads=self.threads,
                return_dataset=dataset_format,
            )

            torch.save(
                {
                    "features": self.features,
                    "dataset": self.dataset,
                    "examples": self.examples
                },
                cached_features_file,
            )
Example #10
def get_evaluation_metrics(
    model, data_dir: str, filename: str, batch_size: int = 32, num_batches: int = None,
) -> Dict[str, "Number"]:
    """
    Return an OrderedDict in the format:
    {
    'exact': 0.8169797018445212,
    'f1': 4.4469722448269335,
    'total': 11873,
    'HasAns_exact': 0.15182186234817813,
    'HasAns_f1': 7.422216845956518,
    'HasAns_total': 5928,
    'NoAns_exact': 1.4802354920100924,
    'NoAns_f1': 1.4802354920100924,
    'NoAns_total': 5945,
    'best_exact': 50.07159100480081,
    'best_exact_thresh': 0.0,
    'best_f1': 50.0772059855695,
    'best_f1_thresh': 0.0
    }
    """
    # These are not used in inference, only for scoring in `compute_predictions_logits()`.
    processor = SquadV2Processor()
    tokenizer = get_tokenizer()
    examples: List[SquadExample] = processor.get_dev_examples(data_dir, filename=filename)
    features: List[SquadFeatures] = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=filename,
        batch_size=batch_size,
        shard=False,
        shuffle=False,
        drop_remainder=False,
        return_raw_features=True,
    )

    # Here we get the dataset instead of just the features, with return_raw_features=False.
    dataset: tf.data.Dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=filename,
        batch_size=batch_size,
        shard=False,
        shuffle=False,
        drop_remainder=False,
        return_raw_features=False,
    )
    results: List[SquadResult] = get_squad_results(
        model=model,
        dataset=dataset,
        features=features,
        batch_size=batch_size,
        num_batches=num_batches,
    )

    write_prediction_files = False
    if write_prediction_files:
        output_predictions_file = f"/fsx/{args.checkpoint}_predictions.json"
        output_nbest_file = f"/fsx/{args.checkpoint}_nbest_predictions.json"
        output_null_log_odds_file = f"/fsx/{args.checkpoint}_null_odds.json"
    else:
        output_predictions_file = None
        output_nbest_file = None
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        output_prediction_file=output_predictions_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=output_null_log_odds_file,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,
    )

    results: collections.OrderedDict = squad_evaluate(examples, predictions)
    return results
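Illustrative usage of get_evaluation_metrics(); the model variable and paths stand in for whatever the calling script has loaded, and the metric keys are those listed in the docstring above.

metrics = get_evaluation_metrics(
    model=model,                 # a TF question-answering model loaded by the caller
    data_dir="/fsx/squad_data",  # illustrative data location
    filename="dev-v2.0.json",
    batch_size=32,
)
print(f"EM: {metrics['exact']:.2f}, F1: {metrics['f1']:.2f}")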
Example #11
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            use_aug_path=False,
                            output_examples=False
                            ) -> torch.utils.data.TensorDataset:
    """Loads SQuAD-like data features from dataset file (or cache)

    Parameters
    ----------
    args : kitanaqa.trainer.arguments.ModelArguments
        A set of arguments related to the model. Specifically, the following arguments are used in this function:
        - args.train_file_path : str
            Path to the training data file
        - args.do_aug : bool
            Flag to specify whether to use the augmented training set. If True, will be merged with the original training set specified in train_file_path. The default value is False.
        - args.aug_file_path : str
            Path for augmented train dataset
        - args.data_dir : str
            Path for data files
        - args.model_name_or_path : str
            Path to pretrained model or model identifier from huggingface.co/models
        - args.max_seq_length : Optional[int]
            Max length for the input tokens, specified to the Transformer model defined in `model_name_or_path`
        - args.overwrite_cache : Bool
            Overwrite cached data on load
        - args.predict_file_path : Dict[str, str]
            Paths for eval datasets, where the key is the data file tag, and the value is the data file path. Multiple file paths may be given for evaluation, and each will be cached and loaded separately.
        - args.version_2_with_negative : Bool
            Flag that specifies to use the SQuAD v2.0 preprocessors. The default value is False.
        - args.doc_stride : Optional[int]
            Corresponds to the doc_stride input param for some Huggingface Transformer models.
        - args.max_query_length : Optional[int]
              Max length for the query segment in the Transformer model input.
    tokenizer : 
        The Transformer model tokenizer used to preprocess the data.
    evaluate : Optional(Bool)
        A flag to set the trainer task to either train or evaluate. The default value is False.
    use_aug_path : Optional(Bool)
        A flag to define whether to use the aug_file_path or the train_file_path. If True, the augmented data path is used when loading and caching the data.
    output_examples : Optional(Bool)
        A flag to define whether the examples and features should be returned by the data preprocessor. If False, the preprocessor only returns the dataset. This is necessary if the Trainer is used for evaluation or in a pipeline where training is followed by evaluation.

    Returns
    -------
    torch.utils.data.TensorDataset
        The dataset containing the data to be used for training or evaluation.
        Important Notes:
        - If the output_examples is True, examples and features also are returned.
        - If evaluate = True, the output will be a dictionary for which the keys are the name of the datasets used for evaluation and the values are the dataset (and optionally the examples and features).
        
    """

    if not args.train_file_path and not (args.do_aug and args.aug_file_path):
        logging.error(
            'load_and_cache_examples requires one of either "train_file_path" or "aug_file_path"'
        )

    # Use the augmented data or the original training data
    train_or_aug_path = args.train_file_path if not use_aug_path else args.aug_file_path

    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file_path) or
                                  (not evaluate and not train_or_aug_path)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                )

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            if evaluate:
                # when does it concatenate if eval and train are both true?
                examples = {}
                processor = AlumSquadV2Processor(
                ) if args.version_2_with_negative else AlumSquadV1Processor()
                for predict_sets, predict_paths in args.predict_file_path.items(
                ):
                    examples[predict_sets] = processor.alum_get_dev_examples(
                        args.data_dir, filename=predict_paths)
                    logger.info("Evaluation Data is fetched for %s.",
                                predict_sets)
            else:
                processor = SquadV2Processor(
                ) if args.version_2_with_negative else SquadV1Processor()
                examples = processor.get_train_examples(
                    args.data_dir, filename=train_or_aug_path)

        if not evaluate:
            features, dataset = squad_convert_examples_to_features(
                examples=examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=not evaluate,
                return_dataset="pt",
                #threads=args.threads,
            )

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

        else:
            #TODO: Incremental Cache - The current version will cache all the eval files together.
            features, dataset = {}, {}
            for predict_sets, example in examples.items():
                features[predict_sets], dataset[
                    predict_sets] = alum_squad_convert_examples_to_features(
                        examples=example,
                        tokenizer=tokenizer,
                        max_seq_length=args.max_seq_length,
                        doc_stride=args.doc_stride,
                        max_query_length=args.max_query_length,
                        return_dataset="pt",
                        #threads=args.threads,
                    )
                logger.info(
                    "Feature Extraction for Evaluation Data from %s is Finished.",
                    predict_sets)
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
Example #12
def run_prediction(model, question_texts, context_text):
    """Setup function to compute predictions"""

    processor = SquadV2Processor()
    config = model.model.config
    tokenizer = model.tokenizer
    examples = []

    model = model.model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    if not os.path.exists("predictions"):
        os.mkdir("predictions")

    output_prediction_file = "predictions/predictions.json"
    output_nbest_file = "predictions/nbest_predictions.json"
    output_null_log_odds_file = "predictions/null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    return predictions
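A minimal usage sketch for the run_prediction() above (not part of the original script): the function expects a wrapper object exposing .model and .tokenizer, and reads a few module-level names; the checkpoint path and settings below are hypothetical, illustrative values.

from types import SimpleNamespace

from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Illustrative values for the module-level settings run_prediction() relies on.
n_best_size, max_answer_length = 5, 30
do_lower_case, null_score_diff_threshold = True, 0.0

def to_list(tensor):
    return tensor.detach().cpu().tolist()

checkpoint = "path/to/finetuned-squad-checkpoint"  # hypothetical path
wrapper = SimpleNamespace(
    # return_dict=False so the forward pass returns a tuple, as the loop above expects
    model=AutoModelForQuestionAnswering.from_pretrained(checkpoint, return_dict=False),
    tokenizer=AutoTokenizer.from_pretrained(checkpoint, use_fast=False),
)

questions = ["Who wrote the report?", "When was it published?"]
context = "The report was written by Jane Doe and published in 1998."
predictions = run_prediction(wrapper, questions, context)  # dict: qas_id -> answer text
for qas_id, answer in predictions.items():
    print(qas_id, answer)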
Example #13
def run_prediction(question_texts, context_text, model_path):
    ### Setting hyperparameters
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0

    # model_name_or_path = "../cuad-models/roberta-base/"

    def to_list(tensor):
        return tensor.detach().cpu().tolist()

    config_class, model_class, tokenizer_class = (
        AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer)
    config = config_class.from_pretrained(model_path)
    tokenizer = tokenizer_class.from_pretrained(
        model_path, do_lower_case=True, use_fast=False)
    model = model_class.from_pretrained(model_path, config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    processor = SquadV2Processor()
    examples = []

    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs.to_tuple()]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=tokenizer
    )

    return final_predictions
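A usage sketch for this variant (the checkpoint path, questions, and context are hypothetical placeholders):

model_path = "path/to/cuad-or-squad2-checkpoint"  # hypothetical path
questions = [
    "Which state's law governs the agreement?",
    "How much notice is required to terminate?",
]
context = ("This Agreement is governed by the laws of the State of New York. "
           "Either party may terminate it upon thirty days' written notice.")

predictions = run_prediction(questions, context, model_path)
# compute_predictions_logits returns an OrderedDict keyed by qas_id ("0", "1", ...)
for i, question in enumerate(questions):
    print(question, "->", predictions[str(i)] or "<no answer>")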
def load_and_cache_examples(args,
                            tokenizer,
                            prefix,
                            evaluate=False,
                            output_examples=False,
                            gpt=False):
    """
    Loads the training file from the SQuAD2.0 dataset and splits it into 90% : 10% train : test splits.
    It caches and return the dataset and features for each split.
    """
    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file_train = os.path.join(
        input_dir,
        "cached_{}_{}_{}_train".format(
            prefix,
            list(filter(None, args.model_name.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )
    cached_features_file_dev = os.path.join(
        input_dir,
        "cached_{}_{}_{}_dev".format(
            prefix,
            list(filter(None, args.model_name.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if (os.path.exists(cached_features_file_train) and not args.overwrite_cache
            and not evaluate):
        logger.info("Loading features from cached file: {}".format(
            cached_features_file_train))
        features_and_dataset = torch.load(cached_features_file_train)
        train_features, train_ds, train_examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    elif (os.path.exists(cached_features_file_dev) and not args.overwrite_cache
          and evaluate):
        logger.info("Loading features from cached file: {}".format(
            cached_features_file_dev))
        features_and_dataset = torch.load(cached_features_file_dev)
        dev_features, dev_ds, dev_examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )

    # Preprocess examples into features if not already in cache
    else:
        logger.info("Creating features from dataset file at %s", input_dir)
        if not args.data_dir and not args.train_file:
            raise ValueError("Please specify --data_dir or --train_file")
        else:
            processor = SquadV2Processor()
            examples = processor.get_train_examples(args.data_dir,
                                                    filename=args.train_file)

        train_examples = examples[:len(examples) * 9 // 10]
        dev_examples = examples[len(examples) * 9 // 10:]

        train_features = squad_convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True,
            threads=args.threads,
            gpt=gpt)
        train_ds = feats_to_ds(train_features)

        logger.info("Saving train features into cached file %s",
                    cached_features_file_train)
        torch.save(
            {
                "features": train_features,
                "dataset": train_ds,
                "examples": train_examples
            }, cached_features_file_train)

        dev_features = squad_convert_examples_to_features(
            examples=dev_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True,
            threads=args.threads,
            gpt=gpt)
        dev_ds = feats_to_ds(dev_features)

        logger.info("Saving dev features into cached file %s",
                    cached_features_file_dev)
        torch.save(
            {
                "features": dev_features,
                "dataset": dev_ds,
                "examples": dev_examples
            }, cached_features_file_dev)

    if evaluate:
        if output_examples:
            return dev_ds, dev_examples, dev_features
        else:
            return dev_ds
    else:
        if output_examples:
            return train_ds, train_examples, train_features
        else:
            return train_ds
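A sketch of how this loader might be invoked (argument values are illustrative; note that it depends on a customized squad_convert_examples_to_features that accepts a gpt keyword and on the feats_to_ds() helper defined elsewhere in the same source):

from argparse import Namespace

from transformers import AutoTokenizer

args = Namespace(
    data_dir="data/squad",            # directory containing the SQuAD 2.0 train file
    train_file="train-v2.0.json",
    model_name="bert-base-uncased",   # only used to name the cache files
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    threads=4,
    overwrite_cache=False,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=False)

# 90% split, cached as cached_<prefix>_<model>_<max_seq_length>_train
train_ds = load_and_cache_examples(args, tokenizer, prefix="squadv2", evaluate=False)
# 10% split, cached as cached_<prefix>_<model>_<max_seq_length>_dev
dev_ds, dev_examples, dev_features = load_and_cache_examples(
    args, tokenizer, prefix="squadv2", evaluate=True, output_examples=True)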
Example #15
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError(
                    "If no data_dir is specified, tensorflow_datasets needs to be installed."
                )

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
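The dataset returned above is a torch TensorDataset. A minimal sketch of how it is typically consumed for fine-tuning, assuming args and tokenizer are already set up as for the loader above (the hyperparameters are illustrative):

import torch
from torch.utils.data import DataLoader, RandomSampler
from transformers import AutoModelForQuestionAnswering

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForQuestionAnswering.from_pretrained(args.model_name_or_path).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=8)

model.train()
for batch in train_dataloader:
    batch = tuple(t.to(device) for t in batch)
    # With is_training=True the TensorDataset holds, in order: input_ids, attention_mask,
    # token_type_ids, start_positions, end_positions, cls_index, p_mask, is_impossible.
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "token_type_ids": batch[2],
        "start_positions": batch[3],
        "end_positions": batch[4],
    }
    loss = model(**inputs)[0]
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()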
Example #16
                                       reporthook=t.update_to)
    else:
        # Simple download with no progress bar
        urllib.request.urlretrieve(url, output_path)


for d in ('train', 'dev', 'test'):
    url = 'https://github.com/chrischute/squad/data/{}-v2.0.json'.format(d)
    output_path = url_to_data_path(url)
    if not os.path.exists(output_path):
        print(f'Downloading {d}...')
        download_url(url, output_path)

print("Creating features from dataset file at {}".format(input_dir))

processor = SquadV2Processor(
) if version_2_with_negative else SquadV1Processor()
for d in ('train', 'dev', 'test'):
    evaluate = d != 'train'
    cached_features_file = '.data/cached_{}'.format(d)
    get_examples = (processor.get_train_examples
                    if d == 'train' else processor.get_dev_examples)
    examples = get_examples(data_dir, filename='{}-v2.0.json'.format(d))

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=threads,
    )

from transformers import (WEIGHTS_NAME, BertConfig, BertForQuestionAnswering, BertTokenizer)
from torch.utils.data import (DataLoader, SequentialSampler)

# Load pretrained model and tokenizer
config_class, model_class, tokenizer_class = (BertConfig, BertForQuestionAnswering, BertTokenizer)
config = config_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir)
model = model_class.from_pretrained(model_name_or_path,
                                    from_tf=False,
                                    config=config,
                                    cache_dir=cache_dir)
# load_and_cache_examples
from transformers.data.processors.squad import SquadV2Processor

processor = SquadV2Processor()
examples = processor.get_dev_examples(None, filename=predict_file)

from transformers import squad_convert_examples_to_features
features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=False,
    return_dataset='pt',
)

cached_features_file = os.path.join(cache_dir, 'cached_{}_{}_{}'.format(
        'dev',
Example #18
def run_squad_and_get_results(
    model: tf.keras.Model,  # Must be QuestionAnswering model, not PreTraining
    tokenizer: PreTrainedTokenizer,
    run_name: str,
    filesystem_prefix: str,
    per_gpu_batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    evaluate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
) -> Dict:
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000
    evaluate_frequency = evaluate_frequency or 1000000
    is_sagemaker = filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        assert False, "--dataset must be one of ['squadv1', 'squadv2', 'debug']"

    data_dir = os.path.join(filesystem_prefix, "squad_data")

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        per_gpu_batch_size=per_gpu_batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        logger.info(f"Starting finetuning on {dataset}")
        pbar = tqdm.tqdm(total=total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            per_gpu_batch_size=per_gpu_batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    global train_step
    train_step = rewrap_tf_function(train_step)

    for step, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = ((step > 0) and step % checkpoint_frequency == 0) or is_final_step
            do_validate = ((step > 0) and step % validate_frequency == 0) or is_final_step
            do_evaluate = ((step > 0) and step % evaluate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                logger.info("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                logger.info(description)

            if do_evaluate:
                logger.info("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model,
                        tokenizer=tokenizer,
                        data_dir=data_dir,
                        filename=val_filename,
                        per_gpu_batch_size=32,
                    )
                print_eval_metrics(results=results, step=step, dataset=dataset)

            if do_checkpoint:
                # TODO: Abstract out to specify any checkpoint path
                checkpoint_path = os.path.join(
                    filesystem_prefix, f"checkpoints/squad/{run_name}-step{step}.ckpt"
                )
                logger.info(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                # TODO: Abstract out to specify any logs path
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(filesystem_prefix, f"logs/squad/{run_name}")
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                    # And the eval metrics
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step, dataset=dataset
                    )

        if is_final_step:
            break
    del train_dataset

    # Can we return a value only on a single rank?
    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished finetuning, job name {run_name}")
        return results
Example #19
import json
import torch

from distilbert_squad import DISTILBERT_SQUAD
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadResult, SquadV2Processor
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer

device = torch.device('cuda')
logger = SummaryWriter('logs/distilbert_model')

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
feature_processor = SquadV2Processor()
examples = feature_processor.get_train_examples('../data')

features, dataset = squad_convert_examples_to_features(examples=examples,
                                                       tokenizer=tokenizer,
                                                       max_seq_length=512,
                                                       doc_stride=128,
                                                       max_query_length=128,
                                                       is_training=True,
                                                       return_dataset="pt",
                                                       threads=1)

train_loader = DataLoader(dataset=dataset, batch_size=6, shuffle=True)
dev_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)
dbs = DISTILBERT_SQUAD().to(device)
num_epochs = 2
optimizer = torch.optim.Adam(dbs.parameters(), lr=.00003)
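The snippet stops before its training loop. A minimal sketch of what such a loop could look like, using transformers' stock DistilBertForQuestionAnswering in place of the custom DISTILBERT_SQUAD module (dbs), whose interface is not shown here:

from transformers import DistilBertForQuestionAnswering

model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased').to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

model.train()
for epoch in range(num_epochs):
    for step, batch in enumerate(train_loader):
        batch = tuple(t.to(device) for t in batch)
        # Training batches hold: input_ids, attention_mask, token_type_ids,
        # start_positions, end_positions, cls_index, p_mask, is_impossible.
        # DistilBERT takes no token_type_ids, so batch[2] is skipped.
        outputs = model(input_ids=batch[0],
                        attention_mask=batch[1],
                        start_positions=batch[3],
                        end_positions=batch[4])
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        logger.add_scalar('train/loss', loss.item(), epoch * len(train_loader) + step)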
Example #20
def train(model_prefix, model_dir, data_dir, data_file, epochs, layers,
          batch_size, hidden_dim, max_seq_length, device):

    # Extract examples
    tokenizer = AutoTokenizer.from_pretrained(model_prefix)
    processor = SquadV2Processor()
    train_examples = processor.get_train_examples(data_dir=data_dir,
                                                  filename=data_file)

    # Extract train features
    print("Loading train features")
    train_features, train_dataset = squad_convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=128,
        max_query_length=64,
        is_training=True,
        return_dataset="pt",
        threads=1,
    )

    # Initialize model
    config = AutoConfig.from_pretrained(model_prefix,
                                        output_hidden_states=True)
    model = AutoModelForQuestionAnswering.from_pretrained(model_prefix,
                                                          config=config)

    # multi-gpu evaluate
    model = torch.nn.DataParallel(model)

    # Initialize probes
    print("Initializing probes")
    probes = []
    for i in range(layers):
        p = Probe(hidden_dim)
        probes.append(p)

    # Training epochs
    for epoch in range(epochs):

        print("Training epoch: {}".format(epoch + 1))

        # Initialize train data loader
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=batch_size)

        # Training batches
        for batch in tqdm(train_dataloader, desc="Iteration"):

            # Get batch on the right device and prepare input dict
            batch = tuple(t.to(device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            # Distil does not use token type ids
            if "distil" in model_dir:
                inputs.pop('token_type_ids')

            # ALBERT/BERT/DistilBERT forward pass
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs)

            # Extract hidden states
            all_layer_hidden_states = outputs[3][
                1:]  # (layers, batch_size, max_seq_len, hidden_size)

            # Get labels, and update probes for batch
            start = batch[3]  # (batch_size)
            end = batch[4]  # (batch_size)

            for i, p in enumerate(probes):
                hiddens = all_layer_hidden_states[
                    i]  # (batch_size, max_seq_len, hidden_size)
                p.train(hiddens, start, end, device)

        # Save probes after each epoch
        print("Epoch complete, saving probes")
        epoch_dir = model_dir + "/epoch_" + str(epoch + 1)
        if not os.path.exists(epoch_dir):
            os.mkdir(epoch_dir)

        probes_dir = epoch_dir + "/probes"
        if not os.path.exists(probes_dir):
            os.mkdir(probes_dir)

        # Save probes for each layer, both start and end index
        for i, p in enumerate(probes):
            p.save(probes_dir, i + 1)