Example 1
def dataset():
    """
    Check that the data loaded in two instances is different.
    """
    # Assumes numpy (np), time, transformers and the repo-local helpers
    # (parse_bert_args, get_options, get_dataloader) are already imported
    args = "--config demo_tiny_128".split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)
    loader = get_dataloader(config, opts)

    # Save part of the data as list
    loader_list = list(loader)[0][0][0].numpy()

    # Use MPI to broadcast the data from rank 1 to rank 0
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    loader_list_copy = np.copy(loader_list)
    comm.Bcast(loader_list, root=1)

    # On rank 0, check that the broadcast data differs from the local copy
    if rank == 0 and not np.all(loader_list_copy == loader_list):
        print('Passed test: instances have different data')

    # Wait until both ranks are finished
    time.sleep(2)
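The example above depends on the repo's BERT dataloader, but the broadcast-and-compare pattern can be sketched in isolation. A minimal sketch, assuming only numpy and mpi4py, launched with e.g. mpirun -np 2 python bcast_check.py (the filename is illustrative):

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Each rank fills its buffer with rank-dependent data
data = np.full(8, rank, dtype=np.int32)
original = np.copy(data)

# Overwrite every rank's buffer with rank 1's data
comm.Bcast(data, root=1)

if rank == 0 and not np.array_equal(original, data):
    print("Ranks held different data before the broadcast")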
Example 2
def test_wikipedia_dataset():
    args = "--config demo_tiny_128".split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    config.vocab_size = 30522
    config.input_files = ["data/wikipedia/128/wiki_000.tfrecord"]

    num_tokens = 0
    # "103" is the [MASK] token id in the BERT base vocabulary
    replacement_counts = Counter({"103": 0, "same": 0, "random": 0})

    opts = get_options(config)
    loader = get_dataloader(config, opts)

    for datum in tqdm(loader):
        tokens, attn_mask, types, mask_lm_pos, labels, nsp = datum
        tokens = tokens.numpy()
        attn_mask = attn_mask.numpy()
        types = types.numpy()
        mask_lm_pos = mask_lm_pos.numpy()
        labels = labels.numpy()
        nsp = nsp.numpy()
        for b in range(config.micro_batch_size):
            check_dimensions(config, tokens[b], attn_mask[b], types[b],
                             mask_lm_pos[b], labels[b], nsp[b])
            check_tokens(config, tokens[b], mask_lm_pos[b], labels[b])
            check_attention_mask(attn_mask[b], tokens[b])
            check_mask_lm_positions(config, mask_lm_pos[b])
            check_labels(config, tokens[b], mask_lm_pos[b], labels[b])
            check_token_type(types[b])
            check_nsp(nsp[b])

            replacement_counts += mask_type_count(tokens[b], mask_lm_pos[b],
                                                  labels[b])

            # Number of tokens, not including padding
            num_tokens += attn_mask[b, attn_mask[b] == 1].shape[0]

    # Test masked-token proportions: BERT masks ~15% of tokens, of which
    # 80% become [MASK], 10% stay unchanged and 10% become a random token
    total = sum(replacement_counts.values())
    for k in replacement_counts:
        replacement_counts[k] /= total

    assert (0.79 < replacement_counts["103"] < 0.81)
    assert (0.09 < replacement_counts["same"] < 0.11)
    assert (0.09 < replacement_counts["random"] < 0.11)
    assert (0.14 < total / num_tokens < 0.16)  # should be ~0.15
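mask_type_count is a repo-local helper that the test relies on but the listing does not show. A plausible sketch of what it computes, assuming (not confirmed by the source) that the masked-LM position array is zero-padded and that token id 103 is [MASK]:

from collections import Counter

def mask_type_count(tokens, mask_lm_pos, labels):
    """Classify each masked position as [MASK], unchanged, or random."""
    counts = Counter({"103": 0, "same": 0, "random": 0})
    for pos, label in zip(mask_lm_pos, labels):
        if pos == 0:  # assumed zero-padding of unused positions
            continue
        token = tokens[pos]
        if token == 103:       # the [MASK] token id
            counts["103"] += 1
        elif token == label:   # token left unchanged
            counts["same"] += 1
        else:                  # token replaced by a random one
            counts["random"] += 1
    return counts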
Example 3
def test_recompute_checkpoint_not_in_ir():
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --weight-decay 0.0
    --recompute-checkpoint-every-layer False
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))

    assert config.recompute_checkpoint_every_layer is False

    # Execution parameters
    opts = get_options(config)
    model = PipelinedBertForPretraining(config).parallelize().half().train()
    optimizer = get_optimizer(config, model)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    # Compile model
    datum = get_generated_datum(config)
    poptorch_model.compile(*datum)
    ir = json.loads(poptorch_model._debugGetPopartIR())
    assert not any(["Checkpoint" in node["name"] for node in ir["maingraph"]
                    ]), ("Popart IR should contain a checkpoint")

    # Stashes: 5 graph inputs, plus 1 stash for the transformer layers on IPU 1
    exp_num_stash = 5 + 1
    num_stash = sum("Stash" in node["type"] for node in ir["maingraph"])
    assert num_stash == exp_num_stash, \
        "Both the graph inputs and the checkpoint(s) should be stashed"
    print(num_stash)
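The test reads the PopART IR through the private _debugGetPopartIR() hook, which returns the IR as a JSON string. A small hedged helper (names are illustrative, not part of the PopTorch API) makes the two counts above reusable:

import json

def count_ir_nodes(poptorch_model, substring, field="type"):
    """Count main-graph nodes whose `field` value contains `substring`."""
    ir = json.loads(poptorch_model._debugGetPopartIR())
    return sum(substring in node[field] for node in ir["maingraph"])

# Usage in the test above:
#   count_ir_nodes(poptorch_model, "Stash") == 6
#   count_ir_nodes(poptorch_model, "Checkpoint", field="name") == 0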
Example 4
def main():
    config = transformers.BertConfig(**(vars(parse_bert_args())))
    if not config.pretrained_checkpoint:
        logger(
            "[warning] --pretrained-checkpoint was not specified; training with uninitialized BERT..."
        )
    # Warnings for configs where embeddings may not fit
    if config.embedding_serialization_factor == 1:
        if config.replication_factor == 1:
            logger(
                "[warning] With replication_factor == 1 you may need to set "
                "embedding_serialization_factor > 1 for the model to fit")
        elif not config.replicated_tensor_sharding:
            logger(
                "[warning] With replicated_tensor_sharding=False you may need to set "
                "embedding_serialization_factor > 1 for the model to fit")
    samples_per_step = config.batches_per_step * config.micro_batch_size * \
        config.gradient_accumulation * config.replication_factor
    do_training = config.squad_do_training
    do_validation = config.squad_do_validation
    opts = get_options(config)
    opts.outputMode(poptorch.OutputMode.All)

    logger("Loading Dataset...")
    datasets = load_dataset("squad")
    train_dataset = datasets["train"]

    # Create train features from dataset
    logger("Tokenizing Train Dataset...")
    train_dataset = train_dataset.map(
        prepare_train_features,
        batched=True,
        num_proc=1,
        remove_columns=train_dataset.column_names,
        load_from_cache_file=True,
    )

    # Create validation features from dataset
    logger("Tokenizing Validation Dataset...")
    validation_features = datasets["validation"].map(
        prepare_validation_features,
        batched=True,
        num_proc=1,
        remove_columns=datasets["validation"].column_names,
        load_from_cache_file=True,
    )

    # W&B
    if config.wandb and (not config.use_popdist or config.popdist_rank == 0):
        wandb.init(project="torch-bert",
                   settings=wandb.Settings(console="wrap"))
        wandb_config = vars(config)
        wandb_config['sdk_version'] = get_sdk_version()
        wandb.config.update(wandb_config)

    # Create the model
    if config.pretrained_checkpoint:
        model_ipu = PipelinedBertForQuestionAnswering.from_pretrained(
            config.pretrained_checkpoint, config=config).parallelize().half()
    else:
        model_ipu = PipelinedBertForQuestionAnswering(
            config).parallelize().half()

    if do_training:
        train_dl = poptorch.DataLoader(
            opts,
            train_dataset,
            batch_size=config.micro_batch_size,
            shuffle=True,
            drop_last=False,
            collate_fn=PadCollate(
                samples_per_step, {
                    "input_ids": 0,
                    "attention_mask": 0,
                    "token_type_ids": 0,
                    "start_positions": config.sequence_length,
                    "end_positions": config.sequence_length
                }))
        optimizer = get_optimizer(config, model_ipu)
        model_ipu.train()
        training_model = poptorch.trainingModel(model_ipu, opts, optimizer)

        sample_batch = next(iter(train_dl))
        logger("Compiling Model...")
        start_compile = time.perf_counter()
        training_model.compile(sample_batch["input_ids"],
                               sample_batch["attention_mask"],
                               sample_batch["token_type_ids"],
                               sample_batch["start_positions"],
                               sample_batch["end_positions"])

        duration_compilation = time.perf_counter() - start_compile
        logger(f"Compiled/Loaded model in {duration_compilation} secs")

        if config.compile_only:
            sys.exit()

        # Train
        scheduler = get_lr_scheduler(optimizer, "linear", config.lr_warmup,
                                     config.num_epochs * len(train_dl))
        logger("Training...")
        for epoch in range(config.num_epochs):
            for step, batch in enumerate(train_dl):
                start_step = time.perf_counter()
                outputs = training_model(batch["input_ids"],
                                         batch["attention_mask"],
                                         batch["token_type_ids"],
                                         batch["start_positions"],
                                         batch["end_positions"])

                scheduler.step()
                training_model.setOptimizer(optimizer)
                step_length = time.perf_counter() - start_step
                step_throughput = samples_per_step / step_length
                loss = outputs[0].mean().item()
                logger(
                    f"Epoch: {epoch}, Step:{step}, LR={scheduler.get_last_lr()[0]:.2e}, loss={loss:3.3f}, throughput={step_throughput:3.3f} samples/s"
                )

                if config.wandb:
                    wandb.log({
                        "Loss": loss,
                        "LR": scheduler.get_last_lr()[0],
                        "Step": step,
                        "Throughput": step_throughput
                    })
        training_model.detachFromDevice()

    if do_validation:
        # Use a small single-replica configuration for validation
        config.micro_batch_size = 2
        config.batches_per_step = 16
        config.gradient_accumulation = 1
        config.replication_factor = 1
        samples_per_step = config.batches_per_step * config.micro_batch_size * \
            config.gradient_accumulation * config.replication_factor
        opts = get_options(config)
        opts.outputMode(poptorch.OutputMode.All)
        val_dl = poptorch.DataLoader(opts,
                                     validation_features.remove_columns(
                                         ['example_id', 'offset_mapping']),
                                     batch_size=config.micro_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     collate_fn=default_data_collator)
        raw_predictions = [[], []]
        model_ipu.eval()
        inference_model = poptorch.inferenceModel(model_ipu, opts)
        sample_batch = next(iter(val_dl))
        logger("Compiling Inference Model...")
        inference_model.compile(sample_batch["input_ids"],
                                sample_batch["attention_mask"],
                                sample_batch["token_type_ids"])

        if config.compile_only:
            sys.exit()

        logger("Validating...")
        for step, batch in enumerate(val_dl):
            start_step = time.perf_counter()
            outputs = inference_model(batch["input_ids"],
                                      batch["attention_mask"],
                                      batch["token_type_ids"])
            step_length = time.perf_counter() - start_step
            step_throughput = samples_per_step / step_length
            raw_predictions[0].append(outputs[0])
            raw_predictions[1].append(outputs[1])
            logger(f"Step:{step}, throughput={step_throughput} samples/s")

        raw_predictions[0] = torch.vstack(raw_predictions[0]).float().numpy()
        raw_predictions[1] = torch.vstack(raw_predictions[1]).float().numpy()
        final_predictions = postprocess_qa_predictions(datasets["validation"],
                                                       validation_features,
                                                       raw_predictions)
        metric = load_metric("squad")
        formatted_predictions = [{
            "id": k,
            "prediction_text": v
        } for k, v in final_predictions.items()]
        references = [{
            "id": ex["id"],
            "answers": ex["answers"]
        } for ex in datasets["validation"]]
        metrics = metric.compute(predictions=formatted_predictions,
                                 references=references)
        logger(metrics)
        if config.wandb:
            for k, v in metrics.items():
                wandb.run.summary[k] = v
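The training dataloader above uses PadCollate so that the final, shorter batch is padded up to a full step's worth of samples and the compiled model always sees fixed-shape input. A minimal sketch of that idea (assumption: the repo's actual PadCollate may differ in detail):

import torch
from transformers import default_data_collator

class PadCollateSketch:
    """Pad the last short batch up to total_batch_size samples."""

    def __init__(self, total_batch_size, pad_values):
        self.total_batch_size = total_batch_size
        self.pad_values = pad_values  # per-field fill value, e.g. {"input_ids": 0}

    def __call__(self, features):
        batch = default_data_collator(features)
        missing = self.total_batch_size - next(iter(batch.values())).shape[0]
        if missing > 0:
            for name, tensor in batch.items():
                fill = torch.full((missing, *tensor.shape[1:]),
                                  self.pad_values.get(name, 0),
                                  dtype=tensor.dtype)
                batch[name] = torch.cat([tensor, fill])
        return batch

Padding start_positions and end_positions with config.sequence_length (as main() does) gives the padded samples an out-of-range answer position, which the Hugging Face QA head clamps and ignores in its loss, so the padding contributes no gradient signal.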
Example 5
import poptorch
import transformers

from args import parse_args
from datasets import dataset
from ipu_options import get_options
from log import logger
from metrics import accuracy
from model import PipelinedViTForImageClassification

if __name__ == "__main__":
    # Validation loop
    # Build config from args
    config = transformers.ViTConfig(**vars(parse_args()))
    logger.info(f"Running config: {config.config}")

    # Execution parameters
    opts = get_options(config)

    test_loader = dataset.get_data(config,
                                   opts,
                                   train=False,
                                   async_dataloader=True)

    # Init from a checkpoint
    model = PipelinedViTForImageClassification.from_pretrained(
        config.pretrained_checkpoint,
        config=config).parallelize().eval()
    if config.precision.startswith("16."):
        model.half()

    # Separate execution parameters for the validation run
    valid_opts = poptorch.Options()
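The listing ends right after valid_opts is created; the loop itself is not shown. A hedged sketch of how such a validation loop typically finishes (assumptions: test_loader yields (input, labels) pairs and the model returns class logits; the repo's imported accuracy helper is not shown, so accuracy is computed inline):

inference_model = poptorch.inferenceModel(model, valid_opts)

num_correct = total = 0
for input_data, labels in test_loader:
    logits = inference_model(input_data)  # forward pass on the IPU
    preds = logits.argmax(dim=-1)
    num_correct += (preds == labels).sum().item()
    total += labels.size(0)

logger.info(f"Validation accuracy: {num_correct / total:.4f}")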