示例#1
0
def test_tracker_goal_times(mocker):
    """Per-phase batch timings accumulate correctly and the goal triggers.

    Runs two identical batches under frozen time, each phase taking 0.5s,
    and checks the communication/compute totals after each batch. Finally
    records a validation stat at the goal threshold and checks that a
    ``TaskResult`` entry was logged through the patched ``LogMetrics``.
    """
    patched = mocker.patch('mlbench_core.utils.tracker.LogMetrics')

    metric = TopKAccuracy(1)
    tracker = Tracker([metric], 1, 0, task1_time_to_accuracy_light_goal)

    tracker.start()

    assert tracker.start_time is not None

    tracker.train()

    phases = ('init', 'fwd_pass', 'comp_loss', 'backprop', 'opt_step')
    half_second = datetime.timedelta(seconds=0.5)

    with freeze_time(datetime.datetime.now()) as frozen:
        # After batch 1: 0.5s communication, 1.5s compute.
        # After batch 2: totals double to 1.0s / 3.0s.
        for expected_comm, expected_comp in ((0.5, 1.5), (1.0, 3.0)):
            tracker.batch_start()
            for phase in phases:
                frozen.tick(delta=half_second)
                tracker.record_batch_step(phase)
            frozen.tick(delta=half_second)
            tracker.batch_end()

            assert abs(tracker.get_total_communication_time()
                       - expected_comm) < 0.01
            assert abs(tracker.get_total_compute_time()
                       - expected_comp) < 0.01

        tracker.validation()
        tracker.record_stat('global_Prec@1', 70, log_to_api=True)

        assert tracker.goal_reached
        # A TaskResult must have been reported via LogMetrics.
        assert any(c[1][3] == 'TaskResult' for c in patched.method_calls)
示例#2
0
def test_tracker_goal(mocker):
    """The goal only reacts to stats recorded while in validation mode."""
    mocker.patch('mlbench_core.utils.tracker.LogMetrics')

    tracker = Tracker([TopKAccuracy(1)], 1, 0,
                      task1_time_to_accuracy_light_goal)

    tracker.start()

    assert tracker.start_time is not None

    tracker.train()

    # While training, neither a below- nor an at-threshold value
    # may mark the goal as reached.
    for accuracy in (69, 70):
        tracker.record_stat('global_Prec@1', accuracy, log_to_api=True)
        tracker.batch_end()

        assert not tracker.goal_reached

    tracker.validation()

    # In validation mode a below-threshold value still does not count...
    tracker.record_stat('global_Prec@1', 69, log_to_api=True)
    tracker.batch_end()

    assert not tracker.goal_reached

    # ...but the threshold value does.
    tracker.record_stat('global_Prec@1', 70, log_to_api=True)

    assert tracker.goal_reached
示例#3
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Train loop for the GNMT translation task on WMT16 (en-de).

    Uses gradient accumulation (``update_freq``) to reach a global batch
    size of 2048, optional fp16 loss scaling (via horovod when the MPI
    backend is active), and validates both every ~30% of an epoch and at
    each epoch end.

    Args:
        run_id: Unique identifier of this run, forwarded to the tracker.
        dataset_dir: Directory holding (or receiving) the WMT16 dataset.
        ckpt_run_dir: Directory for checkpoints of this run.
        output_dir: Directory where train/val stats JSON files are written
            in ``validation_only`` mode.
        validation_only (bool): Skip training and only evaluate existing
            checkpoints epoch by epoch.
        use_cuda (bool): Move model and criterion to GPU.
        light_target (bool): Use the lighter BLEU goal (20 instead of 24).
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    train_epochs = 8
    train_min_len, train_max_len = 0, 75
    val_min_len, val_max_len = 0, 150
    math_mode = "fp16"  # One of `fp16`, `fp32`
    lang = ("en", "de")

    # Training
    train_global_batch_size = 2048  # Global batch size
    max_bs = 128  # Max batch size for used hardware
    update_freq = int(max(1, train_global_batch_size // (max_bs * world_size)))
    train_batch_size = int(train_global_batch_size // (world_size * update_freq))
    val_batch_size = 64

    # Model attributes
    model_args = {
        "hidden_size": 1024,
        "num_layers": 4,
        "dropout": 0.2,
        "share_embedding": True,
        "fusion": True,
    }

    # Criterion
    criterion_args = {"smoothing": 0.1, "fast_xentropy": True}

    # Loss scaling
    loss_scaling = {"init_scale": 1024, "upscale_interval": 128}

    # Optimizer
    optimizer_args = {
        "lr": 2e-3,
        "grad_clip": 5.0,
    }

    # Scheduler
    scheduler_args = {
        "warmup_steps": 200,
        "remain_steps": 0.4,
        "decay_interval": 0.05,
        "decay_steps": 4,
        "decay_factor": 0.5,
    }

    # Translator
    translator_args = {
        "beam_size": 5,
        "len_norm_factor": 0.6,
        "cov_penalty_factor": 0.1,
        "len_norm_const": 5.0,
        "max_seq_len": 150,
    }

    # Build train/val datasets
    train_set = WMT16Dataset(
        dataset_dir,
        math_precision=math_mode,
        lang=lang,
        train=True,
        download=True,
        preprocessed=True,
        min_len=train_min_len,
        max_len=train_max_len,
    )
    train_set.prepare()
    val_set = WMT16Dataset(
        dataset_dir,
        math_precision=math_mode,
        lang=lang,
        validation=True,
        download=False,
        min_len=val_min_len,
        max_len=val_max_len,
        sort=True,
    )

    tokenizer = train_set.tokenizer

    # Build model
    model = GNMT(vocab_size=train_set.vocab_size, **model_args)

    # Build loss function
    criterion = LabelSmoothing(padding_idx=wmt16_config.PAD, **criterion_args)

    # Bilingual Evaluation Understudy Score
    metrics = [BLEUScore()]

    # Partition data
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    collate_fn = build_collate_fn(sort=True)
    train_loader = DataLoader(
        train_set,
        batch_size=train_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        pin_memory=True,
        drop_last=False,
        shuffle=True,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=val_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        pin_memory=True,
        drop_last=False,
    )

    # Validate every ~30% of an epoch, rounded to a multiple of update_freq.
    # Guarded with max() so very small loaders never yield 0 (which would
    # make the `% validate_every` check below raise ZeroDivisionError).
    validate_every = max(
        update_freq,
        update_freq * round(len(train_loader) * 0.30 / update_freq),
    )

    # Build optimizer & scheduler
    total_train_iters = (len(train_loader) // update_freq) * train_epochs

    print("Number of batches per epoch {}".format(len(train_loader)))
    print("Train iterations per epoch {}".format(total_train_iters / train_epochs))

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    use_horovod = math_mode == "fp16" and dist.get_backend() == dist.Backend.MPI

    if use_horovod:
        hvd.init()
        logger.info("Using horovod rank={}".format(hvd.rank()))
        # Sanity check: an allreduce-sum of 1 over all workers must equal
        # the world size.
        tensor = torch.tensor([1])
        res = hvd.allreduce(tensor, op=hvd.Sum)
        assert res[0] == world_size

    fp_optimizer, optimizer, model = build_optimizer(
        model=model,
        math=math_mode,
        loss_scaling=loss_scaling,
        use_cuda=use_cuda,
        use_horovod=use_horovod,
        **optimizer_args
    )

    # Create a learning rate scheduler for an optimizer
    scheduler = ExponentialWarmupMultiStepLR(
        optimizer, total_train_iters, **scheduler_args
    )

    # Translator
    translator = Translator(model=model, trg_tokenizer=tokenizer, **translator_args)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.BEST
    )

    if not validation_only:

        if light_target:
            goal = task4_time_to_bleu_goal(20)
        else:
            goal = task4_time_to_bleu_goal(24)

        num_batches_per_device_train = len(train_loader)
        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            model.train()
            tracker.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(data, target, use_cuda=use_cuda)
                tracker.record_batch_load()

                # enumerate() is 0-based, so the last batch has index
                # len(train_loader) - 1 (comparing against len() directly
                # would never be true and the final partial accumulation
                # step would be skipped).
                is_last = batch_idx == len(train_loader) - 1
                update = (batch_idx % update_freq) == update_freq - 1
                init = (batch_idx % update_freq) == 0

                # Clear gradients in the optimizer.
                if init:
                    fp_optimizer.zero_grad()
                    tracker.record_batch_init()

                # Compute the output
                output = compute_model_output(model, data, target)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss, loss_per_token = compute_loss(
                    data, target, output, criterion, update_freq
                )
                tracker.record_batch_comp_loss()
                # Backprop
                fp_optimizer.backward_loss(loss)
                tracker.record_batch_backprop()

                # Opt step
                if update or is_last:
                    # For this task, simply sum all gradients
                    updated = fp_optimizer.step(tracker=tracker, denom=1)

                    # Learning rate scheduler
                    if updated:
                        scheduler.step()

                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx=batch_idx,
                    loss=loss_per_token,
                    output=target[0],  # Use target just for the size
                    metric_results={},
                    tracker=tracker,
                    num_batches_per_device_train=num_batches_per_device_train,
                )

                # Validation during training
                if (batch_idx + 1) % validate_every == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    metrics_values, loss = validation_round(
                        val_loader,
                        metrics,
                        model,
                        criterion,
                        update_freq,
                        translator,
                        tracker=tracker,
                        use_cuda=use_cuda,
                    )

                    record_validation_stats(metrics_values, loss, tracker, rank)
                    if tracker.goal_reached:
                        break

                    model.train()
                    tracker.train()

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # End-of-epoch validation and checkpointing.
            metrics_values, loss = validation_round(
                val_loader,
                metrics,
                model,
                criterion,
                update_freq,
                translator,
                use_cuda=use_cuda,
            )

            is_best = record_validation_stats(metrics_values, loss, tracker, rank)

            checkpointer.save(
                tracker,
                model,
                fp_optimizer.optimizer,
                scheduler,
                tracker.current_epoch,
                is_best,
            )

            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                # Give other workers time to observe the barrier before exit.
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=criterion,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
示例#4
0
def run(rank, size, run_id):
    """Distributed synchronous SGD example.

    Trains a small model with all-reduced gradients for 10 epochs,
    validating after each epoch, and stops early once the accuracy goal
    is reached.

    Args:
        rank: Rank of the current worker, forwarded to the tracker.
        size: World size (unused in this body; kept for the standard
            process entry-point signature).
        run_id: Unique identifier of this run, forwarded to the tracker.
    """
    torch.manual_seed(1234)
    logging.info("Loading Dataset")
    train_set, bsz = partition_dataset_train()
    val_set, bsz_val = partition_dataset_val()
    logging.info("Setting up models and training")
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    metrics = [
        TopKAccuracy(topk=1),
        TopKAccuracy(topk=5)
    ]
    loss_func = nn.NLLLoss()

    goal = task1_time_to_accuracy_goal()

    tracker = Tracker(metrics, run_id, rank, goal=goal)

    num_batches = ceil(len(train_set.dataset) / float(bsz))
    num_batches_val = ceil(len(val_set.dataset) / float(bsz_val))

    tracker.start()

    logging.info("Starting train loop")

    for epoch in range(10):
        tracker.train()

        epoch_loss = 0.0
        for i, (data, target) in enumerate(train_set):
            tracker.batch_start()

            optimizer.zero_grad()
            output = model(data)

            tracker.record_batch_step('forward')

            loss = loss_func(output, target)
            # .item() on the tensor directly; `.data` is legacy autograd API.
            epoch_loss += loss.item()

            tracker.record_batch_step('loss')

            loss.backward()

            tracker.record_batch_step('backward')

            # Synchronous SGD: average gradients across workers before the step.
            average_gradients(model)
            optimizer.step()

            tracker.batch_end()

            logging.info("Batch: {}, Loss: {}".format(i, loss.item()))

        tracker.record_loss(epoch_loss, num_batches, log_to_api=True)

        logging.debug('Rank %s, epoch %s: %s',
                      dist.get_rank(), epoch,
                      epoch_loss / num_batches)

        # Keep the result in `metrics_values`: assigning it back to `metrics`
        # would clobber the list of metric objects and pass metric *values*
        # into validation_round on the next epoch.
        metrics_values, loss = validation_round(val_set, model, loss_func,
                                                metrics, "fp32",
                                                tracker=tracker,
                                                transform_target_type=False,
                                                use_cuda=False,
                                                max_batches=num_batches_val)
        record_validation_stats(metrics_values, loss, tracker=tracker,
                                rank=rank)

        tracker.epoch_end()

        if tracker.goal_reached:
            logging.debug("Goal Reached!")
            return
示例#5
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
    seed=42,
):
    """Train loop for the LSTM language-modeling task on Wikitext-2.

    Trains with SGD under a linear LR warmup, and switches the optimizer to
    ASGD once the validation loss stops improving over a window of
    ``nonmono`` epochs. Stops once the perplexity goal is reached.

    Args:
        run_id: Unique identifier of this run, forwarded to the tracker.
        dataset_dir: Directory holding the Wikitext-2 dataset.
        ckpt_run_dir: Directory for checkpoints of this run.
        output_dir: Not used in this body; kept for a uniform train-loop
            signature.
        validation_only (bool): Not used in this body; kept for a uniform
            signature.
        use_cuda (bool): Move model/criterion/batches to GPU.
        light_target (bool): Use the lighter perplexity goal (90 vs 70).
        seed: Not used in this body; kept for a uniform signature.
    """
    train_epochs = 750

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Using batch scaling
    train_batch_size = 80
    train_global_batch_size = train_batch_size * world_size

    val_batch_size = 10
    # Define the batch sizes here

    # Dataset arguments
    bptt = 70
    min_seq_len = 5

    # Model Arguments
    model_args = {
        "ninp": 400,
        "nhid": 1150,
        "nlayers": 3,
        "dropout": 0.4,
        "dropouth": 0.2,
        "dropouti": 0.65,
        "dropoute": 0.1,
        "wdrop": 0.5,
        "tie_weights": True,
    }

    # Optimizer args
    lr = 30
    # LR is scaled by sqrt(world_size); warmup grows with world size.
    scaled_lr = lr * math.sqrt(world_size)
    warmup_epochs = 5 * world_size
    weight_decay = 1.2e-6
    grad_clip = 0.25
    alpha = 2
    beta = 1
    nonmono = 5

    # Load train/valid
    train_set = Wikitext2Dataset(dataset_dir,
                                 bptt=bptt,
                                 train=True,
                                 min_seq_len=min_seq_len)
    val_set = Wikitext2Dataset(dataset_dir,
                               bptt=bptt,
                               valid=True,
                               min_seq_len=min_seq_len)
    ntokens = len(train_set.dictionary)

    # Generate batches
    train_set.generate_batches(global_bsz=train_global_batch_size,
                               worker_bsz=train_batch_size,
                               rank=rank)

    val_set.generate_batches(val_batch_size)
    val_set.generate_sequence_lengths()

    logger.info("Built dictionary of {} tokens".format(ntokens))

    model = LSTMLanguageModel(ntokens, **model_args)
    criterion = CrossEntropyLoss(reduction="mean")
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = SGD(model.parameters(),
                    lr=scaled_lr,
                    weight_decay=weight_decay)
    c_optimizer = CustomCentralizedOptimizer(
        model=model,
        optimizer=optimizer,
        use_cuda=use_cuda,
        agg_grad=True,
        grad_clip=grad_clip,
        world_size=world_size,
    )

    scheduler = LRLinearWarmUp(
        optimizer,
        init_lr=lr / world_size,
        scaled_lr=scaled_lr,
        warmup_duration=warmup_epochs,
    )
    metrics = [Perplexity()]

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir,
                                rank=rank,
                                freq=CheckpointFreq.NONE)

    if light_target:
        goal = task3_time_to_perplexity_goal(90)
    else:
        goal = task3_time_to_perplexity_goal(70)

    # minimize=True: lower perplexity is better.
    tracker = Tracker(metrics, run_id, rank, goal=goal, minimize=True)

    dist.barrier()
    tracker.start()

    val_losses = []
    for epoch in range(0, train_epochs):
        model.train()
        tracker.train()

        # Init hidden state
        hidden = model.init_hidden(train_batch_size)

        # Set random sequence lengths for epoch
        set_sequence_lengths(train_set, random=True)

        num_batches_per_device_train = train_set.num_batches()

        for batch_idx in range(num_batches_per_device_train):
            tracker.batch_start()
            data, targets = train_set.get_batch(batch_idx, cuda=use_cuda)

            # Scale the LR proportionally to the actual sequence length
            # relative to bptt, restoring it after the step below.
            seq_len = data.size(0)
            lr_original = optimizer.param_groups[0]["lr"]
            batch_lr = lr_original * seq_len / bptt
            optimizer.param_groups[0]["lr"] = batch_lr

            # Carry hidden state across batches; repackage_hidden presumably
            # detaches it from the previous batch's graph — confirm in helper.
            hidden = repackage_hidden(hidden)
            c_optimizer.zero_grad()
            tracker.record_batch_init()

            output, hidden, raw_outputs, outputs = model(data,
                                                         hidden,
                                                         return_h=True)
            tracker.record_batch_fwd_pass()

            loss = criterion(output, targets)
            # Activation regularization
            loss = loss + sum(alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in outputs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in raw_outputs[-1:])
            tracker.record_batch_comp_loss()

            loss.backward()
            tracker.record_batch_backprop()

            c_optimizer.step(tracker=tracker)

            # Restore the base LR after the sequence-length scaling above.
            optimizer.param_groups[0]["lr"] = lr_original
            metrics_results = compute_train_batch_metrics(
                output,
                targets,
                metrics,
            )

            tracker.record_batch_comp_metrics()

            tracker.batch_end()

            record_train_batch_stats(
                batch_idx,
                loss.item(),
                output,
                metrics_results,
                tracker,
                num_batches_per_device_train,
            )
        tracker.epoch_end()

        # Still in regular SGD
        if type(c_optimizer.optimizer) == SGD:
            metrics_values, loss = validation_round(
                val_set,
                model=model,
                batch_size=val_batch_size,
                metrics=metrics,
                loss_function=criterion,
                tracker=tracker,
                use_cuda=use_cuda,
            )
            scheduler.step()
            logger.info("Using LR={}".format(scheduler.get_last_lr()))
            # Switch to ASGD once val loss stops improving over the last
            # `nonmono` epochs (AWD-LSTM-style non-monotone trigger).
            if len(val_losses) > nonmono and loss > min(val_losses[:-nonmono]):
                logger.info("Switching optimizer to ASGD")
                optimizer = ASGD(
                    params=model.parameters(),
                    lr=scheduler.get_last_lr()[0],
                    lambd=0.0,
                    t0=0,
                    weight_decay=weight_decay,
                )
                c_optimizer.optimizer = optimizer

        # Switched to ASGD, no scheduling
        else:
            # Validate with ASGD's averaged parameters ("ax" state), then
            # restore the raw weights afterwards.
            tmp = {}
            for prm in model.parameters():
                tmp[prm] = prm.data.clone()
                prm.data = optimizer.state[prm]["ax"].clone()

            metrics_values, loss = validation_round(
                val_set,
                model=model,
                batch_size=val_batch_size,
                metrics=metrics,
                loss_function=criterion,
                tracker=tracker,
                use_cuda=use_cuda,
            )

            for prm in model.parameters():
                prm.data = tmp[prm].clone()
        val_losses.append(loss)

        # Record validation stats
        is_best = record_validation_stats(metrics_values=metrics_values,
                                          loss=loss,
                                          tracker=tracker,
                                          rank=rank)
        checkpointer.save(tracker, model, optimizer, scheduler,
                          tracker.current_epoch, is_best)
        if tracker.goal_reached:
            print("Goal Reached!")
            dist.barrier()
            time.sleep(10)
            return
示例#6
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Train loop for logistic regression on the epsilon dataset.

    Trains with centralized SGD (weights aggregated after the local update)
    and a plateau-based LR scheduler, validating once per epoch and stopping
    early when the accuracy goal is reached.

    Args:
        run_id: Unique identifier of this run, forwarded to the tracker.
        dataset_dir: Directory holding the LMDB-packed epsilon dataset.
        ckpt_run_dir: Directory for checkpoints of this run.
        output_dir: Directory where train/val stats JSON files are written
            in ``validation_only`` mode.
        validation_only (bool): Skip training and only evaluate existing
            checkpoints epoch by epoch.
        use_cuda (bool): Move model and loss function to GPU.
        light_target (bool): Use the lighter accuracy goal.
    """
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 20
    batch_size = 100

    n_features = 2000

    l1_coef = 0.0
    l2_coef = 0.0000025  # Regularization 1 / train_size ( 1 / 400,000)
    dtype = "fp32"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    lr = 4
    # LR scaling is capped at 16 workers.
    scaled_lr = lr * min(16, world_size)

    by_layer = False
    agg_grad = False  # According to paper, we aggregate weights after update

    model = LogisticRegression(n_features)

    # A loss_function for computing the loss
    loss_function = BCELossRegularized(l1=l1_coef, l2=l2_coef, model=model)

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    optimizer = CentralizedSGD(
        world_size=world_size,
        model=model,
        lr=scaled_lr,
        use_cuda=use_cuda,
        by_layer=by_layer,
        agg_grad=agg_grad,
    )

    metrics = [
        TopKAccuracy(),  # Binary accuracy with threshold 0.5
        F1Score(),
        DiceCoefficient(),
    ]

    train_set = LMDBDataset(name="epsilon",
                            data_type="train",
                            root=dataset_dir)
    val_set = LMDBDataset(name="epsilon", data_type="test", root=dataset_dir)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    num_batches_per_device_train = len(train_loader)

    # Decay the LR on validation-loss plateaus, never going below the
    # unscaled base LR.
    scheduler = ReduceLROnPlateau(
        optimizer.optimizer,
        factor=0.75,
        patience=0,
        verbose=True,
        threshold_mode="abs",
        threshold=0.01,
        min_lr=lr,
    )
    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir,
                                rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task2_time_to_accuracy_light_goal()
        else:
            goal = task2_time_to_accuracy_goal()

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            # Set tracker and model in training mode
            model.train()
            tracker.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(
                    data,
                    target,
                    dtype=dtype,
                    transform_target_dtype=False,
                    use_cuda=use_cuda,
                )
                tracker.record_batch_load()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step(tracker=tracker)

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )

                tracker.record_batch_comp_metrics()

                # scheduler.batch_step()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            tracker.epoch_end()

            # Perform validation and gather results
            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype=dtype,
                tracker=tracker,
                transform_target_type=False,
                use_cuda=use_cuda,
                max_batches=max_batch_per_epoch,
            )
            # Scheduler per epoch
            scheduler.step(loss)
            # Record validation stats
            is_best = record_validation_stats(metrics_values=metrics_values,
                                              loss=loss,
                                              tracker=tracker,
                                              rank=rank)
            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)
            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                # Give other workers time to observe the barrier before exit.
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
示例#7
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Train loop for ResNet-20 on CIFAR-10 with centralized SGD.

    Trains with gradient aggregation across workers, an adaptive LR warmup
    followed by plateau-based decay, validates once per epoch, and stops
    early when the accuracy goal is reached.

    Args:
        run_id: Unique identifier of this run, forwarded to the tracker.
        dataset_dir: Directory holding (or receiving) CIFAR-10.
        ckpt_run_dir: Directory for checkpoints of this run.
        output_dir: Directory where train/val stats JSON files are written
            in ``validation_only`` mode.
        validation_only (bool): Skip training and only evaluate existing
            checkpoints epoch by epoch.
        use_cuda (bool): Move model and loss function to GPU.
        light_target (bool): Use the lighter accuracy goal.
    """
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128
    dtype = "fp32"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # LR = 0.1 / 256 / sample
    lr = 0.02
    scaled_lr = lr * world_size
    by_layer = False

    # Create Model
    model = ResNetCIFAR(resnet_size=20,
                        bottleneck=False,
                        num_classes=10,
                        version=1)

    # Create optimizer
    optimizer = CentralizedSGD(
        world_size=world_size,
        model=model,
        lr=lr,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=False,
        use_cuda=use_cuda,
        by_layer=by_layer,
    )

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    # Create train/validation sets and loaders
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    # Create a learning rate scheduler for an optimizer:
    # warm up from `lr` to `scaled_lr`, then decay on validation-loss
    # plateaus, never going below the unscaled base LR.
    scheduler = ReduceLROnPlateauWithWarmup(
        optimizer.optimizer,
        warmup_init_lr=lr,
        scaled_lr=scaled_lr,
        warmup_epochs=int(math.log(world_size, 2)),  # Adaptive warmup period
        factor=0.5,
        threshold_mode="abs",
        threshold=0.01,
        patience=1,
        verbose=True,
        min_lr=lr,
    )

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir,
                                rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal()
        else:
            goal = task1_time_to_accuracy_goal()

        num_batches_per_device_train = len(train_loader)

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()

        tracker.start()

        for epoch in range(0, train_epochs):
            # Set tracker and model in training mode
            model.train()
            tracker.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(
                    data,
                    target,
                    dtype=dtype,
                    transform_target_dtype=False,
                    use_cuda=use_cuda,
                )
                tracker.record_batch_load()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step(tracker=tracker)

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )
                tracker.record_batch_comp_metrics()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            # Scheduler per epoch
            tracker.epoch_end()

            # Perform validation and gather results
            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype=dtype,
                tracker=tracker,
                transform_target_type=False,
                use_cuda=use_cuda,
                max_batches=max_batch_per_epoch,
            )
            scheduler.step(loss)

            # Record validation stats
            is_best = record_validation_stats(metrics_values=metrics_values,
                                              loss=loss,
                                              tracker=tracker,
                                              rank=rank)

            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                # Give other workers time to observe the barrier before exit.
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
示例#8
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
    by_layer=False,
):
    r"""Train a ResNet-20 on CIFAR-10 with DDP, or evaluate saved checkpoints.

    When ``validation_only`` is False, runs the full distributed training
    loop (164 epochs, per-batch tracker instrumentation, per-epoch
    validation and checkpointing) and returns early once the accuracy goal
    is reached.  When True, replays previously saved checkpoints and dumps
    train/val statistics as JSON into ``output_dir``.

    Args:
        run_id: Identifier passed to the ``Tracker`` for result reporting.
        dataset_dir: Directory where CIFAR-10 is stored / downloaded to.
        ckpt_run_dir: Directory for reading/writing checkpoints.
        output_dir: Directory for the JSON stats files (validation-only mode).
        validation_only (bool): Evaluate checkpoints instead of training.
        use_cuda (bool): Run model / loss / data on GPU.
        light_target (bool): Use the lighter time-to-accuracy goal.
        by_layer (bool): Unused in this function body — presumably consumed
            by a sibling variant; TODO confirm before removing.
    """
    num_parallel_workers = 2
    train_epochs = 164
    batch_size = 128

    # Distributed context: torch.distributed must already be initialized.
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    current_device = cuda.current_device()

    # ResNet-20 v1 (no bottleneck blocks) for 10-class CIFAR, placed on this
    # rank's device before being wrapped for distributed data parallelism.
    local_model = ResNetCIFAR(resnet_size=20,
                              bottleneck=False,
                              num_classes=10,
                              version=1).to(current_device)
    model = DDP(local_model, device_ids=[current_device])

    optimizer = SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
    )

    # Create a learning rate scheduler for an optimizer.
    # Decays lr by 10x at epochs 82 and 109 — looks like the standard
    # ResNet/CIFAR schedule scaled to 164 epochs; confirm against the task spec.
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        # NOTE(review): local_model was already moved via .to(current_device)
        # above, so this .cuda() on the DDP wrapper appears redundant —
        # presumably harmless, but verify it does not re-register buffers.
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    # Each rank sees a disjoint shard of both datasets.
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    # CheckpointFreq.NONE: checkpointer.save() below decides what to keep
    # (only called with is_best flag after validation).
    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir,
                                rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal()
        else:
            goal = task1_time_to_accuracy_goal()

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        # Synchronize all workers so the timing clock starts together.
        dist.barrier()

        tracker.start()

        for epoch in range(0, train_epochs):
            model.train()
            tracker.train()

            # Wrapper that casts batches to fp32 and moves them to GPU.
            data_iter = iterate_dataloader(train_loader,
                                           dtype="fp32",
                                           use_cuda=use_cuda)
            num_batches_per_device_train = len(train_loader)

            for batch_idx, (data, target) in enumerate(data_iter):
                # Each record_batch_* call below timestamps the preceding
                # phase for the tracker's compute/communication breakdown,
                # so the call order must mirror the actual work order.
                tracker.batch_start()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step()
                tracker.record_batch_opt_step()

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )

                tracker.record_batch_comp_metrics()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            tracker.epoch_end()
            # Full validation pass at the end of every epoch.
            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype="fp32",
                tracker=tracker,
                use_cuda=use_cuda,
            )

            # MultiStepLR steps on epoch count, not on validation loss.
            scheduler.step()
            # Record validation stats
            is_best = record_validation_stats(metrics_values=metrics_values,
                                              loss=loss,
                                              tracker=tracker,
                                              rank=rank)

            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)

            if tracker.goal_reached:
                print("Goal Reached!")
                # NOTE(review): a sibling variant of this loop calls
                # dist.barrier() before sleeping/returning here; confirm
                # whether skipping it can leave other ranks blocked.
                time.sleep(10)
                return

    else:
        # Validation-only mode: replay every saved checkpoint and compute
        # metrics over the train and validation sets.
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
示例#9
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
    seed=42,
):
    """Train loop"""
    train_epochs = 10

    math_mode = "fp16"
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Dataset arguments
    train_global_batch_size = 2**17  # Global batch size
    max_bs = 2**13  # Max batch size for used hardware
    update_freq = int(max(1, train_global_batch_size // (max_bs * world_size)))
    max_tokens = int(train_global_batch_size // (world_size * update_freq))

    max_source_positions, max_target_positions = 80, 80
    seq_len_multiple = 2
    left_pad = (True, False)
    lang = ("en", "de")

    # specific arch
    model_args = deepcopy(DEFAULT_TRANSFORMER_ARCH)
    model_args["max_source_positions"] = max_source_positions
    model_args["max_target_positions"] = max_target_positions
    model_args["share_all_embeddings"] = True
    model_args["dropout"] = 0.1
    model_args["softmax_type"] = "fast_fill"

    lr = 1.976e-3
    optimizer_args = {
        "lr": lr,
        "eps": 1e-9,
        "betas": (0.9, 0.98),
    }
    scheduler_args = {
        "base_lr": lr,
        "warmup_init_lr": 0.0,
        "warmup_steps": 1000
    }

    loss_scaling_fp16 = {
        "init_scale": 2.0**7,
        "scale_factor": 2,
        "scale_window": 2000,
    }

    criterion_args = {"smoothing": 0.1, "fast_xentropy": True}

    # Horovod stuff
    use_horovod = (math_mode
                   == "fp16") and dist.get_backend() == dist.Backend.MPI
    if use_horovod:
        hvd.init()
        logger.info("Using horovod rank={}".format(hvd.rank()))
        tensor = torch.tensor([1])
        res = hvd.allreduce(tensor, op=hvd.Sum)
        assert res[0] == world_size

    # Load train and validation datasets
    train_set = WMT17Dataset(
        dataset_dir,
        download=True,
        train=True,
        shuffle=True,
        lang=lang,
        left_pad=left_pad,
        max_positions=(max_source_positions, max_target_positions),
        seq_len_multiple=seq_len_multiple,
    )

    validation_set = WMT17Dataset(
        dataset_dir,
        download=False,
        test=True,
        shuffle=True,
        lang=lang,
        left_pad=left_pad,
        max_positions=(max_source_positions, max_target_positions),
        seq_len_multiple=seq_len_multiple,
    )
    src_dict, trg_dict = train_set.src_dict, train_set.trg_dict

    train_batches = get_batches(train_set,
                                max_tokens=max_tokens,
                                bsz_mult=8,
                                shuffle=True,
                                seed=seed)
    val_batches = get_batches(validation_set,
                              max_tokens=max_tokens,
                              bsz_mult=8,
                              shuffle=False)

    train_batches = equalize_batches(train_batches, world_size, seed=seed)

    # Partition by rank
    train_batches = partition_dataset_by_rank(train_batches, rank, world_size)
    val_batches = partition_dataset_by_rank(val_batches, rank, world_size)

    total_train_points = sum(len(b) for b in train_batches)

    validate_every = update_freq * round(
        len(train_batches) * 0.30 / update_freq)  # Validate every 30%

    assert (validate_every % update_freq) == 0
    logger.info("Using {} total train points, {} batches".format(
        total_train_points, len(train_batches)))

    train_loader = DataLoader(
        train_set,
        num_workers=1,
        pin_memory=False,
        collate_fn=train_set.collater,
        batch_sampler=train_batches,
    )

    val_loader = DataLoader(
        validation_set,
        num_workers=1,
        pin_memory=False,
        collate_fn=validation_set.collater,
        batch_sampler=val_batches,
    )

    model = TransformerModel(Arguments(model_args), src_dict, trg_dict)
    criterion = LabelSmoothing(padding_idx=src_dict.pad(), **criterion_args)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    fp_optimizer, optimizer, model = build_optimizer(
        model,
        optimizer_args,
        math_mode=math_mode,
        scaling_args=loss_scaling_fp16,
        use_horovod=use_horovod,
        use_cuda=use_cuda,
    )

    scheduler = SQRTTimeDecayLRWithWarmup(optimizer, **scheduler_args)

    metrics = [BLEUScore(use_raw=True)]
    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir,
                                rank=rank,
                                freq=CheckpointFreq.BEST)

    translator = SequenceGenerator(
        model,
        src_dict=deepcopy(src_dict),
        trg_dict=deepcopy(trg_dict),
        beam_size=4,
        stop_early=True,
        normalize_scores=True,
        len_penalty=0.6,
        sampling=False,
        sampling_topk=-1,
        minlen=1,
    )
    if not validation_only:

        if light_target:
            goal = task4_time_to_bleu_goal(20)
        else:
            goal = task4_time_to_bleu_goal(25)

        num_batches_per_device_train = len(train_loader)
        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            model.train()
            tracker.train()

            iter_sample_size = 0
            for batch_idx, sample in enumerate(train_loader):
                tracker.batch_start()

                sample = prepare_batch(sample, use_cuda=use_cuda)
                tracker.record_batch_load()

                is_last = batch_idx == len(train_loader)
                update = (batch_idx % update_freq) == update_freq - 1
                init = (batch_idx % update_freq) == 0

                # Clear gradients in the optimizer.
                if init:
                    fp_optimizer.zero_grad()
                    iter_sample_size = 0
                    tracker.record_batch_init()

                # Compute the output
                output = model(**sample["net_input"])
                tracker.record_batch_fwd_pass()

                loss, sample_size = compute_loss(sample, output, criterion)
                loss_per_sample = loss.item() / sample_size
                iter_sample_size += sample_size
                tracker.record_batch_comp_loss()

                # Backprop
                fp_optimizer.backward_loss(loss)
                tracker.record_batch_backprop()

                if update or is_last:
                    # Get batch size over all workers
                    full_bs = get_full_batch_size(iter_sample_size,
                                                  world_size=world_size,
                                                  use_cuda=use_cuda)

                    updated = opt_step(
                        fp_optimizer,
                        tracker,
                        full_bs,
                        update_freq,
                        math_mode,
                        world_size,
                    )

                    if updated:
                        scheduler.step()

                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx=batch_idx,
                    loss=loss_per_sample,
                    output=torch.Tensor([0]),
                    metric_results={},
                    tracker=tracker,
                    num_batches_per_device_train=num_batches_per_device_train,
                )

                if (batch_idx + 1) % validate_every == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    metric_values, loss = validation_round(
                        val_loader,
                        metrics,
                        criterion,
                        translator,
                        tracker=tracker,
                        use_cuda=use_cuda,
                    )
                    record_validation_stats(metric_values, loss, tracker, rank)
                    if tracker.goal_reached:
                        break

                    model.train()
                    tracker.train()

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            metric_values, loss = validation_round(
                val_loader,
                metrics,
                criterion,
                translator,
                tracker=tracker,
                use_cuda=use_cuda,
            )
            is_best = record_validation_stats(metric_values, loss, tracker,
                                              rank)
            checkpointer.save(
                tracker,
                model,
                optimizer,
                scheduler,
                tracker.current_epoch,
                is_best,
            )
            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=criterion,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)