Example #1
def train_epoch(model,
                training_data,
                optimizer,
                device,
                epoch,
                tb=None,
                log_interval=100):
    model.train()

    total_loss = 0
    n_char_total = 0
    n_char_correct = 0

    for batch_idx, batch in enumerate(
            tqdm(training_data, mininterval=2, leave=False)):
        batch_qs, batch_qs_pos, batch_as, batch_as_pos = map(
            lambda x: x.to(device), batch)
        gold_as = batch_as[:, 1:]

        optimizer.zero_grad()

        pred_as = model(batch_qs, batch_qs_pos, batch_as, batch_as_pos)

        loss, n_correct = compute_performance(pred_as, gold_as, smoothing=True)
        loss.backward()

        # update parameters
        optimizer.step()

        # bookkeeping
        total_loss += loss.item()

        non_pad_mask = gold_as.ne(Constants.PAD)
        n_char = non_pad_mask.sum().item()
        n_char_total += n_char
        n_char_correct += n_correct

        if tb is not None and batch_idx % log_interval == 0:
            tb.add_scalars(
                {
                    "loss_per_char": total_loss / n_char_total,
                    "accuracy": n_char_correct / n_char_total,
                },
                group="train",
                sub_group="batch",
                global_step=epoch * len(training_data) + batch_idx)

    loss_per_char = total_loss / n_char_total
    accuracy = n_char_correct / n_char_total

    if tb is not None:
        tb.add_scalars({
            "loss_per_char": loss_per_char,
            "accuracy": accuracy,
        },
                       group="train",
                       sub_group="epoch",
                       global_step=epoch)

    return loss_per_char, accuracy
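
Both the loss and the character-level accuracy above come from compute_performance, which is a project helper not shown in these examples. As a rough sketch only, assuming the usual setup of summed cross-entropy over non-PAD target characters with optional label smoothing (the helper name, the PAD index, and the smoothing value below are assumptions, not the project's actual implementation):

import torch
import torch.nn.functional as F

PAD = 0  # assumed padding index; the examples use Constants.PAD


def compute_performance_sketch(pred, gold, smoothing=False, eps=0.1):
    # pred: (batch * seq_len, vocab) logits; gold: (batch, seq_len) target indices.
    # Returns the loss summed over non-PAD characters and the count of correctly
    # predicted non-PAD characters, matching how the callers above accumulate
    # total_loss and n_char_correct.
    gold = gold.contiguous().view(-1)
    non_pad_mask = gold.ne(PAD)

    if smoothing:
        # Label smoothing: move eps of the probability mass off the gold class.
        n_class = pred.size(1)
        one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
        log_prob = F.log_softmax(pred, dim=1)
        loss = -(one_hot * log_prob).sum(dim=1)
        loss = loss.masked_select(non_pad_mask).sum()
    else:
        loss = F.cross_entropy(pred, gold, ignore_index=PAD, reduction="sum")

    n_correct = pred.max(dim=1)[1].eq(gold).masked_select(non_pad_mask).sum().item()
    return loss, n_correct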
Example #2
def eval_epoch(model,
               validation_data,
               device,
               graph_pool,
               epoch,
               tb=None,
               log_interval=100):
    model.eval()

    total_loss = 0
    n_char_total = 0
    n_char_correct = 0

    with torch.no_grad():
        for batch_idx, batch in enumerate(
                tqdm(validation_data, mininterval=2, leave=False)):
            # prepare data: each batch already holds the gold answers and a
            # pre-built batched graph (graph_pool stays in the signature but
            # is not used in this body)
            gold_as, g = batch
            gold_as = gold_as.to(device)
            g = graph_to_device(g, device)

            # forward
            pred_as = model(g)
            loss, n_correct = compute_performance(pred_as,
                                                  gold_as,
                                                  smoothing=False)

            # bookkeeping
            total_loss += loss.item()

            non_pad_mask = gold_as.ne(Constants.PAD)
            n_char = non_pad_mask.sum().item()
            n_char_total += n_char
            n_char_correct += n_correct

    loss_per_char = total_loss / n_char_total
    accuracy = n_char_correct / n_char_total

    if tb is not None:
        tb.add_scalars({
            "loss_per_char": loss_per_char,
            "accuracy": accuracy,
        },
                       group="eval",
                       sub_group="epoch",
                       global_step=epoch)

    return loss_per_char, accuracy
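
graph_to_device is another project helper that is not shown here. If the batched graphs were, say, DGL graphs (an assumption; the real helper may do more work), a minimal stand-in could be:

def graph_to_device(g, device):
    # DGLGraph.to returns a copy of the graph with its structure and its
    # node/edge features placed on the requested device.
    return g.to(device)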
Example #3
def inference_epoch(model,
                    data,
                    device,
                    epoch,
                    group,
                    tb=None,
                    log_interval=100):
    model.eval()

    total_loss = 0
    n_char_total = 0
    n_char_correct = 0

    with torch.no_grad():
        for batch_idx, batch in enumerate(
                tqdm(data, mininterval=2, leave=False)):
            # prepare data
            batch_qs, batch_qs_pos, batch_as, batch_as_pos = map(
                lambda x: x.to(device), batch)
            gold_as = batch_as[:, 1:]

            # forward
            pred_as = model(batch_qs, batch_qs_pos, batch_as, batch_as_pos)
            loss, n_correct = compute_performance(pred_as,
                                                  gold_as,
                                                  smoothing=False)

            # bookkeeping
            total_loss += loss.item()

            non_pad_mask = gold_as.ne(Constants.PAD)
            n_char = non_pad_mask.sum().item()
            n_char_total += n_char
            n_char_correct += n_correct

    loss_per_char = total_loss / n_char_total
    accuracy = n_char_correct / n_char_total

    if tb is not None:
        tb.add_scalars(
            {
                "loss_per_char": loss_per_char,
                "accuracy": accuracy
            },
            group=group,
            sub_group="epoch",
            global_step=epoch,
        )

    return loss_per_char, accuracy
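
inference_epoch is essentially eval_epoch without the graph batching, except that the caller picks the logging group, so one function can score several held-out splits. A possible wrapper around it (the split names and the loaders dict are assumptions for illustration):

def run_inference_splits(model, loaders, device, epoch, tb=None):
    # loaders maps a split name (e.g. "interpolate", "extrapolate") to its
    # DataLoader; each split is logged under its own TensorBoard group.
    results = {}
    for group, data in loaders.items():
        results[group] = inference_epoch(model, data, device, epoch, group, tb=tb)
    return results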
Example #4
def train_epoch(
    model,
    name,
    training_data,
    optimizer,
    device,
    epoch,
    tb=None,
    log_interval=100,
    max_batches=None,
    run_batch_count=0,
    start_batch=0,
    total_loss=0,
    n_char_total=0,
    n_char_correct=0,
    lr=None,
    warmup_lr=None,
    warmup_interval=None,
    smoothing=False,
):

    training_iter = iter(training_data)

    if start_batch > 0:
        last_question = np_encode_string(training_data.dataset[-1]["q"])
        print(f"Final question before checkpoint was {last_question}")

    model.train()
    # interrupted_batch = None
    done = False

    loss_per_char = 0
    accuracy = 0

    for batch_idx, batch in enumerate(training_iter, start=start_batch):
        if utils.is_preempted():
            print("Exiting...")
            sys.exit(0)

        if warmup_interval is not None and batch_idx == warmup_interval:
            print(
                f"End of warmup. Swapping learning rates from {warmup_lr} to {lr}"
            )
            # warmup is over: record the real lr so later checkpoints store it
            warmup_lr = lr
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr

        batch_qs, batch_qs_pos, batch_as, batch_as_pos = map(
            lambda x: x.to(device), batch)

        gold_as = batch_as[:, 1:]

        optimizer.zero_grad()
        pred_as = model(batch_qs, batch_qs_pos, batch_as, batch_as_pos)

        loss, n_correct = compute_performance(pred_as,
                                              gold_as,
                                              smoothing=smoothing)

        loss.backward()

        # Clip gradients, paper uses 0.1
        clip_grad_value_(model.parameters(), 0.1)

        # update parameters
        optimizer.step()

        # bookkeeping
        total_loss += loss.item()

        non_pad_mask = gold_as.ne(Constants.PAD)
        n_char = non_pad_mask.sum().item()
        n_char_total += n_char
        # guard against division by zero in the per-batch statistics
        n_char = n_char if n_char > 1 else 1

        batch_loss = loss.item() / n_char
        loss_per_char = total_loss / n_char_total

        n_char_correct += n_correct
        batch_acc = n_correct / n_char
        accuracy = n_char_correct / n_char_total
        print(
            f"Batch: {batch_idx}. Acc: {accuracy:.6f}. Loss: {loss_per_char:.6f}. Batch_acc: {batch_acc:.6f}. Batch_loss: {batch_loss:.6f} "
        )

        # TODO: automatically trim the TB logs that go beyond the preempted checkpoint
        if tb is not None and batch_idx % log_interval == 0:
            tb.add_scalars(
                {
                    "loss_per_char": loss_per_char,
                    "accuracy": accuracy,
                    "batch_loss": batch_loss,
                    "batch_acc": batch_acc,
                },
                group="train",
                sub_group="batch",
                global_step=run_batch_count,
            )

        run_batch_count += 1

        if max_batches is not None and run_batch_count == max_batches:
            print(
                f"Reached {run_batch_count} batches on max_batches of {max_batches}. Breaking from epoch."
            )
            # interrupted_batch = batch_idx
            done = True
            break

        # checkpoint roughly every 250 batches
        if batch_idx % 251 == 0 and batch_idx != 0:
            print(
                f"Checkpointing on batch: {batch_idx}. Accuracy: {accuracy}. Loss per char: {loss_per_char}. Time: {time.time()}"
            )
            print(f"Last question is {batch_qs[-1]}")

            state = build_checkpoint(
                name=name,
                model=model,
                optimizer=optimizer,
                acc=accuracy,
                loss=loss_per_char,
                epoch=epoch,
                run_batches=run_batch_count,
                start_batch=batch_idx + 1,
                total_loss=total_loss,
                n_char_total=n_char_total,
                n_char_correct=n_char_correct,
                lr=warmup_lr,
            )

            save_checkpoint(state=state,
                            name=f"{name}_latest_checkpoint",
                            path="./checkpoints")

        # if utils.is_preempted():
        #     print(
        #         f"Preemption at end of Epoch batch: {batch_idx} and new Run batch: {run_batch_count}. Breaking from epoch."
        #     )
        #     interrupted_batch = batch_idx
        #     break

    if tb is not None and not utils.is_preempted():
        tb.add_scalars(
            {
                "loss_per_char": loss_per_char,
                "accuracy": accuracy
            },
            group="train",
            sub_group="epoch",
            global_step=epoch,
        )

    return loss_per_char, accuracy, run_batch_count, done
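
This last variant of train_epoch is built to survive preemption: it periodically snapshots its running statistics and returns enough state (run_batch_count, done) for the caller to resume mid-epoch. build_checkpoint and save_checkpoint are project helpers that are not shown; a minimal sketch of what they might do, with the key names and file layout as assumptions:

import os
import torch


def build_checkpoint_sketch(name, model, optimizer, acc, loss, epoch,
                            run_batches, start_batch, total_loss,
                            n_char_total, n_char_correct, lr):
    # Pack everything the preemptible train_epoch needs to pick up where it
    # left off: weights, optimizer state, and the running loss/accuracy sums.
    return {
        "name": name,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "acc": acc,
        "loss": loss,
        "epoch": epoch,
        "run_batches": run_batches,
        "start_batch": start_batch,
        "total_loss": total_loss,
        "n_char_total": n_char_total,
        "n_char_correct": n_char_correct,
        "lr": lr,
    }


def save_checkpoint_sketch(state, name, path="./checkpoints"):
    # Write the snapshot under a fixed name so the resume logic always finds
    # the latest one.
    os.makedirs(path, exist_ok=True)
    torch.save(state, os.path.join(path, f"{name}.pth"))

On restart, the caller would load that file with torch.load, restore the model and optimizer state dicts, and pass run_batches, start_batch, total_loss, n_char_total and n_char_correct back into train_epoch so the running averages continue instead of resetting.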