Example #1
def val_step(model, val_loader, device, num_batches=None,
             log_interval: int = 100):

    """
    Performs one step of validation: runs the forward pass and returns IoU/GIoU metrics.
    Args:
        model : PyTorch FasterRCNN model.
        val_loader : Validation loader.
        device : "cuda" or "cpu".
        num_batches : (optional) Integer to limit validation to a certain number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
    """

    model = model.to(device)
    start_val_step = time.time()
    last_idx = len(val_loader) - 1
    batch_time_m = utils.AverageMeter()
    cnt = 0
    model.eval()
    batch_start = time.time()
    metrics = OrderedDict()
    iou_vals, giou_vals = [], []

    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            last_batch = batch_idx == last_idx
            images = list(image.to(device) for image in inputs)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            out = model(images)
            iou = torch.stack([_evaluate_iou(t, o) for t, o in zip(targets, out)]).mean()
            giou = torch.stack([_evaluate_giou(t, o) for t, o in zip(targets, out)]).mean()
            iou_vals.append(iou)
            giou_vals.append(giou)

            cnt += 1
            batch_time_m.update(time.time() - batch_start)
            batch_start = time.time()

            if last_batch or batch_idx % log_interval == 0:  # if we reach the log interval
                print("Batch Validation Time: {batch_time.val:.3f} ({batch_time.avg:.3f})  ".format(
                      batch_time=batch_time_m,))

            if num_batches is not None:
                if cnt >= num_batches:
                    metrics["iou"] = torch.stack(iou_vals).mean()
                    metrics["giou"] = torch.stack(giou_vals).mean()
                    print(f"Finished {num_batches} validation batches")
                    end_val_step = time.time()
                    print(f"Time taken for validation step = {end_val_step - start_val_step} sec")
                    return metrics

    metrics["iou"] = torch.stack(iou_vals).mean()
    metrics["giou"] = torch.stack(giou_vals).mean()

    end_val_step = time.time()
    print(f"Time taken for validation step = {end_val_step - start_val_step} sec")
    return metrics
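
The example above relies on helpers that are not shown: `utils.AverageMeter` and the `_evaluate_iou`/`_evaluate_giou` functions. Below is a minimal sketch of what they could look like, built on `torchvision.ops`; this is a hypothetical reconstruction, not the original code, and it assumes predictions and targets follow the torchvision detection dict format ({"boxes": Tensor[N, 4], ...}).

import torch
from torchvision.ops import box_iou, generalized_box_iou

class AverageMeter:
    """Tracks the most recent value and the running average (timm-style)."""
    def __init__(self):
        self.val, self.avg, self.sum, self.count = 0.0, 0.0, 0.0, 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def _evaluate_iou(target, pred):
    # Mean over ground-truth boxes of the best IoU any prediction achieves.
    if pred["boxes"].numel() == 0:  # no predictions -> zero overlap
        return torch.tensor(0.0, device=target["boxes"].device)
    return box_iou(target["boxes"], pred["boxes"]).max(dim=1).values.mean()

def _evaluate_giou(target, pred):
    # Same idea with generalized IoU, which ranges over [-1, 1].
    if pred["boxes"].numel() == 0:
        return torch.tensor(-1.0, device=target["boxes"].device)
    return generalized_box_iou(target["boxes"], pred["boxes"]).max(dim=1).values.mean()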
Example #2
def train_step(
    model: nn.Module,
    train_loader,
    device: str,
    optimizer,
    scheduler=None,
    num_batches: int = None,
    log_interval: int = 100,
    scaler=None,
):
    """
    Performs one step of training: computes loss via the forward pass, backpropagates, and returns metrics.
    Args:
        model : PyTorch Faster RCNN model.
        train_loader : Train loader.
        device : "cuda" or "cpu".
        optimizer : Torch optimizer to train.
        scheduler : Learning rate scheduler.
        num_batches : (optional) Integer to limit training to a certain number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
        scaler : (optional) Pass torch.cuda.amp.GradScaler() for fp16 precision training.
    """

    model = model.to(device)
    start_train_step = time.time()

    model.train()
    last_idx = len(train_loader) - 1
    batch_time_m = utils.AverageMeter()
    cnt = 0
    batch_start = time.time()
    metrics = OrderedDict()

    total_loss = utils.AverageMeter()
    loss_classifier = utils.AverageMeter()
    loss_box_reg = utils.AverageMeter()
    loss_objectness = utils.AverageMeter()
    loss_rpn_box_reg = utils.AverageMeter()

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        last_batch = batch_idx == last_idx
        images = list(image.to(device) for image in inputs)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # zero the parameter gradients
        optimizer.zero_grad()
        if scaler is not None:
            # Run only the forward pass under autocast; the backward pass and
            # optimizer step happen outside the autocast context, as recommended.
            with amp.autocast():
                loss_dict = model(images, targets)
                loss = sum(loss_v for loss_v in loss_dict.values())
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss_dict = model(images, targets)
            loss = sum(loss_v for loss_v in loss_dict.values())
            loss.backward()
            optimizer.step()

        if scheduler is not None:
            scheduler.step()

        cnt += 1

        total_loss.update(loss.item())
        loss_classifier.update(loss_dict["loss_classifier"].item())
        loss_box_reg.update(loss_dict["loss_box_reg"].item())
        loss_objectness.update(loss_dict["loss_objectness"].item())
        loss_rpn_box_reg.update(loss_dict["loss_rpn_box_reg"].item())

        batch_time_m.update(time.time() - batch_start)
        batch_start = time.time()
        if last_batch or batch_idx % log_interval == 0:  # if we reach the log interval
            print(
                "Batch Train Time: {batch_time.val:.3f} ({batch_time.avg:.3f})  "
                "Loss: {loss.val:>7.4f} ({loss.avg:>7.4f})".format(
                    batch_time=batch_time_m, loss=total_loss))

        if num_batches is not None:
            if cnt >= num_batches:
                end_train_step = time.time()
                metrics["total_loss"] = total_loss.avg
                metrics["loss_classifier"] = loss_classifier.avg
                metrics["loss_box_reg"] = loss_box_reg.avg
                metrics["loss_objectness"] = loss_objectness.avg
                metrics["loss_rpn_box_reg"] = loss_rpn_box_reg.avg

                print(f"Done till {num_batches} train batches")
                print(
                    f"Time taken for Training step = {end_train_step - start_train_step} sec"
                )
                return metrics

    end_train_step = time.time()
    metrics["total_loss"] = total_loss.avg
    metrics["loss_classifier"] = loss_classifier.avg
    metrics["loss_box_reg"] = loss_box_reg.avg
    metrics["loss_objectness"] = loss_objectness.avg
    metrics["loss_rpn_box_reg"] = loss_rpn_box_reg.avg
    print(
        f"Time taken for Training step = {end_train_step - start_train_step} sec"
    )
    return metrics
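
A hypothetical way to drive this `train_step`; the toy dataset, model choice, and collate function below are illustrative assumptions, not part of the example:

import torch
import torchvision

class ToyDetectionDataset(torch.utils.data.Dataset):
    """Synthetic (image, target) pairs in torchvision detection format."""
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        image = torch.rand(3, 224, 224)
        target = {"boxes": torch.tensor([[10.0, 10.0, 100.0, 100.0]]),
                  "labels": torch.tensor([1])}
        return image, target

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(num_classes=2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9)
# Detection loaders need a collate_fn that keeps variable-sized targets grouped.
train_loader = torch.utils.data.DataLoader(
    ToyDetectionDataset(), batch_size=4,
    collate_fn=lambda batch: tuple(zip(*batch)))
device = "cuda" if torch.cuda.is_available() else "cpu"
scaler = torch.cuda.amp.GradScaler() if device == "cuda" else None
metrics = train_step(model, train_loader, device, optimizer,
                     log_interval=1, scaler=scaler)
print(metrics["total_loss"])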
Example #3
def train_step(
    model,
    train_loader,
    criterion,
    device,
    optimizer,
    scheduler=None,
    num_batches: int = None,
    log_interval: int = 100,
    grad_penalty: bool = False,
    scaler=None,
):
    """
    Performs one step of training: computes loss via the forward pass, backpropagates, and returns metrics.
    Args:
        model : A PyTorch CNN model.
        train_loader : Train loader.
        criterion : Loss function to be optimized.
        device : "cuda" or "cpu".
        optimizer : Torch optimizer to train.
        scheduler : Learning rate scheduler.
        num_batches : (optional) Integer to limit training to a certain number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
        grad_penalty : (optional) If True, add the L2 norm of the gradients to the loss as a penalty.
        scaler : (optional) Pass torch.cuda.amp.GradScaler() for fp16 precision training.
    """

    model = model.to(device)
    start_train_step = time.time()
    metrics = OrderedDict()
    model.train()
    last_idx = len(train_loader) - 1
    batch_time_m = utils.AverageMeter()
    # data_time_m = utils.AverageMeter()
    losses_m = utils.AverageMeter()
    top1_m = utils.AverageMeter()
    top5_m = utils.AverageMeter()
    cnt = 0
    batch_start = time.time()
    # num_updates = epoch * len(loader)

    for batch_idx, (inputs, target) in enumerate(train_loader):
        last_batch = batch_idx == last_idx
        # data_time_m.update(time.time() - batch_start)
        inputs = inputs.to(device)
        target = target.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        if scaler is not None:
            with amp.autocast():
                output = model(inputs)
                loss = criterion(output, target)

            if grad_penalty:
                # Scales the loss for autograd.grad's backward pass, resulting in scaled grad_params
                scaled_grad_params = torch.autograd.grad(scaler.scale(loss),
                                                         model.parameters(),
                                                         create_graph=True)
                # Creates unscaled grad_params before computing the penalty. scaled_grad_params are
                # not owned by any optimizer, so ordinary division is used instead of scaler.unscale_:
                inv_scale = 1.0 / scaler.get_scale()
                grad_params = [p * inv_scale for p in scaled_grad_params]
                # Computes the penalty term and adds it to the loss
                with amp.autocast():
                    grad_norm = 0
                    for grad in grad_params:
                        grad_norm += grad.pow(2).sum()

                    grad_norm = grad_norm.sqrt()
                    loss = loss + grad_norm

            scaler.scale(loss).backward()
            # Step using scaler.step()
            scaler.step(optimizer)
            # Update for next iteration
            scaler.update()

        else:
            output = model(inputs)
            loss = criterion(output, target)

            if grad_penalty:
                # Create gradients
                grad_params = torch.autograd.grad(loss,
                                                  model.parameters(),
                                                  create_graph=True)
                # Compute the L2 Norm as penalty and add that to loss
                grad_norm = 0
                for grad in grad_params:
                    grad_norm += grad.pow(2).sum()
                grad_norm = grad_norm.sqrt()
                loss = loss + grad_norm

            loss.backward()
            optimizer.step()

        if scheduler is not None:
            scheduler.step()

        cnt += 1
        acc1, acc5 = accuracy(output, target, topk=(1, 5))

        top1_m.update(acc1.item(), output.size(0))
        top5_m.update(acc5.item(), output.size(0))
        losses_m.update(loss.item(), inputs.size(0))

        batch_time_m.update(time.time() - batch_start)
        batch_start = time.time()
        if last_batch or batch_idx % log_interval == 0:  # if we reach the log interval
            print(
                "Batch Train Time: {batch_time.val:.3f} ({batch_time.avg:.3f})  "
                "Loss: {loss.val:>7.4f} ({loss.avg:>6.4f})  "
                "Top 1 Accuracy: {top1.val:>7.4f} ({top1.avg:>7.4f})  "
                "Top 5 Accuracy: {top5.val:>7.4f} ({top5.avg:>7.4f})".format(
                    batch_time=batch_time_m,
                    loss=losses_m,
                    top1=top1_m,
                    top5=top5_m))

        if num_batches is not None:
            if cnt >= num_batches:
                end_train_step = time.time()
                metrics["loss"] = losses_m.avg
                metrics["top1"] = top1_m.avg
                metrics["top5"] = top5_m.avg
                print(f"Done till {num_batches} train batches")
                print(
                    f"Time taken for train step = {end_train_step - start_train_step} sec"
                )
                return metrics

    metrics["loss"] = losses_m.avg
    metrics["top1"] = top1_m.avg
    metrics["top5"] = top5_m.avg
    end_train_step = time.time()
    print(
        f"Time taken for train step = {end_train_step - start_train_step} sec")
    return metrics
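
A hypothetical usage sketch with a small classifier and `torchvision.datasets.FakeData`; the concrete model and hyperparameters are illustrative, and it assumes the `accuracy` helper (timm-style top-k accuracy) used by the example is in scope:

import torch
import torch.nn as nn
import torchvision
from torchvision import transforms

model = torchvision.models.resnet18(num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
train_loader = torch.utils.data.DataLoader(
    torchvision.datasets.FakeData(size=64, num_classes=10,
                                  transform=transforms.ToTensor()),
    batch_size=16, shuffle=True)
# grad_penalty=True exercises the gradient-penalty branch (works on CPU too).
metrics = train_step(model, train_loader, criterion, "cpu", optimizer,
                     num_batches=2, log_interval=1, grad_penalty=True)
print(metrics["loss"], metrics["top1"])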
Example #4
def val_step(model,
             val_loader,
             criterion,
             device,
             num_batches=None,
             log_interval: int = 100):
    """
    Performs one step of validation: computes loss via the forward pass and returns metrics.
    Args:
        model : A PyTorch CNN model.
        val_loader : Validation loader.
        criterion : Loss function to be optimized.
        device : "cuda" or "cpu".
        num_batches : (optional) Integer to limit validation to a certain number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
    """

    model = model.to(device)
    start_val_step = time.time()
    last_idx = len(val_loader) - 1
    batch_time_m = utils.AverageMeter()
    # data_time_m = utils.AverageMeter()
    losses_m = utils.AverageMeter()
    top1_m = utils.AverageMeter()
    top5_m = utils.AverageMeter()
    cnt = 0
    model.eval()
    batch_start = time.time()
    metrics = OrderedDict()
    with torch.no_grad():
        for batch_idx, (inputs, target) in enumerate(val_loader):
            last_batch = batch_idx == last_idx
            inputs = inputs.to(device)
            target = target.to(device)

            output = model(inputs)
            if isinstance(output, (tuple, list)):
                output = output[0]

            loss = criterion(output, target)
            cnt += 1
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses_m.update(loss.item(), inputs.size(0))
            top1_m.update(acc1.item(), output.size(0))
            top5_m.update(acc5.item(), output.size(0))
            batch_time_m.update(time.time() - batch_start)

            batch_start = time.time()

            if last_batch or batch_idx % log_interval == 0:  # if we reach the log interval
                print(
                    "Batch Inference Time: {batch_time.val:.3f} ({batch_time.avg:.3f})  "
                    "Loss: {loss.val:>7.4f} ({loss.avg:>6.4f})  "
                    "Top 1 Accuracy: {top1.val:>7.4f} ({top1.avg:>7.4f})  "
                    "Top 5 Accuracy: {top5.val:>7.4f} ({top5.avg:>7.4f})".
                    format(batch_time=batch_time_m,
                           loss=losses_m,
                           top1=top1_m,
                           top5=top5_m))

            if num_batches is not None:
                if cnt >= num_batches:
                    end_val_step = time.time()
                    metrics["loss"] = losses_m.avg
                    metrics["top1"] = top1_m.avg
                    metrics["top5"] = top5_m.avg
                    print(f"Done till {num_batches} validation batches")
                    print(
                        f"Time taken for validation step = {end_val_step - start_val_step} sec"
                    )
                    return metrics

        metrics["loss"] = losses_m.avg
        metrics["top1"] = top1_m.avg
        metrics["top5"] = top5_m.avg
        print("Finished the validation epoch")

    end_val_step = time.time()
    print(
        f"Time taken for validation step = {end_val_step - start_val_step} sec"
    )
    return metrics
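
Validation wiring follows the same pattern; a sketch under the same assumptions as the training example above (placeholder model and synthetic data):

import torch
import torch.nn as nn
import torchvision
from torchvision import transforms

model = torchvision.models.resnet18(num_classes=10)
criterion = nn.CrossEntropyLoss()
val_loader = torch.utils.data.DataLoader(
    torchvision.datasets.FakeData(size=32, num_classes=10,
                                  transform=transforms.ToTensor()),
    batch_size=16)
metrics = val_step(model, val_loader, criterion, "cpu",
                   num_batches=2, log_interval=1)
print(metrics["top5"])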
Example #5
def train_step(
    model: nn.Module,
    train_loader,
    criterion,
    device: str,
    optimizer,
    scheduler=None,
    num_batches: int = None,
    log_interval: int = 100,
    scaler=None,
):
    """
    Performs one step of training: computes loss via the forward pass, backpropagates, and returns metrics.
    Args:
        model : PyTorch DETR model.
        train_loader : Train loader.
        criterion : DETR loss function to be optimized.
        device : "cuda" or "cpu".
        optimizer : Torch optimizer to train.
        scheduler : Learning rate scheduler.
        num_batches : (optional) Integer to limit training to a certain number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
        scaler : (optional) Pass torch.cuda.amp.GradScaler() for fp16 precision training.
    """

    model = model.to(device)
    criterion = criterion.to(device)
    start_train_step = time.time()
    model.train()
    last_idx = len(train_loader) - 1
    batch_time_m = utils.AverageMeter()
    criterion.train()
    cnt = 0
    batch_start = time.time()
    metrics = OrderedDict()

    total_loss = utils.AverageMeter()
    bbox_loss = utils.AverageMeter()
    giou_loss = utils.AverageMeter()
    labels_loss = utils.AverageMeter()

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        last_batch = batch_idx == last_idx
        images = list(image.to(device) for image in inputs)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()

        if scaler is not None:
            # Run only the forward pass under autocast; the backward pass and
            # optimizer step happen outside the autocast context, as recommended.
            with amp.autocast():
                outputs = model(images)
                loss_dict = criterion(outputs, targets)
                weight_dict = criterion.weight_dict
                loss = sum(loss_dict[k] * weight_dict[k]
                           for k in loss_dict.keys() if k in weight_dict)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        else:
            outputs = model(images)
            loss_dict = criterion(outputs, targets)
            weight_dict = criterion.weight_dict
            loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys()
                       if k in weight_dict)
            loss.backward()
            optimizer.step()

        if scheduler is not None:
            scheduler.step()

        cnt += 1
        total_loss.update(loss.item())
        bbox_loss.update(loss_dict["loss_bbox"].item())
        giou_loss.update(loss_dict["loss_giou"].item())
        labels_loss.update(loss_dict["loss_ce"].item())

        batch_time_m.update(time.time() - batch_start)
        batch_start = time.time()

        if last_batch or batch_idx % log_interval == 0:  # if we reach the log interval
            print(
                "Batch Train Time: {batch_time.val:.3f} ({batch_time.avg:.3f})  "
                .format(batch_time=batch_time_m, ))

        if num_batches is not None:
            if cnt >= num_batches:
                end_train_step = time.time()
                metrics["total_loss"] = total_loss.avg
                metrics["bbox_loss"] = bbox_loss.avg
                metrics["giou_loss"] = giou_loss.avg
                metrics["labels_loss"] = labels_loss.avg

                print(f"Done till {num_batches} train batches")
                print(
                    f"Time taken for Training step = {end_train_step - start_train_step} sec"
                )
                return metrics

    end_train_step = time.time()
    metrics["total_loss"] = total_loss.avg
    metrics["bbox_loss"] = bbox_loss.avg
    metrics["giou_loss"] = giou_loss.avg
    metrics["labels_loss"] = labels_loss.avg
    print(
        f"Time taken for Training step = {end_train_step - start_train_step} sec"
    )
    return metrics
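
Wiring this up requires the DETR reference code: `SetCriterion` and `HungarianMatcher` live in the facebookresearch/detr repository (models/detr.py, models/matcher.py) and must be on the path, which is an assumption here; the data loader and class count are placeholders too:

import torch
from models.detr import SetCriterion          # from facebookresearch/detr
from models.matcher import HungarianMatcher   # from facebookresearch/detr

num_classes = 2
matcher = HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)
weight_dict = {"loss_ce": 1, "loss_bbox": 5, "loss_giou": 2}
criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict,
                         eos_coef=0.1, losses=["labels", "boxes", "cardinality"])
model = torch.hub.load("facebookresearch/detr", "detr_resnet50",
                       pretrained=False, num_classes=num_classes)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
# train_loader: a detection-style loader as in the Faster RCNN sketch above,
# but with DETR-format targets (normalized cxcywh boxes).
metrics = train_step(model, train_loader, criterion, "cuda", optimizer,
                     num_batches=10)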
Example #6
def val_step(model: nn.Module,
             val_loader,
             criterion,
             device,
             num_batches: int = None,
             log_interval: int = 100):
    """
    Performs one step of validation: computes loss via the forward pass and returns metrics.
    Args:
        model : PyTorch DETR model.
        val_loader : Validation loader.
        criterion : DETR loss function to be optimized.
        device : "cuda" or "cpu".
        num_batches : (optional) Integer to limit validation to a certain number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
    """

    model = model.to(device)
    start_val_step = time.time()
    last_idx = len(val_loader) - 1
    batch_time_m = utils.AverageMeter()
    cnt = 0
    model.eval()
    criterion.eval()
    batch_start = time.time()
    metrics = OrderedDict()

    total_loss = utils.AverageMeter()
    bbox_loss = utils.AverageMeter()
    giou_loss = utils.AverageMeter()
    labels_loss = utils.AverageMeter()

    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            last_batch = batch_idx == last_idx
            images = list(image.to(device) for image in inputs)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]

            outputs = model(images)
            loss_dict = criterion(outputs, targets)
            weight_dict = criterion.weight_dict
            loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys()
                       if k in weight_dict)

            cnt += 1
            total_loss.update(loss.item())
            bbox_loss.update(loss_dict["loss_bbox"].item())
            giou_loss.update(loss_dict["loss_giou"].item())
            labels_loss.update(loss_dict["loss_ce"].item())

            batch_time_m.update(time.time() - batch_start)
            batch_start = time.time()

            if last_batch or batch_idx % log_interval == 0:  # if we reach the log interval
                print(
                    "Batch Validation Time: {batch_time.val:.3f} ({batch_time.avg:.3f})  "
                    .format(batch_time=batch_time_m, ))

            if num_batches is not None:
                if cnt >= num_batches:
                    end_val_step = time.time()
                    metrics["total_loss"] = total_loss.avg
                    metrics["bbox_loss"] = bbox_loss.avg
                    metrics["giou_loss"] = giou_loss.avg
                    metrics["labels_loss"] = labels_loss.avg
                    print(f"Done till {num_batches} Validation batches")
                    print(
                        f"Time taken for validation step = {end_val_step - start_val_step} sec"
                    )
                    return metrics

    end_val_step = time.time()
    metrics["total_loss"] = total_loss.avg
    metrics["bbox_loss"] = bbox_loss.avg
    metrics["giou_loss"] = giou_loss.avg
    metrics["labels_loss"] = labels_loss.avg
    print(
        f"Time taken for validation step = {end_val_step - start_val_step} sec"
    )
    return metrics
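
Given the model/criterion wiring sketched after Example #5, a validation call might look like this (hypothetical; `val_loader` is a DETR-format loader of the same shape as the training one):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
metrics = val_step(model, val_loader, criterion, device,
                   num_batches=10, log_interval=1)
print(metrics["total_loss"], metrics["giou_loss"])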