import time
from collections import OrderedDict

import torch
import torch.nn as nn
from torch.cuda import amp

# `utils.AverageMeter`, `accuracy`, `_evaluate_iou` and `_evaluate_giou` are
# repo-internal helpers assumed to be importable in this module's scope.


def val_step(model, val_loader, device, num_batches=None, log_interval: int = 100):
    """
    Performs one step of validation. Runs a forward pass and returns IoU metrics.

    Args:
        model : PyTorch FasterRCNN model.
        val_loader : Validation loader.
        device : "cuda" or "cpu".
        num_batches : (optional) Integer to limit validation to a set number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
    """
    model = model.to(device)
    start_val_step = time.time()
    last_idx = len(val_loader) - 1
    batch_time_m = utils.AverageMeter()
    cnt = 0
    model.eval()
    batch_start = time.time()
    metrics = OrderedDict()
    # Accumulate per-batch IoU/GIoU so the final average covers every batch,
    # not just the last one.
    val_iou = []
    val_giou = []
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            last_batch = batch_idx == last_idx
            images = list(image.to(device) for image in inputs)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            out = model(images)
            iou = torch.stack([_evaluate_iou(t, o) for t, o in zip(targets, out)]).mean()
            giou = torch.stack([_evaluate_giou(t, o) for t, o in zip(targets, out)]).mean()
            val_iou.append(iou)
            val_giou.append(giou)
            cnt += 1
            batch_time_m.update(time.time() - batch_start)
            batch_start = time.time()
            if last_batch or batch_idx % log_interval == 0:  # reached the log interval
                print("Batch Validation Time: {batch_time.val:.3f} ({batch_time.avg:.3f})".format(
                    batch_time=batch_time_m))
            if num_batches is not None and cnt >= num_batches:
                metrics["iou"] = torch.stack(val_iou).mean()
                metrics["giou"] = torch.stack(val_giou).mean()
                print(f"Done after {num_batches} validation batches")
                end_val_step = time.time()
                print(f"Time taken for validation step = {end_val_step - start_val_step} sec")
                return metrics

    metrics["iou"] = torch.stack(val_iou).mean()
    metrics["giou"] = torch.stack(val_giou).mean()
    end_val_step = time.time()
    print(f"Time taken for validation step = {end_val_step - start_val_step} sec")
    return metrics

def train_step(
    model: nn.Module,
    train_loader,
    device: str,
    optimizer,
    scheduler=None,
    num_batches: int = None,
    log_interval: int = 100,
    scaler=None,
):
    """
    Performs one step of training. Runs a forward pass, computes the loss and
    gradients, and returns metrics.

    Args:
        model : PyTorch FasterRCNN model.
        train_loader : Train loader.
        device : "cuda" or "cpu".
        optimizer : Torch optimizer to train with.
        scheduler : (optional) Learning rate scheduler.
        num_batches : (optional) Integer to limit training to a set number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
        scaler : (optional) Pass torch.cuda.amp.GradScaler() for fp16 (mixed precision) training.
    """
    model = model.to(device)
    start_train_step = time.time()
    model.train()
    last_idx = len(train_loader) - 1
    batch_time_m = utils.AverageMeter()
    cnt = 0
    batch_start = time.time()
    metrics = OrderedDict()
    total_loss = utils.AverageMeter()
    loss_classifier = utils.AverageMeter()
    loss_box_reg = utils.AverageMeter()
    loss_objectness = utils.AverageMeter()
    loss_rpn_box_reg = utils.AverageMeter()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        last_batch = batch_idx == last_idx
        images = list(image.to(device) for image in inputs)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        # Zero the parameter gradients.
        optimizer.zero_grad()
        if scaler is not None:
            with amp.autocast():
                loss_dict = model(images, targets)
                loss = sum(loss_v for loss_v in loss_dict.values())
            scaler.scale(loss).backward()
            # Step using scaler.step()
            scaler.step(optimizer)
            # Update the scale for the next iteration.
            scaler.update()
        else:
            loss_dict = model(images, targets)
            loss = sum(loss_v for loss_v in loss_dict.values())
            loss.backward()
            optimizer.step()
        if scheduler is not None:
            scheduler.step()
        cnt += 1
        total_loss.update(loss.item())
        loss_classifier.update(loss_dict["loss_classifier"].item())
        loss_box_reg.update(loss_dict["loss_box_reg"].item())
        loss_objectness.update(loss_dict["loss_objectness"].item())
        loss_rpn_box_reg.update(loss_dict["loss_rpn_box_reg"].item())
        batch_time_m.update(time.time() - batch_start)
        batch_start = time.time()
        if last_batch or batch_idx % log_interval == 0:  # reached the log interval
            print(
                "Batch Train Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) "
                "Total Loss: {loss.val:>7.4f} ({loss.avg:>7.4f})".format(
                    batch_time=batch_time_m, loss=total_loss))
        if num_batches is not None and cnt >= num_batches:
            end_train_step = time.time()
            metrics["total_loss"] = total_loss.avg
            metrics["loss_classifier"] = loss_classifier.avg
            metrics["loss_box_reg"] = loss_box_reg.avg
            metrics["loss_objectness"] = loss_objectness.avg
            metrics["loss_rpn_box_reg"] = loss_rpn_box_reg.avg
            print(f"Done after {num_batches} train batches")
            print(f"Time taken for training step = {end_train_step - start_train_step} sec")
            return metrics

    end_train_step = time.time()
    metrics["total_loss"] = total_loss.avg
    metrics["loss_classifier"] = loss_classifier.avg
    metrics["loss_box_reg"] = loss_box_reg.avg
    metrics["loss_objectness"] = loss_objectness.avg
    metrics["loss_rpn_box_reg"] = loss_rpn_box_reg.avg
    print(f"Time taken for training step = {end_train_step - start_train_step} sec")
    return metrics
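
# A minimal usage sketch (not part of the library): wires the Faster RCNN
# train/val steps above to torchvision's pretrained detector. The dataset,
# `collate_fn` and hyperparameters here are illustrative assumptions.
import torchvision


def collate_fn(batch):
    # Detection loaders must yield lists of images/targets, not stacked tensors.
    return tuple(zip(*batch))


def fit_one_epoch(detection_dataset, device="cuda"):
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    loader = torch.utils.data.DataLoader(
        detection_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9)
    scaler = amp.GradScaler()  # optional; omit for full-precision training
    train_metrics = train_step(model, loader, device, optimizer, scaler=scaler)
    # A quick smoke-test validation limited to 10 batches of the same loader.
    val_metrics = val_step(model, loader, device, num_batches=10)
    return train_metrics, val_metrics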

def train_step(
    model,
    train_loader,
    criterion,
    device,
    optimizer,
    scheduler=None,
    num_batches: int = None,
    log_interval: int = 100,
    grad_penalty: bool = False,
    scaler=None,
):
    """
    Performs one step of training. Runs a forward pass, computes the loss and
    gradients, and returns metrics.

    Args:
        model : A PyTorch CNN model.
        train_loader : Train loader.
        criterion : Loss function to be optimized.
        device : "cuda" or "cpu".
        optimizer : Torch optimizer to train with.
        scheduler : (optional) Learning rate scheduler.
        num_batches : (optional) Integer to limit training to a set number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
        grad_penalty : (optional) If True, add the L2 norm of the gradients to the
            loss to penalize large gradients.
        scaler : (optional) Pass torch.cuda.amp.GradScaler() for fp16 (mixed precision) training.
    """
    model = model.to(device)
    start_train_step = time.time()
    metrics = OrderedDict()
    model.train()
    last_idx = len(train_loader) - 1
    batch_time_m = utils.AverageMeter()
    losses_m = utils.AverageMeter()
    top1_m = utils.AverageMeter()
    top5_m = utils.AverageMeter()
    cnt = 0
    batch_start = time.time()
    for batch_idx, (inputs, target) in enumerate(train_loader):
        last_batch = batch_idx == last_idx
        inputs = inputs.to(device)
        target = target.to(device)
        # Zero the parameter gradients.
        optimizer.zero_grad()
        if scaler is not None:
            with amp.autocast():
                output = model(inputs)
                loss = criterion(output, target)
            if grad_penalty is True:
                # Scale the loss for autograd.grad's backward pass,
                # producing scaled grad_params.
                scaled_grad_params = torch.autograd.grad(scaler.scale(loss),
                                                         model.parameters(),
                                                         create_graph=True)
                # Create unscaled grad_params before computing the penalty.
                # scaled_grad_params are not owned by any optimizer, so ordinary
                # division is used instead of scaler.unscale_:
                inv_scale = 1.0 / scaler.get_scale()
                grad_params = [p * inv_scale for p in scaled_grad_params]
                # Compute the penalty term and add it to the loss.
                with amp.autocast():
                    grad_norm = 0
                    for grad in grad_params:
                        grad_norm += grad.pow(2).sum()
                    grad_norm = grad_norm.sqrt()
                    loss = loss + grad_norm
            scaler.scale(loss).backward()
            # Step using scaler.step()
            scaler.step(optimizer)
            # Update the scale for the next iteration.
            scaler.update()
        else:
            output = model(inputs)
            loss = criterion(output, target)
            if grad_penalty is True:
                # Create gradients w.r.t. the parameters.
                grad_params = torch.autograd.grad(loss, model.parameters(),
                                                  create_graph=True)
                # Compute the L2 norm as a penalty and add it to the loss.
                grad_norm = 0
                for grad in grad_params:
                    grad_norm += grad.pow(2).sum()
                grad_norm = grad_norm.sqrt()
                loss = loss + grad_norm
            loss.backward()
            optimizer.step()
        if scheduler is not None:
            scheduler.step()
        cnt += 1
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        top1_m.update(acc1.item(), output.size(0))
        top5_m.update(acc5.item(), output.size(0))
        losses_m.update(loss.item(), inputs.size(0))
        batch_time_m.update(time.time() - batch_start)
        batch_start = time.time()
        if last_batch or batch_idx % log_interval == 0:  # reached the log interval
            print(
                "Batch Train Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) "
                "Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) "
                "Top 1 Accuracy: {top1.val:>7.4f} ({top1.avg:>7.4f}) "
                "Top 5 Accuracy: {top5.val:>7.4f} ({top5.avg:>7.4f})".format(
                    batch_time=batch_time_m, loss=losses_m, top1=top1_m, top5=top5_m))
        if num_batches is not None and cnt >= num_batches:
            end_train_step = time.time()
            metrics["loss"] = losses_m.avg
            metrics["top1"] = top1_m.avg
            metrics["top5"] = top5_m.avg
            print(f"Done after {num_batches} train batches")
            print(f"Time taken for train step = {end_train_step - start_train_step} sec")
            return metrics

    metrics["loss"] = losses_m.avg
    metrics["top1"] = top1_m.avg
    metrics["top5"] = top5_m.avg
    end_train_step = time.time()
    print(f"Time taken for train step = {end_train_step - start_train_step} sec")
    return metrics
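
# A minimal usage sketch (not part of the library): one call to the CNN
# train_step above with mixed precision and the gradient penalty enabled.
# The criterion, optimizer and learning rate are illustrative assumptions.
def train_with_grad_penalty(model, train_loader, device="cuda"):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    scaler = amp.GradScaler()  # enables the fp16 branch of train_step
    return train_step(model, train_loader, criterion, device, optimizer,
                      grad_penalty=True, scaler=scaler)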

def val_step(model, val_loader, criterion, device, num_batches=None,
             log_interval: int = 100):
    """
    Performs one step of validation. Runs a forward pass, computes the loss and
    returns metrics.

    Args:
        model : A PyTorch CNN model.
        val_loader : Validation loader.
        criterion : Loss function to be optimized.
        device : "cuda" or "cpu".
        num_batches : (optional) Integer to limit validation to a set number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
    """
    model = model.to(device)
    start_val_step = time.time()
    last_idx = len(val_loader) - 1
    batch_time_m = utils.AverageMeter()
    losses_m = utils.AverageMeter()
    top1_m = utils.AverageMeter()
    top5_m = utils.AverageMeter()
    cnt = 0
    model.eval()
    batch_start = time.time()
    metrics = OrderedDict()
    with torch.no_grad():
        for batch_idx, (inputs, target) in enumerate(val_loader):
            last_batch = batch_idx == last_idx
            inputs = inputs.to(device)
            target = target.to(device)
            output = model(inputs)
            # Some models return auxiliary outputs; score only the primary one.
            if isinstance(output, (tuple, list)):
                output = output[0]
            loss = criterion(output, target)
            cnt += 1
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses_m.update(loss.item(), inputs.size(0))
            top1_m.update(acc1.item(), output.size(0))
            top5_m.update(acc5.item(), output.size(0))
            batch_time_m.update(time.time() - batch_start)
            batch_start = time.time()
            if last_batch or batch_idx % log_interval == 0:  # reached the log interval
                print(
                    "Batch Inference Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) "
                    "Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) "
                    "Top 1 Accuracy: {top1.val:>7.4f} ({top1.avg:>7.4f}) "
                    "Top 5 Accuracy: {top5.val:>7.4f} ({top5.avg:>7.4f})".format(
                        batch_time=batch_time_m, loss=losses_m, top1=top1_m, top5=top5_m))
            if num_batches is not None and cnt >= num_batches:
                end_val_step = time.time()
                metrics["loss"] = losses_m.avg
                metrics["top1"] = top1_m.avg
                metrics["top5"] = top5_m.avg
                print(f"Done after {num_batches} validation batches")
                print(f"Time taken for validation step = {end_val_step - start_val_step} sec")
                return metrics

    metrics["loss"] = losses_m.avg
    metrics["top1"] = top1_m.avg
    metrics["top5"] = top5_m.avg
    print("Finished the validation epoch")
    end_val_step = time.time()
    print(f"Time taken for validation step = {end_val_step - start_val_step} sec")
    return metrics
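
# A minimal epoch-loop sketch (illustrative, not part of the library): runs the
# classification train/val steps above for a few epochs and tracks the best
# top-1 accuracy. All names and hyperparameters here are assumptions.
def fit(model, train_loader, val_loader, epochs=5, device="cuda"):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    best_top1 = 0.0
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        train_step(model, train_loader, criterion, device, optimizer)
        val_metrics = val_step(model, val_loader, criterion, device)
        best_top1 = max(best_top1, val_metrics["top1"])
    return best_top1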

def train_step(
    model: nn.Module,
    train_loader,
    criterion,
    device: str,
    optimizer,
    scheduler=None,
    num_batches: int = None,
    log_interval: int = 100,
    scaler=None,
):
    """
    Performs one step of training. Runs a forward pass, computes the loss and
    gradients, and returns metrics.

    Args:
        model : PyTorch DETR model.
        train_loader : Train loader.
        criterion : DETR set criterion (loss function) to be optimized.
        device : "cuda" or "cpu".
        optimizer : Torch optimizer to train with.
        scheduler : (optional) Learning rate scheduler.
        num_batches : (optional) Integer to limit training to a set number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
        scaler : (optional) Pass torch.cuda.amp.GradScaler() for fp16 (mixed precision) training.
    """
    model = model.to(device)
    criterion = criterion.to(device)
    start_train_step = time.time()
    model.train()
    criterion.train()
    last_idx = len(train_loader) - 1
    batch_time_m = utils.AverageMeter()
    cnt = 0
    batch_start = time.time()
    metrics = OrderedDict()
    total_loss = utils.AverageMeter()
    bbox_loss = utils.AverageMeter()
    giou_loss = utils.AverageMeter()
    labels_loss = utils.AverageMeter()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        last_batch = batch_idx == last_idx
        images = list(image.to(device) for image in inputs)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        optimizer.zero_grad()
        if scaler is not None:
            with amp.autocast():
                outputs = model(images)
                loss_dict = criterion(outputs, targets)
                weight_dict = criterion.weight_dict
                # Weighted sum of the matched losses, as in the DETR training recipe.
                loss = sum(loss_dict[k] * weight_dict[k]
                           for k in loss_dict.keys() if k in weight_dict)
            scaler.scale(loss).backward()
            # Step using scaler.step()
            scaler.step(optimizer)
            # Update the scale for the next iteration.
            scaler.update()
        else:
            outputs = model(images)
            loss_dict = criterion(outputs, targets)
            weight_dict = criterion.weight_dict
            loss = sum(loss_dict[k] * weight_dict[k]
                       for k in loss_dict.keys() if k in weight_dict)
            loss.backward()
            optimizer.step()
        if scheduler is not None:
            scheduler.step()
        cnt += 1
        total_loss.update(loss.item())
        bbox_loss.update(loss_dict["loss_bbox"].item())
        giou_loss.update(loss_dict["loss_giou"].item())
        labels_loss.update(loss_dict["loss_ce"].item())
        batch_time_m.update(time.time() - batch_start)
        batch_start = time.time()
        if last_batch or batch_idx % log_interval == 0:  # reached the log interval
            print("Batch Train Time: {batch_time.val:.3f} ({batch_time.avg:.3f})".format(
                batch_time=batch_time_m))
        if num_batches is not None and cnt >= num_batches:
            end_train_step = time.time()
            metrics["total_loss"] = total_loss.avg
            metrics["bbox_loss"] = bbox_loss.avg
            metrics["giou_loss"] = giou_loss.avg
            metrics["labels_loss"] = labels_loss.avg
            print(f"Done after {num_batches} train batches")
            print(f"Time taken for training step = {end_train_step - start_train_step} sec")
            return metrics

    end_train_step = time.time()
    metrics["total_loss"] = total_loss.avg
    metrics["bbox_loss"] = bbox_loss.avg
    metrics["giou_loss"] = giou_loss.avg
    metrics["labels_loss"] = labels_loss.avg
    print(f"Time taken for training step = {end_train_step - start_train_step} sec")
    return metrics

def val_step(model: nn.Module, val_loader, criterion, device,
             num_batches: int = None, log_interval: int = 100):
    """
    Performs one step of validation. Runs a forward pass, computes the loss and
    returns metrics.

    Args:
        model : PyTorch DETR model.
        val_loader : Validation loader.
        criterion : DETR set criterion (loss function) to be optimized.
        device : "cuda" or "cpu".
        num_batches : (optional) Integer to limit validation to a set number of batches.
        log_interval : (optional) Default 100. Log progress every `log_interval` batches.
    """
    model = model.to(device)
    start_val_step = time.time()
    last_idx = len(val_loader) - 1
    batch_time_m = utils.AverageMeter()
    cnt = 0
    model.eval()
    criterion.eval()
    batch_start = time.time()
    metrics = OrderedDict()
    total_loss = utils.AverageMeter()
    bbox_loss = utils.AverageMeter()
    giou_loss = utils.AverageMeter()
    labels_loss = utils.AverageMeter()
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            last_batch = batch_idx == last_idx
            images = list(image.to(device) for image in inputs)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            outputs = model(images)
            loss_dict = criterion(outputs, targets)
            weight_dict = criterion.weight_dict
            loss = sum(loss_dict[k] * weight_dict[k]
                       for k in loss_dict.keys() if k in weight_dict)
            cnt += 1
            total_loss.update(loss.item())
            bbox_loss.update(loss_dict["loss_bbox"].item())
            giou_loss.update(loss_dict["loss_giou"].item())
            labels_loss.update(loss_dict["loss_ce"].item())
            batch_time_m.update(time.time() - batch_start)
            batch_start = time.time()
            if last_batch or batch_idx % log_interval == 0:  # reached the log interval
                print("Batch Validation Time: {batch_time.val:.3f} ({batch_time.avg:.3f})".format(
                    batch_time=batch_time_m))
            if num_batches is not None and cnt >= num_batches:
                end_val_step = time.time()
                metrics["total_loss"] = total_loss.avg
                metrics["bbox_loss"] = bbox_loss.avg
                metrics["giou_loss"] = giou_loss.avg
                metrics["labels_loss"] = labels_loss.avg
                print(f"Done after {num_batches} validation batches")
                print(f"Time taken for validation step = {end_val_step - start_val_step} sec")
                return metrics

    end_val_step = time.time()
    metrics["total_loss"] = total_loss.avg
    metrics["bbox_loss"] = bbox_loss.avg
    metrics["giou_loss"] = giou_loss.avg
    metrics["labels_loss"] = labels_loss.avg
    print(f"Time taken for validation step = {end_val_step - start_val_step} sec")
    return metrics
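
# A minimal usage sketch (not part of the library): pairs the DETR steps above
# with the reference model from torch.hub and a criterion built the way the
# facebookresearch/detr repo does (HungarianMatcher + SetCriterion). The
# imports assume that repo is on PYTHONPATH; the matcher costs and loss weights
# mirror the paper's defaults but are assumptions here.
from models.matcher import HungarianMatcher  # from facebookresearch/detr
from models.detr import SetCriterion         # from facebookresearch/detr


def fit_detr(train_loader, val_loader, num_classes=91, device="cuda"):
    model = torch.hub.load("facebookresearch/detr", "detr_resnet50", pretrained=True)
    matcher = HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)
    weight_dict = {"loss_ce": 1, "loss_bbox": 5, "loss_giou": 2}
    criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict,
                             eos_coef=0.1, losses=["labels", "boxes", "cardinality"])
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    train_metrics = train_step(model, train_loader, criterion, device, optimizer)
    val_metrics = val_step(model, val_loader, criterion, device, num_batches=10)
    return train_metrics, val_metrics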