def __init__(self, dboxes, size=(300, 300), val=False):
    """Assemble the image/box transforms used for SSD training and validation.

    Args:
        dboxes: default boxes object; stored as ``self.dboxes_`` and passed
            to ``Encoder``.
        size: target size handed to ``transforms.Resize``.
        val: flag stored on the instance; it is not consulted in this
            constructor.
    """
    self.size, self.val = size, val

    self.dboxes_ = dboxes
    self.encoder = Encoder(self.dboxes_)
    self.crop = SSDCropping()

    # Training image pipeline: resize, fused colour jitter, tensor conversion.
    train_steps = [
        transforms.Resize(self.size),
        FusedColorJitter(),
        ToTensor(),
    ]
    self.img_trans = transforms.Compose(train_steps)
    self.hflip = RandomHorizontalFlip()

    # All Pytorch Tensor will be normalized
    # https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683
    normalization_mean = [0.485, 0.456, 0.406]
    normalization_std = [0.229, 0.224, 0.225]
    ssd_print(key=mlperf_log.DATA_NORMALIZATION_MEAN, value=normalization_mean)
    ssd_print(key=mlperf_log.DATA_NORMALIZATION_STD, value=normalization_std)
    self.normalize = transforms.Normalize(mean=normalization_mean,
                                          std=normalization_std)

    # Validation pipeline: resize, tensor conversion, normalisation.
    val_steps = [
        transforms.Resize(self.size),
        transforms.ToTensor(),
        self.normalize,
    ]
    self.trans_val = transforms.Compose(val_steps)
def dboxes300_coco():
    """Build the default boxes for 300x300 input and log their configuration.

    Returns:
        The ``DefaultBoxes`` instance built from the settings below.
    """
    image_size = 300
    feature_sizes = [38, 19, 10, 5, 3, 1]
    step_sizes = [8, 16, 32, 64, 100, 300]
    # use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
    box_scales = [21, 45, 99, 153, 207, 261, 315]
    ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]

    # Log every setting, in the same order the values are handed to DefaultBoxes.
    logged_settings = (
        (mlperf_log.FEATURE_SIZES, feature_sizes),
        (mlperf_log.STEPS, step_sizes),
        (mlperf_log.SCALES, box_scales),
        (mlperf_log.ASPECT_RATIOS, ratios),
    )
    for log_key, log_value in logged_settings:
        ssd_print(key=log_key, value=log_value)

    default_boxes = DefaultBoxes(image_size, feature_sizes, step_sizes,
                                 box_scales, ratios)
    ssd_print(key=mlperf_log.NUM_DEFAULTS,
              value=len(default_boxes.default_boxes))
    return default_boxes
def main():
    """Parse and validate arguments, run training, and log the run outcome."""
    args = parse_args()
    validate_arguments(args)

    # Only local rank 0 prepares the checkpoint directory. makedirs with
    # exist_ok=True replaces the previous isdir()/mkdir() pair, which could
    # raise FileExistsError if another process created the directory between
    # the check and the creation.
    if args.local_rank == 0:
        os.makedirs('./models', exist_ok=True)

    torch.backends.cudnn.benchmark = True

    success = train300_mlperf_coco(args)

    # end timing here
    ssd_print(key=mlperf_log.RUN_STOP, value={"success": success})
    ssd_print(key=mlperf_log.RUN_FINAL)
def lr_warmup(optim, warmup_iter, iter_num, epoch, base_lr, args):
    """Apply the warmup learning rate while ``iter_num`` is below ``warmup_iter``.

    Does nothing once ``iter_num`` has reached ``warmup_iter``.

    Args:
        optim: optimizer whose ``param_groups`` receive the new ``'lr'``.
        warmup_iter: number of warmup iterations.
        iter_num: current iteration number.
        epoch: current epoch; only used in the log record.
        base_lr: learning rate reached at the end of warmup.
        args: namespace providing ``warmup_factor``.
    """
    if not iter_num < warmup_iter:
        return

    # mlperf warmup rule
    lr_increment = base_lr / (warmup_iter * (2**args.warmup_factor))
    warmed_lr = base_lr - (warmup_iter - iter_num) * lr_increment

    log_record = {"epoch": epoch, "iteration": iter_num, "value": warmed_lr}
    ssd_print(key=mlperf_log.OPT_LR, value=log_record)

    for group in optim.param_groups:
        group['lr'] = warmed_lr
def __init__(self):
    """Define the crop sampling options and log the cropping iteration count."""
    # Entries are either None ("do nothing") or a (min IoU, max IoU) pair;
    # (None, None) means no IoU requirements.
    min_iou_choices = tuple(
        (min_iou, None) for min_iou in (0.1, 0.3, 0.5, 0.7, 0.9))
    self.sample_options = (None,) + min_iou_choices + ((None, None),)

    # Implementation uses 1 iteration to find a possible candidate, this
    # was shown to produce the same mAP as using more iterations.
    self.num_cropping_iterations = 1
    ssd_print(key=mlperf_log.NUM_CROPPING_ITERATIONS,
              value=self.num_cropping_iterations)
def __init__(self, num_cropping_iterations=1):
    """Define the crop sampling options and record the iteration budget.

    Args:
        num_cropping_iterations: stored on the instance and logged under
            ``mllog_const.MAX_SAMPLES``.
    """
    # Entries are either None ("do nothing") or a (min IoU, max IoU) pair;
    # (None, None) means no IoU requirements.
    min_iou_choices = tuple(
        (min_iou, None) for min_iou in (0.1, 0.3, 0.5, 0.7, 0.9))
    self.sample_options = (None,) + min_iou_choices + ((None, None),)

    # Implementation uses 1 iteration to find a possible candidate, this
    # was shown to produce the same mAP as using more iterations.
    self.num_cropping_iterations = num_cropping_iterations
    ssd_print(key=mllog_const.MAX_SAMPLES,
              value=self.num_cropping_iterations,
              sync=False)
def __init__(self, label_num, backbone='resnet34', use_nhwc=False, pad_input=False):
    """Build the SSD300 backbone, extra feature layers and prediction heads.

    Args:
        label_num: number of labels; each confidence head outputs
            ``num_defaults * label_num`` channels.
        backbone: one of ``'resnet18'``, ``'resnet34'`` or ``'resnet50'``.
        use_nhwc: forwarded to the backbone constructor.
        pad_input: forwarded to the backbone constructor.

    Raises:
        ValueError: if ``backbone`` is not one of the supported names.
            Previously this case only printed a message and then failed
            later on the unassigned ``out_size``.
    """
    super(SSD300, self).__init__()

    self.label_num = label_num
    self.use_nhwc = use_nhwc
    self.pad_input = pad_input

    if backbone == 'resnet18':
        self.model = ResNet18(self.use_nhwc, self.pad_input)
        out_channels = 256
        out_size = 38
        self.out_chan = [out_channels, 512, 512, 256, 256, 128]
    elif backbone == 'resnet34':
        self.model = ResNet34(self.use_nhwc, self.pad_input)
        ssd_print(key=mlperf_log.BACKBONE, value='resnet34')
        out_channels = 256
        out_size = 38
        self.out_chan = [out_channels, 512, 512, 256, 256, 256]
        ssd_print(key=mlperf_log.LOC_CONF_OUT_CHANNELS, value=self.out_chan)
    elif backbone == 'resnet50':
        self.model = ResNet50(self.use_nhwc, self.pad_input)
        out_channels = 1024
        out_size = 38
        self.l2norm4 = L2Norm()
        self.out_chan = [out_channels, 1024, 512, 512, 256, 256]
    else:
        # Fail fast: nothing below can work without out_size / self.out_chan.
        raise ValueError('Invalid backbone chosen')

    self._build_additional_features(out_size, self.out_chan)

    # after l2norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2
    # classifier 1, 2, 3, 4, 5, 6
    self.num_defaults = [4, 6, 6, 6, 4, 4]
    ssd_print(key=mlperf_log.NUM_DEFAULTS_PER_CELL, value=self.num_defaults)

    # One localisation head (4 values per default box) and one confidence
    # head (label_num values per default box) for every feature map.
    self.loc = []
    self.conf = []
    for nd, oc in zip(self.num_defaults, self.out_chan):
        self.loc.append(nn.Conv2d(oc, nd*4, kernel_size=3, padding=1))
        self.conf.append(nn.Conv2d(oc, nd*label_num, kernel_size=3, padding=1))

    self.loc = nn.ModuleList(self.loc)
    self.conf = nn.ModuleList(self.conf)

    # initialize all weights
    self._init_weights()
def __init__(self, label_num, backbone='resnet34', model_path="./resnet34-333f7ec4.pth"):
    """Build the ResNet34-based SSD300 network and its prediction heads.

    Args:
        label_num: number of labels; each confidence head outputs
            ``num_defaults * label_num`` channels.
        backbone: must be ``'resnet34'``.
        model_path: accepted but not read anywhere in this constructor.
            NOTE(review): presumably intended for the backbone weights;
            confirm against ``ResNet34``'s signature.

    Raises:
        ValueError: if ``backbone`` is anything other than ``'resnet34'``.
    """
    super(SSD300, self).__init__()
    self.label_num = label_num

    if backbone != 'resnet34':
        raise ValueError('Invalid backbone chosen')

    self.model = ResNet34()
    ssd_print(key=mlperf_log.BACKBONE, value='resnet34')
    first_map_channels = 256
    first_map_size = 38
    self.out_chan = [first_map_channels, 512, 512, 256, 256, 256]
    ssd_print(key=mlperf_log.LOC_CONF_OUT_CHANNELS, value=self.out_chan)

    self._build_additional_features(first_map_size, self.out_chan)

    # after l2norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2
    # classifier 1, 2, 3, 4, 5, 6
    self.num_defaults = [4, 6, 6, 6, 4, 4]
    ssd_print(key=mlperf_log.NUM_DEFAULTS_PER_CELL, value=self.num_defaults)

    # Heads are created pairwise (loc, then conf) per feature map so the
    # construction order of the convolutions stays the same.
    loc_heads, conf_heads = [], []
    for boxes_per_cell, channels in zip(self.num_defaults, self.out_chan):
        loc_heads.append(
            nn.Conv2d(channels, boxes_per_cell * 4, kernel_size=3, padding=1))
        conf_heads.append(
            nn.Conv2d(channels, boxes_per_cell * label_num, kernel_size=3,
                      padding=1))

    self.loc = nn.ModuleList(loc_heads)
    self.conf = nn.ModuleList(conf_heads)

    # initialize all weights
    self._init_weights()
def train300_mlperf_coco(args):
    """Train SSD300 on COCO and evaluate at the configured epochs.

    Args:
        args: parsed command-line namespace. Fields read here include
            ``no_cuda``, ``local_rank``, ``seed``, ``data``, ``batch_size``,
            ``checkpoint``, ``lr``, ``evaluation``, ``iteration``,
            ``warmup``, ``epochs``, ``lr_decay_schedule``, ``no_save`` and
            ``threshold``. ``args.distributed`` (and, when distributed,
            ``args.seed``) are overwritten.

    Returns:
        True as soon as an evaluation reports success, False if all epochs
        complete without that happening.

    Raises:
        ImportError: if CUDA is used and ``apex`` cannot be imported.
    """
    # `import torch.utils.data.distributed` below would otherwise make
    # `torch` a local name for the whole function.
    global torch
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        # Only the import is guarded: a malformed WORLD_SIZE must surface as
        # its own error rather than as a misleading "install APEX" message.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist

        if args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly: share one base seed, then offset it per rank
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
            print(dist.get_rank(), "Using seed = {}".format(local_seed))
            torch.manual_seed(local_seed)
            np.random.seed(seed=local_seed)

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # The DataLoader shuffles only when no sampler is supplied.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=4)
    ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    ssd_print(key=mlperf_log.INPUT_ORDER)
    ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    # The learning rate is scaled linearly with the global batch size,
    # relative to a batch of 32.
    global_batch_size = N_gpu * args.batch_size
    current_lr = args.lr * (global_batch_size / 32)
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay)

    eval_points = args.evaluation
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    # One-element tensor so the outcome can be broadcast between ranks.
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    for epoch in range(args.epochs):
        ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr
            ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

        for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader):
            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)
            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Infinite losses are left out of the running average.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                        .format(iter_num, loss.item(), avg_loss), end="\r")
            optim.zero_grad()
            loss.backward()
            warmup_step(iter_num, current_lr)
            optim.step()

            iter_num += 1

        if epoch + 1 in eval_points:
            rank = dist.get_rank() if args.distributed else args.local_rank
            if args.distributed:
                # Average the batch-norm running statistics across ranks
                # before saving / evaluating.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var' in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
            if rank == 0:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                             args.threshold, epoch + 1, iter_num):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
            # Rank 0 is the only evaluator; share its verdict with all ranks.
            if args.distributed:
                dist.broadcast(success, 0)
            if success[0]:
                return True

    return False
def coco_eval(model, coco, cocoGt, encoder, inv_map, threshold, epoch,
              iteration, use_cuda=True):
    """Run the model over ``coco`` one image at a time and score it with COCOeval.

    Args:
        model: network to evaluate; switched to eval mode for the duration
            of the call and back to train mode before returning.
        coco: dataset exposing ``img_keys`` and integer indexing that yields
            ``(img, (htot, wtot), _, _)``.
        cocoGt: ground-truth COCO object providing ``loadRes``.
        encoder: object providing ``decode_batch``.
        inv_map: mapping applied to each predicted label before it is
            written to the results.
        threshold: accuracy target.
        epoch: epoch number used in log records.
        iteration: iteration number used in log records.
        use_cuda: move the model and inputs to the GPU when True.

    Returns:
        True when ``E.stats[0]`` is at least ``threshold``.
    """
    from pycocotools.cocoeval import COCOeval
    print("")
    model.eval()
    if use_cuda:
        model.cuda()
    ret = []

    overlap_threshold = 0.50
    nms_max_detections = 200
    ssd_print(key=mlperf_log.NMS_THRESHOLD,
              value=overlap_threshold, sync=False)
    ssd_print(key=mlperf_log.NMS_MAX_DETECTIONS,
              value=nms_max_detections, sync=False)

    ssd_print(key=mlperf_log.EVAL_START, value=epoch, sync=False)

    start = time.time()
    for idx, image_id in enumerate(coco.img_keys):
        img, (htot, wtot), _, _ = coco[idx]

        with torch.no_grad():
            print("Parsing image: {}/{}".format(idx + 1, len(coco)), end="\r")
            inp = img.unsqueeze(0)
            if use_cuda:
                inp = inp.cuda()
            ploc, plabel = model(inp)

            try:
                result = encoder.decode_batch(ploc, plabel,
                                              overlap_threshold,
                                              nms_max_detections)[0]
            except Exception:
                # Best-effort: a decode failure is treated as "no detections"
                # for this image. Catching Exception (not a bare except)
                # lets KeyboardInterrupt / SystemExit propagate.
                print("")
                print("No object detected in idx: {}".format(idx))
                continue

            loc, label, prob = [r.cpu().numpy() for r in result]
            # Each row: image id, left*w, top*h, width*w, height*h, score,
            # mapped label -- boxes are scaled by the image width/height.
            for loc_, label_, prob_ in zip(loc, label, prob):
                ret.append([image_id, loc_[0]*wtot, \
                                      loc_[1]*htot,
                                      (loc_[2] - loc_[0])*wtot,
                                      (loc_[3] - loc_[1])*htot,
                                      prob_,
                                      inv_map[label_]])
    print("")
    print("Predicting Ended, total time: {:.2f} s".format(time.time() - start))

    cocoDt = cocoGt.loadRes(np.array(ret))

    E = COCOeval(cocoGt, cocoDt, iouType='bbox')
    E.evaluate()
    E.accumulate()
    E.summarize()
    print("Current AP: {:.5f} AP goal: {:.5f}".format(E.stats[0], threshold))

    # put your model back into training mode
    model.train()

    # Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]
    current_accuracy = E.stats[0]
    ssd_print(key=mlperf_log.EVAL_SIZE, value=idx + 1, sync=False)
    ssd_print(key=mlperf_log.EVAL_ACCURACY,
              value={"epoch": epoch, "value": current_accuracy},
              sync=False)
    ssd_print(key=mlperf_log.EVAL_ITERATION_ACCURACY,
              value={"iteration": iteration, "value": current_accuracy},
              sync=False)
    ssd_print(key=mlperf_log.EVAL_TARGET, value=threshold, sync=False)
    ssd_print(key=mlperf_log.EVAL_STOP, value=epoch, sync=False)
    return current_accuracy >= threshold
def train300_mlperf_coco(args):
    """Train SSD300 on COCO with optional gradient accumulation and validation.

    Args:
        args: parsed command-line namespace. Fields read here include
            ``no_cuda``, ``local_rank``, ``seed``, ``data``, ``batch_size``,
            ``val_batch_size``, ``batch_splits``, ``num_cropping_iterations``,
            ``pretrained_backbone``, ``checkpoint``, ``lr``, ``weight_decay``,
            ``iteration``, ``warmup``, ``warmup_factor``, ``epochs``,
            ``lr_decay_schedule``, ``log_interval``, ``val_epochs``,
            ``val_interval``, ``no_save``, ``threshold`` and
            ``nms_valid_thresh``. ``args.distributed`` and ``args.rank`` are
            set here; ``args.seed`` is overwritten when distributed.

    Returns:
        True as soon as a validation run reports success, False if all
        epochs complete without that happening.

    Raises:
        ImportError: if CUDA is used and ``apex`` cannot be imported.
        AssertionError: if ``batch_size`` is not divisible by
            ``batch_splits``.
    """
    # `import torch.utils.data.distributed` below would otherwise make
    # `torch` a local name for the whole function.
    global torch
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        # Only the import is guarded: a malformed WORLD_SIZE must surface as
        # its own error rather than as a misleading "install APEX" message.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    local_seed = args.seed
    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        if args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly: share one base seed, then offset it per rank
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
    mllogger.event(key=mllog_const.SEED, value=local_seed)
    torch.manual_seed(local_seed)
    np.random.seed(seed=local_seed)

    args.rank = dist.get_rank() if args.distributed else args.local_rank
    print("args.rank = {}".format(args.rank))
    print("local rank = {}".format(args.local_rank))
    print("distributed={}".format(args.distributed))

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(
        dboxes, (input_size, input_size), val=False,
        num_cropping_iterations=args.num_cropping_iterations)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco))
    mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # The DataLoader shuffles only when no sampler is supplied.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=4)
    # Validation runs on rank 0 only, so only that rank builds a loader.
    if args.rank == 0:
        val_dataloader = DataLoader(
            val_coco,
            batch_size=args.val_batch_size or args.batch_size,
            shuffle=False,
            sampler=None,
            num_workers=4)
    else:
        val_dataloader = None

    ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    global_batch_size = N_gpu * args.batch_size
    mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size)
    # Reference doesn't support group batch norm, so bn_span==local_batch_size
    mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size)
    # The learning rate is scaled linearly with the global batch size,
    # relative to a batch of 32.
    current_lr = args.lr * (global_batch_size / 32)

    assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits"
    fragment_size = args.batch_size // args.batch_splits
    if args.batch_splits != 1:
        print("using gradient accumulation with fragments of size {}".format(
            fragment_size))

    current_momentum = 0.9
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=args.weight_decay)
    ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr)
    ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay)

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    # One-element tensor so the outcome can be broadcast between ranks.
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb)
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)
    ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=args.lr_decay_schedule)
    mllogger.start(
        key=mllog_const.BLOCK_START,
        metadata={mllog_const.FIRST_EPOCH_NUM: 1,
                  mllog_const.EPOCH_COUNT: args.epochs})

    optim.zero_grad()
    for epoch in range(args.epochs):
        mllogger.start(
            key=mllog_const.EPOCH_START,
            metadata={mllog_const.EPOCH_NUM: epoch})
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr

        for nbatch, (img, img_id, img_size, bbox, label) in enumerate(train_dataloader):
            current_batch_size = img.shape[0]
            # Split batch for gradient accumulation
            img = torch.split(img, fragment_size)
            bbox = torch.split(bbox, fragment_size)
            label = torch.split(label, fragment_size)

            for (fimg, fbbox, flabel) in zip(img, bbox, label):
                current_fragment_size = fimg.shape[0]
                trans_bbox = fbbox.transpose(1, 2).contiguous()
                if use_cuda:
                    fimg = fimg.cuda()
                    trans_bbox = trans_bbox.cuda()
                    flabel = flabel.cuda()
                fimg = Variable(fimg, requires_grad=True)
                ploc, plabel = ssd300(fimg)
                gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                               Variable(flabel, requires_grad=False)
                loss = loss_func(ploc, plabel, gloc, glabel)
                # Weight each fragment by its share of the batch so the
                # accumulated gradient is the weighted mean.
                loss = loss * (current_fragment_size / current_batch_size)
                loss.backward()

            warmup_step(iter_num, current_lr)
            optim.step()
            optim.zero_grad()
            # `loss` here is the (weighted) loss of the last fragment only.
            if not np.isinf(loss.item()):
                avg_loss = 0.999*avg_loss + 0.001*loss.item()
            if args.rank == 0 and args.log_interval and not iter_num % args.log_interval:
                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                    .format(iter_num, loss.item(), avg_loss))
            iter_num += 1

        if (args.val_epochs and (epoch+1) in args.val_epochs) or \
           (args.val_interval and not (epoch+1) % args.val_interval):
            if args.distributed:
                # Average the batch-norm running statistics across ranks
                # before saving / evaluating.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(recurse=True):
                    if ('running_mean' in bn_name) or ('running_var' in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
                        ssd_print(key=mllog_const.MODEL_BN_SPAN,
                                  value=bn_buf)
            if args.rank == 0:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save({"model": ssd300.state_dict(),
                                "label_map": train_coco.label_info},
                               "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_dataloader, cocoGt, encoder, inv_map,
                             args.threshold, epoch + 1, iter_num,
                             log_interval=args.log_interval,
                             nms_valid_thresh=args.nms_valid_thresh):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
            # Rank 0 is the only evaluator; share its verdict with all ranks.
            if args.distributed:
                dist.broadcast(success, 0)
            if success[0]:
                return True

        # Emitted once per epoch so that every EPOCH_START has a matching
        # EPOCH_STOP, whether or not validation ran this epoch.
        mllogger.end(
            key=mllog_const.EPOCH_STOP,
            metadata={mllog_const.EPOCH_NUM: epoch})
    mllogger.end(
        key=mllog_const.BLOCK_STOP,
        metadata={mllog_const.FIRST_EPOCH_NUM: 1,
                  mllog_const.EPOCH_COUNT: args.epochs})

    return False
def train300_mlperf_coco(args):
    """Train SSD300 on COCO using a DALI input pipeline, with optional fp16/DDP/JIT.

    Args:
        args: parsed command-line namespace. Fields read here include
            ``no_cuda``, ``local_rank``, ``data``, ``batch_size``,
            ``eval_batch_size``, ``num_workers``, ``use_fp16``, ``nhwc``,
            ``pad_input``, ``backbone``, ``checkpoint``, ``delay_allreduce``,
            ``lr``, ``use_larc``, ``warmup``, ``jit``, ``evaluation``,
            ``iteration``, ``profile``, ``decay1``, ``decay2``,
            ``print_interval``, ``epochs``, ``no_save`` and ``threshold``.
            ``args.distributed`` is set here.

    Returns:
        True when an evaluation reports success, False when all epochs
        finish without that happening, and None when the ``args.profile``
        iteration is reached (bare ``return``).

    NOTE(review): the block structure below was reconstructed from source
    whose indentation was lost; the nesting of a few statements (flagged
    inline) should be confirmed against the original file.
    """
    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    local_seed = set_seeds(args)

    # start timing here
    ssd_print(key=mlperf_log.RUN_START)

    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)

    if args.distributed:
        val_sampler = GeneralDistributedSampler(val_coco, pad=False)
    else:
        val_sampler = None

    # Training data comes from a DALI pipeline rather than a torch DataLoader.
    train_pipe = COCOPipeline(args.batch_size, args.local_rank, train_coco_root,
                              train_annotate, N_gpu,
                              num_threads=args.num_workers,
                              output_fp16=args.use_fp16,
                              output_nhwc=args.nhwc,
                              pad_output=args.pad_input,
                              seed=local_seed - 2**31)
    print_message(args.local_rank,
                  "time_check a: {secs:.9f}".format(secs=time.time()))
    train_pipe.build()
    print_message(args.local_rank,
                  "time_check b: {secs:.9f}".format(secs=time.time()))
    # The result is never read; the pipeline is run once before the iterator
    # is created.
    test_run = train_pipe.run()
    # NOTE(review): 118287 is hard-coded; presumably the number of training
    # images -- confirm.
    train_loader = DALICOCOIterator(train_pipe, 118287 / N_gpu)

    val_dataloader = DataLoader(
        val_coco,
        batch_size=args.eval_batch_size,
        shuffle=False,  # Note: distributed sampler is shuffled :(
        sampler=val_sampler,
        num_workers=args.num_workers)

    ssd_print(key=mlperf_log.INPUT_ORDER)
    ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    # Build the model
    ssd300 = SSD300(val_coco.labelnum,
                    backbone=args.backbone,
                    use_nhwc=args.nhwc,
                    pad_input=args.pad_input)
    if args.checkpoint is not None:
        load_checkpoint(ssd300, args.checkpoint)

    ssd300.train()
    ssd300.cuda()
    loss_func = Loss(dboxes)
    loss_func.cuda()

    # Same computation as the N_gpu assignment earlier in this function.
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    if args.use_fp16:
        ssd300 = network_to_half(ssd300)

    # Parallelize.  Need to do this after network_to_half.
    if args.distributed:
        if args.delay_allreduce:
            print_message(args.local_rank,
                          "Delaying allreduces to the end of backward()")
        ssd300 = DDP(ssd300,
                     delay_allreduce=args.delay_allreduce,
                     retain_allreduce_buffers=args.use_fp16)

    # Create optimizer.  This must also be done after network_to_half.
    global_batch_size = (N_gpu * args.batch_size)

    # mlperf only allows base_lr scaled by an integer
    base_lr = 1e-3
    requested_lr_multiplier = args.lr / base_lr
    adjusted_multiplier = max(
        1, round(requested_lr_multiplier * global_batch_size / 32))

    current_lr = base_lr * adjusted_multiplier
    current_momentum = 0.9
    current_weight_decay = 5e-4
    static_loss_scale = 128.
    if args.use_fp16:
        if args.distributed and not args.delay_allreduce:
            # We can't create the flat master params yet, because we need to
            # imitate the flattened bucket structure that DDP produces.
            optimizer_created = False
        else:
            # Two buckets: the half-precision and the single-precision
            # trainable parameters.
            model_buckets = [
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.HalfTensor"
                ],
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.FloatTensor"
                ]
            ]
            flat_master_buckets = create_flat_master(model_buckets)
            optim = torch.optim.SGD(flat_master_buckets,
                                    lr=current_lr,
                                    momentum=current_momentum,
                                    weight_decay=current_weight_decay)
            optimizer_created = True
    else:
        optim = torch.optim.SGD(ssd300.parameters(),
                                lr=current_lr,
                                momentum=current_momentum,
                                weight_decay=current_weight_decay)
        optimizer_created = True

    # Add LARC if desired
    # NOTE(review): nesting reconstructed. At this level `optim` does not
    # exist yet when optimizer_created is False -- confirm against the
    # original indentation.
    if args.use_larc:
        optim = LARC(optim)

    ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay)
    if args.warmup is not None:
        ssd_print(key=mlperf_log.OPT_LR_WARMUP_STEPS, value=args.warmup)

    # Model is completely finished -- need to create separate copies, preserve parameters across
    # them, and jit
    ssd300_eval = SSD300(val_coco.labelnum,
                         backbone=args.backbone,
                         use_nhwc=args.nhwc,
                         pad_input=args.pad_input).cuda()
    if args.use_fp16:
        ssd300_eval = network_to_half(ssd300_eval)

    # Get the existant state from the train model
    # * if we use distributed, then we want .module
    train_model = ssd300.module if args.distributed else ssd300

    ssd300_eval.load_state_dict(train_model.state_dict())

    ssd300_eval.eval()

    if args.jit:
        # Example input layout follows the nhwc / pad_input flags.
        input_c = 4 if args.pad_input else 3
        example_shape = [
            args.batch_size, 300, 300, input_c
        ] if args.nhwc else [args.batch_size, input_c, 300, 300]
        example_input = torch.randn(*example_shape).cuda()
        if args.use_fp16:
            example_input = example_input.half()
        # DDP has some Python-side control flow.  If we JIT the entire DDP-wrapped module,
        # the resulting ScriptModule will elide this control flow, resulting in allreduce
        # hooks not being called.  If we're running distributed, we need to extract and JIT
        # the wrapped .module.
        # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks
        # to go out of scope, and therefore silently disappear.
        module_to_jit = ssd300.module if args.distributed else ssd300
        if args.distributed:
            ssd300.module = torch.jit.trace(module_to_jit, example_input)
        else:
            ssd300 = torch.jit.trace(module_to_jit, example_input)

    print_message(args.local_rank, "epoch", "nbatch", "loss")
    # Evaluation points are rescaled from a batch size of 32 to the actual
    # global batch size and truncated to integers.
    eval_points = np.array(args.evaluation) * 32 / global_batch_size
    eval_points = list(map(int, list(eval_points)))

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    start_elapsed_time = time.time()
    last_printed_iter = args.iteration
    num_elapsed_samples = 0

    # Generate normalization tensors
    mean, std = generate_mean_std(args)

    def step_maybe_fp16_maybe_distributed(optim):
        """Run one optimizer step, handling the fp16 master-parameter copies.

        Reads ``flat_master_buckets``, ``model_buckets`` and ``ssd300`` from
        the enclosing function at call time.
        """
        if args.use_fp16:
            if args.distributed:
                # Take gradients from DDP's retained allreduce buffers.
                for flat_master, allreduce_buffer in zip(
                        flat_master_buckets, ssd300.allreduce_buffers):
                    if allreduce_buffer is None:
                        raise RuntimeError("allreduce_buffer is None")
                    flat_master.grad = allreduce_buffer.float()
                    # Undo the static loss scaling.
                    flat_master.grad.data.mul_(1. / static_loss_scale)
            else:
                # Flatten the per-parameter gradients of each bucket.
                for flat_master, model_bucket in zip(flat_master_buckets,
                                                     model_buckets):
                    flat_grad = apex_C.flatten(
                        [m.grad.data for m in model_bucket])
                    flat_master.grad = flat_grad.float()
                    # Undo the static loss scaling.
                    flat_master.grad.data.mul_(1. / static_loss_scale)
        optim.step()
        if args.use_fp16:
            # Copy the updated master values back into the model parameters.
            for model_bucket, flat_master in zip(model_buckets,
                                                 flat_master_buckets):
                for model, master in zip(
                        model_bucket,
                        apex_C.unflatten(flat_master.data, model_bucket)):
                    model.data.copy_(master.data)

    ssd_print(key=mlperf_log.TRAIN_LOOP)

    for epoch in range(args.epochs):
        ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)

        for p in ssd300.parameters():
            p.grad = None

        for i, data in enumerate(train_loader):
            # Unpack the iterator output: image, boxes, labels, box offsets.
            img = data[0][0][0]
            bbox = data[0][1][0]
            label = data[0][2][0]
            label = label.type(torch.cuda.LongTensor)
            bbox_offsets = data[0][3][0]

            # handle random flipping outside of DALI for now
            bbox_offsets = bbox_offsets.cuda()
            img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5,
                                            args.nhwc)
            # In-place normalisation with the tensors generated above.
            img.sub_(mean).div_(std)

            # Stop at the profiling iteration; note this returns None.
            if args.profile is not None and iter_num == args.profile:
                return

            if args.warmup is not None and optimizer_created:
                lr_warmup(optim, args.warmup, iter_num, epoch, current_lr, args)
            # decay1 / decay2 are rescaled from a batch size of 32 to the
            # actual global batch size.
            if iter_num == ((args.decay1 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #1")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if iter_num == ((args.decay2 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #2")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if use_cuda:
                img = img.cuda()  # NHWC direct from DALI now if necessary
                bbox = bbox.cuda()
                label = label.cuda()
                bbox_offsets = bbox_offsets.cuda()

            # Now run the batched box encoder
            N = img.shape[0]
            if bbox_offsets[-1].item() == 0:
                # This `continue` also skips the `iter_num += 1` at the end
                # of the loop body.
                print("No labels in batch")
                continue
            bbox, label = C.box_encoder(N, bbox, bbox_offsets, label,
                                        encoder.dboxes.cuda(), 0.5)
            # output is ([N*8732, 4], [N*8732], need [N, 8732, 4], [N, 8732] respectively
            M = bbox.shape[0] // N
            bbox = bbox.view(N, M, 4)
            label = label.view(N, M)

            # print(img.shape, bbox.shape, label.shape)

            ploc, plabel = ssd300(img)
            ploc, plabel = ploc.float(), plabel.float()

            trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
            label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)

            loss = loss_func(ploc, plabel, gloc, glabel)

            # Infinite losses are left out of the running average.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            num_elapsed_samples += N
            if args.local_rank == 0 and iter_num % args.print_interval == 0:
                end_elapsed_time = time.time()
                elapsed_time = end_elapsed_time - start_elapsed_time

                # Local sample count scaled by the number of GPUs.
                avg_samples_per_sec = num_elapsed_samples * N_gpu / elapsed_time

                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"\
                            .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec), end="\n")

                # last_printed_iter is written here but never read in this
                # function.
                last_printed_iter = iter_num
                start_elapsed_time = time.time()
                num_elapsed_samples = 0

            # loss scaling
            if args.use_fp16:
                loss = loss * static_loss_scale
            loss.backward()

            if not optimizer_created:
                # Imitate the model bucket structure created by DDP.
                # These will already be split by type (float or half).
                model_buckets = []
                for bucket in ssd300.active_i_buckets:
                    model_buckets.append([])
                    for active_i in bucket:
                        model_buckets[-1].append(
                            ssd300.active_params[active_i])
                flat_master_buckets = create_flat_master(model_buckets)
                optim = torch.optim.SGD(flat_master_buckets,
                                        lr=current_lr,
                                        momentum=current_momentum,
                                        weight_decay=current_weight_decay)
                optimizer_created = True
                # Skip this first iteration because flattened allreduce buffers are not yet created.
                # step_maybe_fp16_maybe_distributed(optim)
            else:
                step_maybe_fp16_maybe_distributed(optim)

            # Likely a decent skew here, let's take this opportunity to set the gradients to None.
            # After DALI integration, playing with the placement of this is worth trying.
            for p in ssd300.parameters():
                p.grad = None

            if iter_num in eval_points:
                if args.local_rank == 0:
                    if not args.no_save:
                        print("saving model...")
                        torch.save(
                            {
                                "model": ssd300.state_dict(),
                                "label_map": val_coco.label_info
                            }, "./models/iter_{}.pt".format(iter_num))

                # Get the existant state from the train model
                # * if we use distributed, then we want .module
                train_model = ssd300.module if args.distributed else ssd300
                ssd300_eval.load_state_dict(train_model.state_dict())

                # local_rank=-1 is passed when not running distributed.
                if coco_eval(
                        ssd300_eval,
                        val_dataloader,
                        cocoGt,
                        encoder,
                        inv_map,
                        args.threshold,
                        epoch,
                        iter_num,
                        args.eval_batch_size,
                        use_fp16=args.use_fp16,
                        local_rank=args.local_rank if args.distributed else -1,
                        N_gpu=N_gpu,
                        use_nhwc=args.nhwc,
                        pad_input=args.pad_input):
                    return True

            iter_num += 1

        # Reset the iterator at the end of every epoch.
        train_loader.reset()
    return False
def coco_eval(model, coco, cocoGt, encoder, inv_map, threshold, epoch,
              iteration, batch_size, use_cuda=True, use_fp16=False,
              local_rank=-1, N_gpu=1, use_nhwc=False, pad_input=False):
    """Run COCO bbox evaluation and report whether the AP target was reached.

    Every batch yielded by ``coco`` is moved to the GPU, optionally padded
    with one zero channel, permuted to NHWC and cast to half precision, then
    pushed through ``model``. Predictions are decoded one image at a time
    with ``encoder.decode_batch`` and collected as 7-column rows
    ``[image_id, x, y, w, h, score, inv_map[label]]`` with the box scaled by
    the image width/height taken from ``img_size``. When ``local_rank >= 0``
    the rows of all ``N_gpu`` workers are exchanged with
    ``torch.distributed.all_gather`` so each worker scores the full set.

    Args:
        model: callable returning ``(ploc, plabel)`` for an input batch.
        coco: sized iterable yielding ``(img, img_id, img_size, _, _)``.
        cocoGt: ground-truth object providing ``loadRes``; also handed to
            ``COCOeval``.
        encoder: object providing ``decode_batch``.
        inv_map: mapping from a predicted label to the value stored in the
            last result column (presumably the COCO category id - confirm
            against the caller).
        threshold: target value compared with ``E.stats[0]``.
        epoch: value logged with the evaluation events.
        iteration: accepted for interface compatibility; not used here.
        batch_size: evaluation batch size, used only to compute the logged
            evaluation size.
        use_cuda: accepted for interface compatibility; not used here
            (inputs are always moved with ``.cuda()``).
        use_fp16: cast the input to half precision before the forward pass.
        local_rank: ``-1`` for single-process evaluation, otherwise the rank
            of this worker; rank ``0`` and ``-1`` print the COCO summary.
        N_gpu: number of workers taking part in the gather.
        use_nhwc: permute the input from NCHW to NHWC.
        pad_input: append one all-zero channel to the input.

    Returns:
        bool: ``True`` when ``E.stats[0] >= threshold``.
    """
    from pycocotools.cocoeval import COCOeval
    print("")

    # A non-negative local_rank marks this process as one worker of a
    # distributed job.
    distributed = local_rank >= 0

    ret = []

    overlap_threshold = 0.50
    nms_max_detections = 200
    ssd_print(key=mlperf_log.NMS_THRESHOLD, value=overlap_threshold)
    ssd_print(key=mlperf_log.NMS_MAX_DETECTIONS, value=nms_max_detections)

    ssd_print(key=mlperf_log.EVAL_START, value=epoch)

    start = time.time()
    for nbatch, (img, img_id, img_size, _, _) in enumerate(coco):
        print("Parsing batch: {}/{}".format(nbatch, len(coco)), end='\r')
        with torch.no_grad():
            inp = img.cuda()
            if pad_input:
                # Append a single zero-filled channel along dim 1.
                s = inp.shape
                inp = torch.cat(
                    [inp,
                     torch.zeros([s[0], 1, s[2], s[3]], device=inp.device)],
                    dim=1)
            if use_nhwc:
                inp = inp.permute(0, 2, 3, 1).contiguous()
            if use_fp16:
                inp = inp.half()

            # Get predictions
            ploc, plabel = model(inp)
            ploc, plabel = ploc.float(), plabel.float()

            # Handle the batch of predictions produced.
            # This is slow, but consistent with old implementation.
            for idx in range(ploc.shape[0]):
                # ease-of-use for specific predictions
                ploc_i = ploc[idx, :, :].unsqueeze(0)
                plabel_i = plabel[idx, :, :].unsqueeze(0)

                try:
                    result = encoder.decode_batch(
                        ploc_i, plabel_i, overlap_threshold,
                        nms_max_detections)[0]
                except Exception:
                    # Best effort: a failed decode is treated as "nothing
                    # detected". Only Exception is caught so Ctrl-C and
                    # SystemExit still stop the run.
                    print("No object detected in idx: {}".format(idx))
                    continue

                htot, wtot = img_size[0][idx].item(), img_size[1][idx].item()
                loc, label, prob = [r.cpu().numpy() for r in result]
                for loc_, label_, prob_ in zip(loc, label, prob):
                    # Corner box (l, t, r, b) -> (x, y, w, h) in pixels.
                    ret.append([img_id[idx],
                                loc_[0] * wtot,
                                loc_[1] * htot,
                                (loc_[2] - loc_[0]) * wtot,
                                (loc_[3] - loc_[1]) * htot,
                                prob_,
                                inv_map[label_]])

    # Now we have all predictions from this rank, gather them all together
    # if necessary
    ret = np.array(ret).astype(np.float32)

    # Multi-GPU eval
    if distributed:
        # NCCL backend means we can only operate on GPU tensors
        ret_copy = torch.tensor(ret).cuda()

        # Everyone exchanges the size of their results
        ret_sizes = [torch.tensor(0).cuda() for _ in range(N_gpu)]
        torch.distributed.all_gather(
            ret_sizes, torch.tensor(ret_copy.shape[0]).cuda())

        # Get the maximum results size, as all tensors must be the same
        # shape for the all_gather call we need to make
        sizes = [s.item() for s in ret_sizes]
        max_size = max(sizes, default=0)

        # Need to pad my output to max_size in order to use in all_gather
        ret_pad = torch.cat([
            ret_copy,
            torch.zeros(max_size - ret_copy.shape[0], 7,
                        dtype=torch.float32).cuda()
        ])

        # allocate storage for results from all other processes
        other_ret = [
            torch.zeros(max_size, 7, dtype=torch.float32).cuda()
            for _ in range(N_gpu)
        ]
        # Everyone exchanges (padded) results
        torch.distributed.all_gather(other_ret, ret_pad)

        # Now need to reconstruct the _actual_ results from the padded set
        # using slices.
        cat_tensors = [
            gathered[:count] for gathered, count in zip(other_ret, sizes)
        ]
        final_results = torch.cat(cat_tensors).cpu().numpy()

        torch.cuda.set_device(local_rank)
        device = torch.device('cuda')
        # eval size per worker, summed over workers by the all_reduce.
        # NOTE(review): relies on `ploc` from the last batch, so an empty
        # `coco` loader raises NameError here - confirm that cannot happen.
        eval_tensor = torch.LongTensor(
            [(len(coco) - 1) * batch_size + ploc.shape[0]]).to(device)
        torch.distributed.all_reduce(eval_tensor)
        eval_size = eval_tensor.item()
    else:
        # Otherwise full results are just our results
        final_results = ret
        eval_size = (len(coco) - 1) * batch_size + ploc.shape[0]

    print_message(
        local_rank,
        "Predicting Ended, total time: {:.2f} s".format(time.time() - start))

    cocoDt = cocoGt.loadRes(final_results)

    if local_rank == 0 or local_rank == -1:
        E = COCOeval(cocoGt, cocoDt, iouType='bbox')
        E.evaluate()
        E.accumulate()
        E.summarize()
        print("Current AP: {:.5f} AP goal: {:.5f}".format(
            E.stats[0], threshold))
    else:
        # fix for cocoeval indiscriminate prints
        with redirect_stdout(io.StringIO()):
            E = COCOeval(cocoGt, cocoDt, iouType='bbox')
            E.evaluate()
            E.accumulate()
            E.summarize()

    current_accuracy = E.stats[0]

    ssd_print(key=mlperf_log.EVAL_SIZE, value=eval_size)
    ssd_print(key=mlperf_log.EVAL_ACCURACY,
              value={"epoch": epoch, "value": current_accuracy})
    ssd_print(key=mlperf_log.EVAL_ITERATION_ACCURACY,
              value={"epoch": epoch, "value": current_accuracy})
    ssd_print(key=mlperf_log.EVAL_TARGET, value=threshold)
    ssd_print(key=mlperf_log.EVAL_STOP, value=epoch)

    # Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]
    return current_accuracy >= threshold
def coco_eval(model, val_dataloader, cocoGt, encoder, inv_map, threshold,
              epoch, iteration, log_interval=100, use_cuda=False,
              nms_valid_thresh=0.05, use_autocast=False):
    """Evaluate ``model`` on the validation loader with COCO bbox metrics.

    The model is switched to eval mode for the duration of the call and put
    back into training mode before returning. Each batch is converted to
    channels-last memory format, run through the model (under
    ``torch.cpu.amp.autocast`` when ``use_autocast`` is set, with outputs
    cast back to float32) and decoded as a whole by
    ``encoder.decode_batch``. Detections are collected as rows
    ``[image_id, x, y, w, h, score, inv_map[label]]`` scaled by the image
    width/height from ``img_size`` and scored with ``COCOeval``.

    Args:
        model: network returning ``(ploc, plabel)``; must provide
            ``eval()``, ``train()`` and (when ``use_cuda``) ``cuda()``.
        val_dataloader: iterable yielding
            ``(img, img_id, img_size, bbox, label)``; only the first three
            entries are used.
        cocoGt: ground-truth object providing ``loadRes``; also handed to
            ``COCOeval``.
        encoder: object providing ``decode_batch``.
        inv_map: mapping from a predicted label to the value stored in the
            last result column (presumably the COCO category id - confirm
            against the caller).
        threshold: target value compared with ``E.stats[0]``.
        epoch: epoch number attached to the logged events.
        iteration: accepted for interface compatibility; not used here.
        log_interval: print a progress line every this many batches;
            a falsy value disables the progress output.
        use_cuda: move the model and each batch to the GPU.
        nms_valid_thresh: forwarded to ``encoder.decode_batch``.
        use_autocast: run the forward pass under CPU autocast.

    Returns:
        bool: ``True`` when ``E.stats[0] >= threshold``.
    """
    from pycocotools.cocoeval import COCOeval
    print("")

    model.eval()
    if use_cuda:
        model.cuda()
    ret = []

    overlap_threshold = 0.50
    nms_max_detections = 200
    print("nms_valid_thresh is set to {}".format(nms_valid_thresh))

    mllogger.start(key=mllog_const.EVAL_START,
                   metadata={mllog_const.EPOCH_NUM: epoch})

    start = time.time()
    for nbatch, (img, img_id, img_size, _, _) in enumerate(val_dataloader):
        with torch.no_grad():
            if use_cuda:
                img = img.cuda()
            # img to nhwc
            img = img.contiguous(memory_format=torch.channels_last)
            if use_autocast:
                with torch.cpu.amp.autocast(enabled=use_autocast):
                    ploc, plabel = model(img)
                # Downstream decoding works on float32 tensors.
                ploc = ploc.to(torch.float32)
                plabel = plabel.to(torch.float32)
            else:
                ploc, plabel = model(img)

            try:
                results = encoder.decode_batch(
                    ploc, plabel, overlap_threshold, nms_max_detections,
                    nms_valid_thresh=nms_valid_thresh)
            except Exception:
                # Best effort: a failed decode is treated as "nothing
                # detected". Only Exception is caught so Ctrl-C and
                # SystemExit still stop the run. Note that `continue` also
                # skips the progress line for this batch.
                print("")
                print("No object detected in batch: {}".format(nbatch))
                continue

            (htot, wtot) = [d.cpu().numpy() for d in img_size]
            img_ids = img_id.cpu().numpy()
            # Iterate over batch elements
            for img_id_, wtot_, htot_, result in zip(img_ids, wtot, htot,
                                                     results):
                boxes, classes, scores = [r.cpu().numpy() for r in result]
                # Iterate over image detections
                for loc_, label_, prob_ in zip(boxes, classes, scores):
                    # Corner box (l, t, r, b) -> (x, y, w, h) in pixels.
                    ret.append([img_id_,
                                loc_[0] * wtot_,
                                loc_[1] * htot_,
                                (loc_[2] - loc_[0]) * wtot_,
                                (loc_[3] - loc_[1]) * htot_,
                                prob_,
                                inv_map[label_]])

        if log_interval and not (nbatch + 1) % log_interval:
            print("Completed inference on batch: {}".format(nbatch + 1))

    print("")
    print("Predicting Ended, total time: {:.2f} s".format(
        time.time() - start))

    cocoDt = cocoGt.loadRes(np.array(ret))

    E = COCOeval(cocoGt, cocoDt, iouType='bbox')
    E.evaluate()
    E.accumulate()
    E.summarize()
    print("Current AP: {:.5f} AP goal: {:.5f}".format(E.stats[0], threshold))

    # put your model back into training mode
    model.train()

    current_accuracy = E.stats[0]

    ssd_print(key=mllog_const.EVAL_ACCURACY,
              value=current_accuracy,
              metadata={mllog_const.EPOCH_NUM: epoch},
              sync=False)
    mllogger.end(key=mllog_const.EVAL_STOP,
                 metadata={mllog_const.EPOCH_NUM: epoch})

    # Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]
    return current_accuracy >= threshold
def __init__(self, p=0.5):
    """Remember the flip probability and record it in the MLPerf log.

    Args:
        p: value stored on the instance as ``self.p`` and logged under
            ``mlperf_log.RANDOM_FLIP_PROBABILITY``. Defaults to 0.5.
    """
    self.p = p
    # Log what was actually stored on the instance.
    ssd_print(
        value=self.p,
        key=mlperf_log.RANDOM_FLIP_PROBABILITY,
    )
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 and evaluate it at the configured epochs.

    The function seeds all RNGs, optionally initialises
    ``torch.distributed``, builds the datasets/dataloaders, model, loss and
    SGD optimizer, optionally restores a checkpoint and applies
    ``ipex.optimize``, and then runs the epoch loop. After the epochs
    selected by ``args.val_epochs`` / ``args.val_interval`` it averages the
    batch-norm running statistics across workers (distributed only),
    optionally saves a checkpoint under ``./models`` and calls
    ``coco_eval`` on every rank.

    When ``args.performance_only`` is set, iterations from
    ``args.warmup_iterations`` onwards are timed, training stops once
    ``args.train_iteration`` iterations have run, and a latency/throughput
    summary is printed.

    NOTE(review): this block was reconstructed from whitespace-collapsed
    source. Where nesting was ambiguous (for example whether EPOCH_STOP is
    logged inside the validation branch) the layout of the upstream MLPerf
    reference script was assumed - please confirm.

    Args:
        args: parsed command-line namespace. It is also mutated:
            ``args.distributed``, ``args.rank`` and (CUDA distributed path)
            ``args.seed`` are assigned here.

    Returns:
        bool: ``True`` as soon as an evaluation reaches ``args.threshold``,
        otherwise ``False`` after the loop finishes.

    Raises:
        ImportError: CUDA is in use but ``apex`` cannot be imported.
        AssertionError: ``args.batch_size`` is not divisible by
            ``args.batch_splits``.
        SystemExit: the (currently unreachable) fragmented-batch path is hit.
    """
    global torch
    import contextlib
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        # Only a genuinely missing apex is reported as such; previously a
        # bare `except:` also re-labelled e.g. a malformed WORLD_SIZE as an
        # import problem.
        try:
            from apex.parallel import DistributedDataParallel as DDP  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    local_seed = args.seed
    os.environ['USE_CUDA'] = str(use_cuda)
    if args.world_size > 1:
        args.distributed = True

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        print('Distributed training with DDP')
        if args.no_cuda:
            device = torch.device('cpu')
            # Prefer the PMI_* variables when present, falling back to the
            # command-line values.
            os.environ['RANK'] = str(os.environ.get('PMI_RANK', args.rank))
            os.environ['WORLD_SIZE'] = str(
                os.environ.get('PMI_SIZE', args.world_size))
            os.environ['MASTER_ADDR'] = args.master_addr
            os.environ['MASTER_PORT'] = args.port

            # Initialize the process group with ccl backend
            if args.backend == 'ccl':
                # Imported for its side effect only.
                import torch_ccl  # noqa: F401
            dist.init_process_group(backend=args.backend)
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32

    mllogger.event(key=mllog_const.SEED, value=local_seed)
    # Refer to https://pytorch.org/docs/stable/notes/randomness.html#dataloader
    torch.manual_seed(local_seed)  # Set PyTorch seed
    np.random.seed(seed=local_seed)  # Set Numpy seed
    random.seed(local_seed)  # Set the Python seed

    args.rank = dist.get_rank() if args.distributed else args.local_rank
    print("args.rank = {}".format(args.rank))
    print("local rank = {}".format(args.local_rank))
    print("distributed={}".format(args.distributed))

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(
        dboxes, (input_size, input_size), val=False,
        num_cropping_iterations=args.num_cropping_iterations)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco))
    mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # Shuffle in the DataLoader only when no sampler does it for us.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=0)
    # Leslie: here is the workaround: dist.broadcast will fail on other
    # rank. We will run evaluation on all the ranks, so every rank gets a
    # validation loader.
    val_dataloader = DataLoader(val_coco,
                                batch_size=args.val_batch_size or args.batch_size,
                                shuffle=False,
                                sampler=None,
                                num_workers=0)

    ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone)

    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    global_batch_size = N_gpu * args.batch_size
    mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size)
    # Reference doesn't support group batch norm, so bn_span==local_batch_size
    mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size)
    # Linear LR scaling relative to a global batch of 32.
    current_lr = args.lr * (global_batch_size / 32)

    assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits"
    fragment_size = args.batch_size // args.batch_splits
    if args.batch_splits != 1:
        # NOTE(review): this message is printed, but the loop below never
        # splits a batch (see `naive_train_case`).
        print("using gradient accumulation with fragments of size {}".format(
            fragment_size))

    # Model to NHWC
    ssd300 = ssd300.to(memory_format=torch.channels_last)

    current_momentum = 0.9
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=args.weight_decay)
    ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr)
    ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay)

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb)
        # `optim` is looked up at call time, so the optimizer returned by
        # ipex.optimize() below is the one that gets warmed up.
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)
    ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=args.lr_decay_schedule)
    mllogger.start(key=mllog_const.BLOCK_START,
                   metadata={
                       mllog_const.FIRST_EPOCH_NUM: 1,
                       mllog_const.EPOCH_COUNT: args.epochs
                   })

    if args.performance_only:
        train_time = AverageMeter('TrainTime', ':6.3f')
        progress = ProgressMeter(args.train_iteration, [train_time],
                                 prefix='Train: ')

    # Restore the model and optim from checkpoint
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        # NOTE(review): torch.load unpickles the file - only load
        # checkpoints from a trusted source.
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
        optim.load_state_dict(od['optim'])

    # Model Prepack
    if use_ipex:
        if args.autocast:
            ssd300, optim = ipex.optimize(ssd300,
                                          dtype=torch.bfloat16,
                                          optimizer=optim)
        else:
            ssd300, optim = ipex.optimize(ssd300,
                                          dtype=torch.float32,
                                          optimizer=optim)

    # parallelize
    if args.distributed:
        device_ids = None
        ssd300 = torch.nn.parallel.DistributedDataParallel(
            ssd300, device_ids=device_ids)

    optim.zero_grad(set_to_none=True)
    for epoch in range(args.epochs):
        mllogger.start(key=mllog_const.EPOCH_START,
                       metadata={mllog_const.EPOCH_NUM: epoch})
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr

        for nbatch, (img, img_id, img_size, bbox,
                     label) in enumerate(train_dataloader):
            naive_train_case = True  # img.shape[0] == fragment_size
            if not naive_train_case:
                # Train case: when split input to several fragment size.
                # Not implemented in this variant. The earlier draft split
                # img/bbox/label with torch.split(..., fragment_size), ran
                # forward/backward per fragment and weighted each fragment
                # loss by fragment_size / batch_size before one optim.step().
                print("Not support input with several fragment size yet.")
                raise SystemExit(-1)

            # Naive train case: the whole batch is processed in one step.
            fimg, gloc, glabel, mask, pos_num, neg_num, num_mask = data_preprocess(
                img, bbox, label, loss_func, args.autocast)

            # Only iterations past the warm-up count towards the timing.
            timed = args.performance_only and iter_num >= args.warmup_iterations
            if timed:
                start_time = time.time()

            # Iteration 30 is the one captured by the profiler.
            profile_this_step = (args.profile and args.performance_only
                                 and iter_num == 30)
            with contextlib.ExitStack() as step_ctx:
                if profile_this_step:
                    step_ctx.enter_context(
                        torch.profiler.profile(on_trace_ready=trace_handler))
                with torch.cpu.amp.autocast(enabled=args.autocast):
                    ploc, plabel = ssd300(fimg)
                    loss = loss_func(ploc, plabel, gloc, glabel, mask,
                                     pos_num, neg_num, num_mask,
                                     args.autocast)
                loss.backward()

                warmup_step(iter_num, current_lr)
                optim.step()
                optim.zero_grad(set_to_none=True)

            if timed:
                train_time.update(time.time() - start_time)
            if args.performance_only and iter_num % args.print_freq == 0:
                progress.display(iter_num)

            loss_value = loss.item()
            if not np.isinf(loss_value):
                # Exponential moving average of the loss.
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value
            if args.log_interval and not iter_num % args.log_interval:
                print("Iteration: {:6d}, Loss function: {:5.8f}, Average Loss: {:.8f}"\
                    .format(iter_num, loss_value, avg_loss))
            iter_num += 1
            if args.performance_only and iter_num >= args.train_iteration:
                break

        # Leave the epoch loop as well once the benchmark budget is used up.
        if args.performance_only and iter_num >= args.train_iteration:
            break

        if (args.val_epochs and (epoch+1) in args.val_epochs) or \
           (args.val_interval and not (epoch+1) % args.val_interval):
            if args.distributed:
                # Average the batch-norm running statistics over all workers
                # so every rank evaluates the same model.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var'
                                                       in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
                        ssd_print(key=mllog_const.MODEL_BN_SPAN,
                                  value=bn_buf.cpu().detach().numpy())

            # Leslie: here is the workaround: dist.broadcast will fail on
            # other rank, so saving and evaluation run on all the ranks
            # (the former `args.rank == 0 or True` guard was always true).
            # NOTE(review): every rank writes the same checkpoint path.
            if not args.no_save:
                print("")
                print("saving model...")
                torch.save(
                    {
                        "model": ssd300.state_dict(),
                        "label_map": train_coco.label_info,
                        "optim": optim.state_dict()
                    }, "./models/iter_{}.pt".format(iter_num))

            if coco_eval(ssd300,
                         val_dataloader,
                         cocoGt,
                         encoder,
                         inv_map,
                         args.threshold,
                         epoch + 1,
                         iter_num,
                         log_interval=args.log_interval,
                         nms_valid_thresh=args.nms_valid_thresh,
                         use_autocast=args.autocast):
                success = torch.ones(1)
                if use_cuda:
                    success = success.cuda()
            # Leslie: same workaround: since we run evaluation on all ranks,
            # we don't need to broadcast the evaluation result.
            if success[0]:
                return True
            # NOTE(review): placed inside the validation branch as in the
            # upstream reference script; confirm EPOCH_STOP is not meant to
            # be logged for every epoch.
            mllogger.end(key=mllog_const.EPOCH_STOP,
                         metadata={mllog_const.EPOCH_NUM: epoch})

    mllogger.end(key=mllog_const.BLOCK_STOP,
                 metadata={
                     mllog_const.FIRST_EPOCH_NUM: 1,
                     mllog_const.EPOCH_COUNT: args.epochs
                 })

    if args.performance_only:
        batch_size = args.batch_size
        # Guard against a run in which no iteration was timed; presumably
        # AverageMeter.avg stays 0 until update() is called - TODO confirm.
        if train_time.avg:
            latency = train_time.avg / batch_size * 1000
            perf = batch_size / train_time.avg
            print('train latency %.2f ms' % latency)
            print('train performance %.2f fps' % perf)
            print("Throughput: {:.3f} fps".format(perf))
        else:
            print("No timed iterations were recorded; "
                  "skipping the performance summary.")
    return False
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 on CPU, CUDA or HPU and evaluate it.

    The function configures the target device (including the Habana
    environment variables, the optional HMP mixed-precision conversion and
    the HCCL/NCCL process group), seeds all RNGs, builds datasets,
    dataloaders, model, loss and the (optionally fused) SGD optimizer, and
    runs the epoch loop with gradient accumulation over
    ``args.batch_splits`` fragments. After the epochs selected by
    ``args.val_epochs`` / ``args.val_interval`` it averages the batch-norm
    running statistics across workers (distributed only); rank 0 then
    optionally saves a checkpoint under ``./models`` and runs
    ``coco_eval``, whose verdict is broadcast to the other ranks.

    NOTE(review): this block was reconstructed from whitespace-collapsed
    source. Where nesting was ambiguous (the two ``print`` calls after the
    WORLD_SIZE check, the ``mark_step()`` following the channels-last
    conversion, and EPOCH_STOP inside the validation branch) the layout of
    the upstream scripts was assumed - please confirm.

    Args:
        args: parsed command-line namespace. It is also mutated:
            ``args.distributed``, ``args.rank`` and, depending on the path,
            ``args.world_size``, ``args.dist_backend`` and ``args.seed`` are
            assigned here.

    Returns:
        bool: ``True`` as soon as an evaluation reaches ``args.threshold``,
        otherwise ``False`` after the loop finishes.

    Raises:
        ImportError: CUDA is in use but ``apex`` cannot be imported.
        IOError: HMP is enabled without the BF16 or FP32 op list.
        AssertionError: ``args.batch_size`` is not divisible by
            ``args.batch_splits``, or ``args.end_iteration`` is not greater
            than ``args.iteration``.
    """
    global torch
    import contextlib
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if args.use_hpu:
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1
            args.world_size = int(os.environ['WORLD_SIZE'])
        print("world_size = {}".format(args.world_size))
        print("distributed={}".format(args.distributed))
    if use_cuda:
        # Only a genuinely missing apex is reported as such; previously a
        # bare `except:` also re-labelled e.g. a malformed WORLD_SIZE as an
        # import problem.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    use_hpu = args.use_hpu
    hpu_channels_last = args.hpu_channels_last
    hpu_lazy_mode = args.hpu_lazy_mode
    is_hmp = args.is_hmp
    device = torch.device('cpu')
    data_loader_type = DataLoader

    if use_hpu:
        device = torch.device('hpu')
        if args.distributed:
            os.environ["MAX_WAIT_ATTEMPTS"] = "90"
        os.environ["PT_HPU_LAZY_MODE"] = "1" if hpu_lazy_mode else "2"
        if is_hmp:
            if not args.hmp_bf16:
                raise IOError("Please provide list of BF16 ops")
            if not args.hmp_fp32:
                raise IOError("Please provide list of FP32 ops")
            from habana_frameworks.torch.hpex import hmp
            hmp.convert(opt_level=args.hmp_opt_level,
                        bf16_file_path=args.hmp_bf16,
                        fp32_file_path=args.hmp_fp32,
                        isVerbose=args.hmp_verbose)
        from habana_frameworks.torch.utils.library_loader import load_habana_module
        load_habana_module()
        # TODO - add dataloader

    @contextlib.contextmanager
    def _hmp_casts_disabled():
        """Run the body under hmp.disable_casts() when HMP is active."""
        if use_hpu and is_hmp:
            # `hmp` is only bound when the HMP import above has run.
            with hmp.disable_casts():
                yield
        else:
            yield

    local_seed = args.seed
    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        if use_hpu:
            args.dist_backend = 'hccl'
            # Imported for its side effect only.
            import habana_frameworks.torch.core.hccl  # noqa: F401
            os.environ["ID"] = os.environ["RANK"]
            dist.init_process_group(args.dist_backend, init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device, use_hpu=True)
            local_seed = (args.seed + dist.get_rank()) % 2**32
        elif args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32

    mllogger.event(key=mllog_const.SEED, value=local_seed)
    torch.manual_seed(local_seed)
    np.random.seed(seed=local_seed)
    random.seed(local_seed)  # amorgenstern
    torch.cuda.manual_seed(local_seed)  # amorgenstern

    args.rank = dist.get_rank() if args.distributed else args.local_rank
    print("args.rank = {}".format(args.rank))
    print("local rank = {}".format(args.local_rank))
    print("distributed={}".format(args.distributed))

    with _hmp_casts_disabled():
        dboxes = dboxes300_coco()
        encoder = Encoder(dboxes)

    input_size = 300
    with _hmp_casts_disabled():
        train_trans = SSDTransformer(
            dboxes, (input_size, input_size), val=False,
            num_cropping_iterations=args.num_cropping_iterations)
        val_trans = SSDTransformer(dboxes, (input_size, input_size),
                                   val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    with _hmp_casts_disabled():
        cocoGt = COCO(annotation_file=val_annotate)
        train_coco = COCODetection(train_coco_root, train_annotate,
                                   train_trans)
        val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco))
    mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None

    if use_hpu:
        # patch torch cuda functions that are being unconditionally invoked
        # in the multiprocessing data loader
        torch.cuda.current_device = lambda: None
        torch.cuda.set_device = lambda x: None

    # Shuffle in the DataLoader only when no sampler does it for us.
    train_dataloader = data_loader_type(train_coco,
                                        batch_size=args.batch_size,
                                        shuffle=(train_sampler is None),
                                        sampler=train_sampler,
                                        num_workers=args.num_workers)
    # Only rank 0 evaluates, so only rank 0 needs a validation loader.
    if args.rank == 0:
        val_dataloader = data_loader_type(
            val_coco,
            batch_size=args.val_batch_size or args.batch_size,
            shuffle=False,
            sampler=None,
            num_workers=args.num_workers)
    else:
        val_dataloader = None

    ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        # NOTE(review): torch.load unpickles the file - only load
        # checkpoints from a trusted source.
        od = torch.load(args.checkpoint, map_location=torch.device('cpu'))
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    with _hmp_casts_disabled():
        loss_func = Loss(dboxes, use_hpu=use_hpu, hpu_device=device)
    if use_cuda:
        loss_func.cuda()
    if use_hpu:
        ssd300.to(device)
        loss_func.to(device)
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    global_batch_size = N_gpu * args.batch_size
    mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size)
    # Reference doesn't support group batch norm, so bn_span==local_batch_size
    mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size)
    # Linear LR scaling relative to a global batch of 32.
    current_lr = args.lr * (global_batch_size / 32)

    assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits"
    fragment_size = args.batch_size // args.batch_splits
    if args.batch_splits != 1:
        print("using gradient accumulation with fragments of size {}".format(
            fragment_size))

    current_momentum = 0.9
    sgd_optimizer = torch.optim.SGD
    if use_hpu and hpu_lazy_mode:
        from habana_frameworks.torch.hpex.optimizers import FusedSGD
        sgd_optimizer = FusedSGD
    optim = sgd_optimizer(ssd300.parameters(),
                          lr=current_lr,
                          momentum=current_momentum,
                          weight_decay=args.weight_decay)
    if use_hpu:
        permute_params(model=ssd300, to_filters_last=True,
                       lazy_mode=hpu_lazy_mode)
        permute_momentum(optimizer=optim, to_filters_last=True,
                         lazy_mode=hpu_lazy_mode)

    ssd_print(device=device, use_hpu=use_hpu,
              key=mllog_const.OPT_BASE_LR, value=current_lr)
    ssd_print(device=device, use_hpu=use_hpu,
              key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay)

    # parallelize
    if args.distributed:
        if use_hpu:
            ssd300 = torch.nn.parallel.DistributedDataParallel(
                ssd300,
                bucket_cap_mb=100,
                broadcast_buffers=False,
                gradient_as_bucket_view=True)
        else:
            ssd300 = DDP(ssd300)

    iter_num = args.iteration
    end_iter_num = args.end_iteration
    if end_iter_num:
        print("--end-iteration set to: {}".format(end_iter_num))
        assert end_iter_num > iter_num, "--end-iteration must have a value > --iteration"
    avg_loss = 0.0
    if use_hpu:
        # Per-iteration losses kept on the device; they are only pulled to
        # the host when a log line is due.
        loss_iter = list()
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()
    if use_hpu:
        success = success.to(device)

    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        ssd_print(device=device, use_hpu=use_hpu,
                  key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb)
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    ssd_print(device=device, use_hpu=use_hpu,
              key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)
    ssd_print(device=device, use_hpu=use_hpu,
              key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=args.lr_decay_schedule)
    mllogger.start(key=mllog_const.BLOCK_START,
                   metadata={
                       mllog_const.FIRST_EPOCH_NUM: 1,
                       mllog_const.EPOCH_COUNT: args.epochs
                   })

    optim.zero_grad(set_to_none=True)
    if use_hpu:
        start = time.time()
    for epoch in range(args.epochs):
        mllogger.start(key=mllog_const.EPOCH_START,
                       metadata={mllog_const.EPOCH_NUM: epoch})
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr

        for nbatch, (img, img_id, img_size, bbox,
                     label) in enumerate(train_dataloader):
            current_batch_size = img.shape[0]
            # Split batch for gradient accumulation
            img = torch.split(img, fragment_size)
            bbox = torch.split(bbox, fragment_size)
            label = torch.split(label, fragment_size)

            for (fimg, fbbox, flabel) in zip(img, bbox, label):
                current_fragment_size = fimg.shape[0]
                if not use_hpu:
                    trans_bbox = fbbox.transpose(1, 2).contiguous()
                if use_cuda:
                    fimg = fimg.cuda()
                    trans_bbox = trans_bbox.cuda()
                    flabel = flabel.cuda()
                if use_hpu:
                    fimg = fimg.to(device)
                    if hpu_channels_last:
                        fimg = fimg.contiguous(
                            memory_format=torch.channels_last)
                        if hpu_lazy_mode:
                            mark_step()
                    with _hmp_casts_disabled():
                        # TODO revert after SW-58188 is fixed
                        # (the box tensor is moved to the device before it
                        # is transposed).
                        trans_bbox = fbbox.to(device).transpose(
                            1, 2).contiguous()
                        flabel = flabel.to(device)

                fimg = Variable(fimg, requires_grad=True)
                if args.lowp:  # amorgenstern
                    import lowp
                    with lowp.Lowp(mode='BF16', warn_patched=True,
                                   warn_not_patched=True):
                        ploc, plabel = ssd300(fimg)
                        gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                                       Variable(flabel, requires_grad=False)
                        loss = loss_func(ploc, plabel, gloc, glabel)
                else:
                    ploc, plabel = ssd300(fimg)
                    if use_hpu and is_hmp:
                        # Under HMP the loss is computed on float32 copies
                        # of the predictions with casts disabled.
                        with hmp.disable_casts():
                            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                                           Variable(flabel, requires_grad=False)
                            loss = loss_func(ploc.float(), plabel.float(),
                                             gloc, glabel)
                    else:
                        gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                                       Variable(flabel, requires_grad=False)
                        loss = loss_func(ploc, plabel, gloc, glabel)

                # weighted mean over the fragments of this batch
                loss = loss * (current_fragment_size / current_batch_size)
                if use_hpu and hpu_lazy_mode and args.distributed:
                    mark_step()
                loss.backward()
                if use_hpu and hpu_lazy_mode:
                    mark_step()

            warmup_step(iter_num, current_lr)
            with _hmp_casts_disabled():
                optim.step()
            optim.zero_grad(set_to_none=True)

            if use_hpu:
                loss_iter.append(loss.clone().detach())
            else:
                if not np.isinf(loss.item()):
                    avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            if use_hpu and hpu_lazy_mode:
                mark_step()

            if use_hpu:
                if args.log_interval and not iter_num % args.log_interval:
                    # Fold every loss recorded since the last log line into
                    # the moving average; the last one is the value printed.
                    cur_loss = 0.0
                    for pending in loss_iter:
                        cur_loss = pending.cpu().item()
                        if not np.isinf(cur_loss):
                            avg_loss = 0.999 * avg_loss + 0.001 * cur_loss
                    if args.rank == 0:
                        print("Rank: {:6d}, Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                            .format(args.rank, iter_num, cur_loss, avg_loss))
                    loss_iter = list()
            else:
                if args.rank == 0 and args.log_interval and not iter_num % args.log_interval:
                    print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                        .format(iter_num, loss.item(), avg_loss))

            iter_num += 1
            # Restart the wall-clock timer after 50 iterations.
            if use_hpu and iter_num == 50:
                start = time.time()
            if end_iter_num and iter_num >= end_iter_num:
                if use_hpu:
                    print("Training Ended, total time: {:.2f} s".format(
                        time.time() - start))
                # NOTE(review): this only leaves the batch loop; the epoch
                # loop keeps going (one batch per remaining epoch). Confirm
                # whether an outer break is intended.
                break

        if (args.val_epochs and (epoch+1) in args.val_epochs) or \
           (args.val_interval and not (epoch+1) % args.val_interval):
            if args.distributed:
                # Average the batch-norm running statistics over all workers
                # before rank 0 evaluates.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var'
                                                       in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
                        ssd_print(device=device, use_hpu=use_hpu,
                                  key=mllog_const.MODEL_BN_SPAN,
                                  value=bn_buf)
            if args.rank == 0:
                if use_hpu:
                    print("Training Ended, total time: {:.2f} s".format(
                        time.time() - start))
                if not args.no_save:
                    print("")
                    print("saving model...")
                    if use_hpu:
                        # Undo the filters-last permutation, copy the
                        # weights into a fresh SSD300 for saving, then
                        # restore the permuted layout for further training.
                        permute_params(model=ssd300, to_filters_last=False,
                                       lazy_mode=hpu_lazy_mode)
                        ssd300_copy = SSD300(
                            train_coco.labelnum,
                            model_path=args.pretrained_backbone)
                        if args.distributed:
                            ssd300_copy.load_state_dict(
                                ssd300.module.state_dict())
                        else:
                            ssd300_copy.load_state_dict(ssd300.state_dict())
                        torch.save(
                            {
                                "model": ssd300_copy.state_dict(),
                                "label_map": train_coco.label_info
                            }, "./models/iter_{}.pt".format(iter_num))
                        permute_params(model=ssd300, to_filters_last=True,
                                       lazy_mode=hpu_lazy_mode)
                    else:
                        torch.save(
                            {
                                "model": ssd300.state_dict(),
                                "label_map": train_coco.label_info
                            }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300,
                             val_dataloader,
                             cocoGt,
                             encoder,
                             inv_map,
                             args.threshold,
                             epoch + 1,
                             iter_num,
                             log_interval=args.log_interval,
                             use_cuda=use_cuda,
                             use_hpu=use_hpu,
                             hpu_device=device,
                             is_hmp=is_hmp,
                             hpu_channels_last=hpu_channels_last,
                             hpu_lazy_mode=hpu_lazy_mode,
                             nms_valid_thresh=args.nms_valid_thresh):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
                    if use_hpu:
                        success = success.to(device)
            # Every rank takes part in the broadcast of rank 0's verdict.
            if args.distributed:
                dist.broadcast(success, 0)
            if success[0]:
                return True
            # NOTE(review): placed inside the validation branch as in the
            # upstream reference script; confirm EPOCH_STOP is not meant to
            # be logged for every epoch.
            mllogger.end(key=mllog_const.EPOCH_STOP,
                         metadata={mllog_const.EPOCH_NUM: epoch})

    mllogger.end(key=mllog_const.BLOCK_STOP,
                 metadata={
                     mllog_const.FIRST_EPOCH_NUM: 1,
                     mllog_const.EPOCH_COUNT: args.epochs
                 })

    return False