def im_detect_ratio(net, im, target_size1, target_size2):
    device = net.arm_conf[0].weight.device
    h, w, _ = im.shape
    scale = torch.Tensor([w, h, w, h])
    scale = scale.to(device)
    im_orig = im.astype(np.float32, copy=True)
    if im_orig.shape[0] < im_orig.shape[1]:
        target_size1, target_size2 = target_size2, target_size1
    im = cv2.resize(im_orig, None, None,
                    fx=float(target_size2) / float(w),
                    fy=float(target_size1) / float(h),
                    interpolation=cv2.INTER_LINEAR)
    x = (im - MEANS).astype(np.float32)
    x = x[:, :, (2, 1, 0)]  # to rgb
    x = x.transpose(2, 0, 1)
    x = torch.from_numpy(x).unsqueeze(0)
    x = x.to(device)
    arm_loc, arm_conf, adm_loc, adm_conf, feat_sizes = net(x)
    priorbox = PriorBox(net.cfg, feat_sizes, (target_size1, target_size2), phase='test')
    priors = priorbox.forward()
    priors = priors.to(device)
    det = detect.forward(arm_loc, arm_conf, adm_loc, adm_conf, priors, scale)
    return det
def __init__(self, config, phase, base, extras, head, num_classes, top_k=200):
    super(SSD, self).__init__()
    self.phase = phase
    self.num_classes = num_classes
    # TODO: implement __call__ in PriorBox
    self.priorbox = PriorBox(config)
    self.priors = Variable(self.priorbox.forward(), volatile=True)
    # SSD network
    self.vgg = nn.ModuleList(base)
    # Layer learns to scale the l2 normalized features from conv4_3
    self.L2Norm = L2Norm(512, 20)
    self.extras = nn.ModuleList(extras)
    self.loc = nn.ModuleList(head[0])
    self.conf = nn.ModuleList(head[1])
    if phase == 'test':
        self.softmax = nn.Softmax(dim=-1)
        self.detect = Detect(num_classes, 0, top_k, 0.01, 0.45,
                             variance=config['variance'])
def im_detect(net, im, target_size):
    try:
        device = net.arm_conf[0].weight.device
    except AttributeError:
        device = net.odm_conf[0].weight.device
    h, w, _ = im.shape
    scale = torch.Tensor([w, h, w, h])
    scale = scale.to(device)
    im_orig = im.astype(np.float32, copy=True)
    im = cv2.resize(im_orig, (target_size, target_size),
                    interpolation=cv2.INTER_LINEAR)
    x = (im - MEANS).astype(np.float32)
    x = x[:, :, (2, 1, 0)]  # to rgb
    x = x.transpose(2, 0, 1)
    x = torch.from_numpy(x).unsqueeze(0)
    x = x.to(device)
    if args.wo_refined_anchor:
        adm_loc, adm_conf, feat_sizes = net(x)
    else:
        arm_loc, arm_conf, adm_loc, adm_conf, feat_sizes = net(x)
    priorbox = PriorBox(net.cfg, feat_sizes, (target_size, target_size), phase='test')
    priors = priorbox.forward()
    priors = priors.to(device)
    if args.wo_refined_anchor:
        det = detect.forward(adm_loc, adm_conf, priors, scale)
    else:
        det = detect.forward(arm_loc, arm_conf, adm_loc, adm_conf, priors, scale)
    return det
def __init__(self, phase, size, base, extras, head, num_classes):
    super(SSD, self).__init__()
    self.phase = phase
    self.num_classes = num_classes
    self.cfg = (coco, voc)[num_classes == 21]
    self.priorbox = PriorBox(self.cfg)
    self.priors = Variable(self.priorbox.forward(), volatile=True)
    self.size = size
    # SSD network
    self.vgg = nn.ModuleList(base)
    # Layer learns to scale the l2 normalized features from conv4_3
    self.L2Norm = L2Norm(512, 20)
    self.extras = nn.ModuleList(extras)
    self.loc = nn.ModuleList(head[0])
    self.conf = nn.ModuleList(head[1])
    if phase == 'test':
        self.softmax = nn.Softmax(dim=-1)
        self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)
def __init__(self, num_classes, phase, pretrain=False, finetune=None):
    super(SSD300, self).__init__()
    self.num_classes = num_classes
    self.phase = phase
    self.base_net = self._base_net()
    self.extra_net = self._extra_net()
    self.loc_pred, self.cls_pred = self._predict_net()
    self.L2Norm = L2Norm(512, 20)
    self.priorbox = PriorBox(v2)
    self.priors = Variable(self.priorbox.forward(), volatile=True)
    if phase == 'test':
        self.softmax = nn.Softmax(dim=-1)
        self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)
    self._init_weight()
    if pretrain:
        self._load_weight()
    if finetune is not None:
        self._finetune(finetune)
def __init__(self, phase, size, base, extras, head, num_classes):
    super(TBPP, self).__init__()
    self.phase = phase
    self.num_classes = num_classes
    self.cfg = {
        'num_classes': 2,
        'lr_steps': (80000, 100000, 120000),
        'max_iter': 120000,
        'feature_maps': [64, 32, 16, 8, 4, 2, 1],
        'min_dim': 512,
        'steps': [8, 16, 32, 64, 128, 256, 512],
        'min_sizes': [20, 51, 133, 215, 296, 378, 460],
        'max_sizes': [51, 133, 215, 296, 378, 460, 542],
        'aspect_ratios': [[2, 3], [2, 3, 5], [2, 3, 5], [2, 3, 5],
                          [2, 3, 5], [2, 3], [2, 3]],  # TODO
        'variance': [0.1, 0.2],
        'clip': True,
        'name': 'MINE'
    }
    # calculate the size of prior boxes, i.e. default boxes
    self.priorbox = PriorBox(self.cfg)
    self.priors = Variable(self.priorbox.forward(), volatile=True)
    self.size = size
    # TBPP network
    self.vgg = nn.ModuleList(base)
    self.L2Norm = L2Norm(512, 20)
    self.extras = nn.ModuleList(extras)
    self.loc = nn.ModuleList(head[0])
    self.conf = nn.ModuleList(head[1])
    if phase == 'test':
        self.softmax = nn.Softmax(dim=-1)
        self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)
def __init__(self, num_classes):
    super(DSOD_64_16_1x1, self).__init__()
    self.num_classes = num_classes
    self.extractor = DenseNet_64_16_DSSD_s_Pred_D()
    self.loc_layers = nn.ModuleList()
    self.cls_layers = nn.ModuleList()
    self.cfg = cfg_320_64_16
    self.priorbox = PriorBox(self.cfg)
    self.priors = self.priorbox.forward()
    # in_channels = (768, 768, 768, 256, 256, 256)  # pred C
    in_channels = (256, 256, 256, 256, 256, 256)  # pred D
    num_anchors = (4, 6, 6, 6, 4, 4)
    for inC, num_anchor in zip(in_channels, num_anchors):
        # self.loc_layers += [nn.Conv2d(inC, num_anchor*4, kernel_size=3, padding=1)]
        # self.cls_layers += [nn.Conv2d(inC, num_anchor*num_classes, kernel_size=3, padding=1)]
        self.loc_layers += [
            nn.Sequential(
                nn.Conv2d(inC, num_anchor * 4, kernel_size=1, padding=0, bias=False),
                nn.BatchNorm2d(num_anchor * 4))
        ]
        self.cls_layers += [
            nn.Sequential(
                nn.Conv2d(inC, num_anchor * num_classes, kernel_size=1, padding=0, bias=False),
                nn.BatchNorm2d(num_anchor * num_classes))
        ]
    self.normalize = nn.ModuleList([L2Norm(chan, 20) for chan in in_channels])
    self.reset_parameters()
def __init__(self, num_classes):
    super(DSOD_64_16_GN, self).__init__()
    self.num_classes = num_classes
    self.extractor = DSSD_s_GN()
    self.loc_layers = nn.ModuleList()
    self.cls_layers = nn.ModuleList()
    self.cfg = cfg_320_64_16
    self.priorbox = PriorBox(self.cfg)
    self.priors = self.priorbox.forward()
    in_channels = channel_dict['DSSD']
    num_anchors = (4, 6, 6, 6, 4, 4)
    for inC, num_anchor in zip(in_channels, num_anchors):
        # self.loc_layers += [nn.Conv2d(inC, num_anchor*4, kernel_size=3, padding=1)]
        # self.cls_layers += [nn.Conv2d(inC, num_anchor*num_classes, kernel_size=3, padding=1)]
        self.loc_layers += [
            nn.Sequential(
                nn.Conv2d(inC, num_anchor * 4, kernel_size=3, padding=1, bias=False),
                nn.GroupNorm(4, num_anchor * 4))
        ]
        self.cls_layers += [
            nn.Sequential(
                nn.Conv2d(inC, num_anchor * num_classes, kernel_size=3, padding=1, bias=False),
                nn.GroupNorm(num_classes, num_anchor * num_classes))
        ]
    self.normalize = nn.ModuleList([L2Norm(chan, 20) for chan in in_channels])
    self.reset_parameters()
def forward(self, x):
    img_size = x.size()[2:]
    source = []
    x = self.conv1(x)
    x = self.bn1(x)
    # CReLU: concatenate relu(x) and relu(-x) to double the channels
    x = F.relu(torch.cat((F.relu(x), F.relu(-x)), 1))
    x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
    x = self.conv2(x)
    x = self.bn2(x)
    x = F.relu(torch.cat((F.relu(x), F.relu(-x)), 1))
    x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
    x = self.inception1(x)
    x = self.inception2(x)
    x = self.inception3(x)
    source.append(x)
    x = self.conv3_1(x)
    x = self.conv3_2(x)
    source.append(x)
    x = self.conv4_1(x)
    x = self.conv4_2(x)
    source.append(x)
    feature_maps = []
    for feat in source:
        feature_maps.append([feat.size(2), feat.size(3)])
    self.priors = Variable(PriorBox(img_size, feature_maps, cfg).forward())
    loc_preds, conf_preds = self.multilbox(source)
    if self.phase == 'test':
        output = self.test_det(loc_preds, self.softmax(conf_preds), self.priors)
    else:
        output = (loc_preds, conf_preds, self.priors)
    return output
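# A minimal standalone sketch of the CReLU pattern used in the forward above
# (F.relu(torch.cat((F.relu(x), F.relu(-x)), 1))); the CReLU class name here
# is illustrative, not a module defined elsewhere in this code.
import torch
import torch.nn as nn

class CReLU(nn.Module):
    """Concatenated ReLU: keeps positive and negative phase information by
    concatenating relu(x) and relu(-x), doubling the channel count."""

    def forward(self, x):
        return torch.cat((torch.relu(x), torch.relu(-x)), dim=1)

# Both halves of the concatenation are already non-negative, so the outer
# F.relu in the forward above does not change the values.
x = torch.randn(1, 8, 32, 32)
print(CReLU()(x).shape)  # torch.Size([1, 16, 32, 32])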
def train():
    if args.visdom:
        import visdom
        viz = visdom.Visdom()

    print('Loading the dataset...')
    if args.dataset == 'COCO':
        if args.dataset_root == VOC_ROOT:
            if not os.path.exists(COCOroot):
                parser.error('Must specify dataset_root if specifying dataset')
            print("WARNING: Using default COCO dataset_root because " +
                  "--dataset_root was not specified.")
            args.dataset_root = COCOroot
        cfg = coco_refinedet[args.input_size]
        train_sets = ['train2017']
        # train_sets = [('train2017', 'val2017')]
        dataset = COCODetection(COCOroot, train_sets,
                                SSDAugmentation(cfg['min_dim'], MEANS))
    elif args.dataset == 'VOC':
        '''if args.dataset_root == COCO_ROOT:
            parser.error('Must specify dataset if specifying dataset_root')'''
        cfg = voc_refinedet[args.input_size]
        dataset = VOCDetection(root=VOC_ROOT,
                               transform=SSDAugmentation(cfg['min_dim'], MEANS))

    print('Training RefineDet on:', dataset.name)
    print('Using the specified args:')
    print(args)

    refinedet_net = build_refinedet('train', int(args.input_size),
                                    cfg['num_classes'], backbone_dict)
    net = refinedet_net
    print(net)

    device = torch.device('cuda:0' if args.cuda else 'cpu')
    if args.ngpu > 1 and args.cuda:
        net = torch.nn.DataParallel(refinedet_net,
                                    device_ids=list(range(args.ngpu)))
        cudnn.benchmark = True
    net = net.to(device)

    if args.resume:
        print('Resuming training, loading {}...'.format(args.resume))
        state_dict = torch.load(args.resume)
        # create new OrderedDict that does not contain `module.`
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            head = k[:7]
            if head == 'module.':
                name = k[7:]  # remove `module.`
            else:
                name = k
            new_state_dict[name] = v
        refinedet_net.load_state_dict(new_state_dict)
    else:
        print('Initializing weights...')
        refinedet_net.init_weights(pretrained=pretrained)

    optimizer = optim.SGD(net.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    arm_criterion = RefineDetMultiBoxLoss(2, 0.5, True, 0, True,
                                          negpos_ratio, 0.5, False, args.cuda)
    odm_criterion = RefineDetMultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True,
                                          negpos_ratio, 0.5, False, args.cuda,
                                          use_ARM=True)
    priorbox = PriorBox(cfg)
    with torch.no_grad():
        priors = priorbox.forward()
        priors = priors.to(device)

    net.train()
    # loss counters
    arm_loc_loss = 0
    arm_conf_loss = 0
    odm_loc_loss = 0
    odm_conf_loss = 0
    epoch = 0 + args.resume_epoch

    epoch_size = math.ceil(len(dataset) / args.batch_size)
    max_iter = args.max_epoch * epoch_size
    stepvalues = (args.max_epoch * 2 // 3 * epoch_size,
                  args.max_epoch * 8 // 9 * epoch_size,
                  args.max_epoch * epoch_size)
    if args.dataset == 'VOC':
        stepvalues = (args.max_epoch * 2 // 3 * epoch_size,
                      args.max_epoch * 5 // 6 * epoch_size,
                      args.max_epoch * epoch_size)
    step_index = 0

    if args.resume_epoch > 0:
        start_iter = args.resume_epoch * epoch_size
        for step in stepvalues:
            if step < start_iter:
                step_index += 1
    else:
        start_iter = 0

    if args.visdom:
        vis_title = 'RefineDet.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot(viz, 'Iteration', 'Loss', vis_title, vis_legend)
        epoch_plot = create_vis_plot(viz, 'Epoch', 'Loss', vis_title, vis_legend)

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate,
                                  pin_memory=True)

    for iteration in range(start_iter, max_iter):
        if iteration % epoch_size == 0:
            if args.visdom and iteration != 0:
                update_vis_plot(viz, epoch, arm_loc_loss, arm_conf_loss,
                                epoch_plot, None, 'append', epoch_size)
            # reset epoch loss counters
            arm_loc_loss = 0
            arm_conf_loss = 0
            odm_loc_loss = 0
            odm_conf_loss = 0
            # create batch iterator
            batch_iterator = iter(data_loader)
            if (epoch % 10 == 0 and epoch > 0) or \
               (epoch % 5 == 0 and epoch > (args.max_epoch * 2 // 3)):
                torch.save(net.state_dict(),
                           args.save_folder + 'RefineDet' + args.input_size + '_' +
                           args.dataset + '_epoches_' + repr(epoch) + '.pth')
            epoch += 1

        t0 = time.time()
        if iteration in stepvalues:
            step_index += 1
        lr = adjust_learning_rate(optimizer, args.gamma, epoch, step_index,
                                  iteration, epoch_size)

        # load train data
        images, targets = next(batch_iterator)
        images = images.to(device)
        targets = [ann.to(device) for ann in targets]
        # for an in targets:
        #     for instance in an:
        #         for cor in instance[:-1]:
        #             if cor < 0 or cor > 1:
        #                 raise StopIteration
        # forward
        out = net(images)
        # backprop
        optimizer.zero_grad()
        arm_loss_l, arm_loss_c = arm_criterion(out, priors, targets)
        odm_loss_l, odm_loss_c = odm_criterion(out, priors, targets)
        arm_loss = arm_loss_l + arm_loss_c
        odm_loss = odm_loss_l + odm_loss_c
        loss = arm_loss + odm_loss
        loss.backward()
        optimizer.step()
        arm_loc_loss += arm_loss_l.item()
        arm_conf_loss += arm_loss_c.item()
        odm_loc_loss += odm_loss_l.item()
        odm_conf_loss += odm_loss_c.item()
        t1 = time.time()
        batch_time = t1 - t0
        eta = int(batch_time * (max_iter - iteration))
        print('Epoch:{}/{} || Epochiter: {}/{} || Iter: {}/{} || '
              'ARM_L Loss: {:.4f} ARM_C Loss: {:.4f} '
              'ODM_L Loss: {:.4f} ODM_C Loss: {:.4f} loss: {:.4f} || '
              'LR: {:.8f} || Batchtime: {:.4f} s || ETA: {}'.format(
                  epoch, args.max_epoch, (iteration % epoch_size) + 1,
                  epoch_size, iteration + 1, max_iter,
                  arm_loss_l.item(), arm_loss_c.item(),
                  odm_loss_l.item(), odm_loss_c.item(), loss.item(),
                  lr, batch_time, str(datetime.timedelta(seconds=eta))))

        if args.visdom:
            update_vis_plot(viz, iteration, arm_loss_l.item(), arm_loss_c.item(),
                            iter_plot, epoch_plot, 'append')

    torch.save(refinedet_net.state_dict(),
               args.save_folder + '/RefineDet{}_{}_final.pth'.format(
                   args.input_size, args.dataset))
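# The loop above calls adjust_learning_rate(optimizer, args.gamma, epoch,
# step_index, iteration, epoch_size), but the helper itself is not shown.
# Below is a plausible sketch matching that signature: linear warmup followed
# by step decay, as commonly used in RefineDet-style training scripts. The
# warm-up length (5 epochs) and the 1e-6 starting rate are assumptions, not
# values taken from the original script.
def adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration, epoch_size):
    warm_epoch = 5  # assumed warmup length
    if epoch <= warm_epoch:
        # ramp linearly from 1e-6 up to args.lr over the warmup iterations
        lr = 1e-6 + (args.lr - 1e-6) * iteration / (epoch_size * warm_epoch)
    else:
        lr = args.lr * (gamma ** step_index)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr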
def main():
    global args
    global minmum_loss
    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
    args.total_batch_size = args.world_size * args.batch_size

    # data loading code
    if args.dataset == 'COCO':
        train_sets = [('2014', 'train'), ('2014', 'valminusminival')]
        cfg = (COCO_300, COCO_512)[args.size == '512']
    elif args.dataset == 'VOC':
        train_sets = [('2007', 'trainval'), ('2012', 'trainval')]
        cfg = (VOC_300, VOC_512)[args.size == '512']

    # other important parameters
    img_dim = (300, 512)[args.size == '512']
    rgb_means = ((104, 117, 123), (103.94, 116.78, 123.68))[args.version == 'RFB_mobile']
    p = (0.6, 0.2)[args.version == 'RFB_mobile']
    num_classes = (21, 81)[args.dataset == 'COCO']

    if args.dataset == 'COCO':
        dataset = COCODetection(root=cfg['coco_root'], image_sets=train_sets,
                                preproc=preproc(img_dim, rgb_means, p))
    elif args.dataset == 'VOC':
        dataset = VOCDetection(root=cfg['voc_root'], image_sets=train_sets,
                               preproc=preproc(img_dim, rgb_means, p),
                               target_transform=AnnotationTransform())

    print('Training SSD on:', dataset.name)
    print('Loading the dataset...')
    train_loader = data.DataLoader(dataset, args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True, collate_fn=detection_collate,
                                   pin_memory=True)

    print("Build RFB network")
    if args.version == 'RFB_vgg':
        model = RFB_Net_vgg('train', img_dim, num_classes)
    elif args.version == 'RFB_E_vgg':
        model = RFB_Net_E_vgg('train', img_dim, num_classes)
    elif args.version == 'RFB_mobile':
        model = RFB_Net_mobile('train', img_dim, num_classes)
    else:
        print('Unknown version!')

    if args.pretrained:
        base_weights = torch.load(args.save_folder + args.basenet)
        print('Loading base network...')
        model.base.load_state_dict(base_weights)
    model = model.cuda()

    # optimizer and loss function
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, False)

    # get the priorbox of ssd
    priorbox = PriorBox(cfg)
    with torch.no_grad():
        priors = priorbox.forward()
        priors = priors.cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            minmum_loss = checkpoint['minmum_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        print('Initializing weights...')
        # initialize newly added layers' weights with xavier method
        model.extras.apply(weights_init)
        model.loc.apply(weights_init)
        model.conf.apply(weights_init)
        model.Norm.apply(weights_init)
        if args.version == 'RFB_E_vgg':
            model.reduce.apply(weights_init)
            model.up_reduce.apply(weights_init)

    print('Using the specified args:')
    print(args)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        loss = train(train_loader, model, priors, criterion, optimizer, epoch)
        # remember best loss and save checkpoint
        if args.local_rank == 0:
            is_best = loss < minmum_loss
            minmum_loss = min(loss, minmum_loss)
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': minmum_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, epoch)
def SSD300(input_shape, num_classes=21):
    """SSD300 architecture.

    # Arguments
        input_shape: Shape of the input image, expected to be either
            (300, 300, 3) or (3, 300, 300) (not tested).
        num_classes: Number of classes including background.

    # References
        https://arxiv.org/abs/1512.02325
    """
    print('begin building networks')
    kernel_size = (3, 3)
    net = {}
    # Block 1
    input_tensor = Input(shape=input_shape)
    img_size = (input_shape[1], input_shape[0])
    net['input'] = input_tensor
    net['conv1_1'] = Conv2D(64, kernel_size, activation='relu', padding='same', name='conv1_1')(net['input'])
    net['conv1_2'] = Conv2D(64, kernel_size, activation='relu', padding='same', name='conv1_2')(net['conv1_1'])
    net['pool1'] = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(net['conv1_2'])
    # Block 2
    net['conv2_1'] = Conv2D(128, kernel_size, activation='relu', padding='same', name='conv2_1')(net['pool1'])
    net['conv2_2'] = Conv2D(128, kernel_size, activation='relu', padding='same', name='conv2_2')(net['conv2_1'])
    net['pool2'] = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(net['conv2_2'])
    # Block 3
    net['conv3_1'] = Conv2D(256, kernel_size, activation='relu', padding='same', name='conv3_1')(net['pool2'])
    net['conv3_2'] = Conv2D(256, kernel_size, activation='relu', padding='same', name='conv3_2')(net['conv3_1'])
    net['conv3_3'] = Conv2D(256, kernel_size, activation='relu', padding='same', name='conv3_3')(net['conv3_2'])
    net['pool3'] = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(net['conv3_3'])
    # Block 4
    net['conv4_1'] = Conv2D(512, kernel_size, activation='relu', padding='same', name='conv4_1')(net['pool3'])
    net['conv4_2'] = Conv2D(512, kernel_size, activation='relu', padding='same', name='conv4_2')(net['conv4_1'])
    net['conv4_3'] = Conv2D(512, kernel_size, activation='relu', padding='same', name='conv4_3')(net['conv4_2'])
    net['pool4'] = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(net['conv4_3'])
    # Block 5
    net['conv5_1'] = Conv2D(512, kernel_size, activation='relu', padding='same', name='conv5_1')(net['pool4'])
    net['conv5_2'] = Conv2D(512, kernel_size, activation='relu', padding='same', name='conv5_2')(net['conv5_1'])
    net['conv5_3'] = Conv2D(512, kernel_size, activation='relu', padding='same', name='conv5_3')(net['conv5_2'])
    net['pool5'] = MaxPooling2D((3, 3), strides=(1, 1), padding='same', name='pool5')(net['conv5_3'])
    # FC6
    net['fc6'] = Conv2D(1024, kernel_size, dilation_rate=(6, 6), activation='relu', padding='same', name='fc6')(net['pool5'])
    # x = Dropout(0.5, name='drop6')(x)
    # FC7
    net['fc7'] = Conv2D(1024, (1, 1), activation='relu', padding='same', name='fc7')(net['fc6'])
    # x = Dropout(0.5, name='drop7')(x)
    # Block 6
    net['conv6_1'] = Conv2D(256, (1, 1), activation='relu', padding='same', name='conv6_1')(net['fc7'])
    net['conv6_2'] = Conv2D(512, kernel_size, strides=(2, 2), activation='relu', padding='same', name='conv6_2')(net['conv6_1'])
    # Block 7
    net['conv7_1'] = Conv2D(128, (1, 1), activation='relu', padding='same', name='conv7_1')(net['conv6_2'])
    net['conv7_2'] = ZeroPadding2D()(net['conv7_1'])
    net['conv7_2'] = Conv2D(256, kernel_size, strides=(2, 2), activation='relu', padding='valid', name='conv7_2')(net['conv7_2'])
    # Block 8
    net['conv8_1'] = Conv2D(128, (1, 1), activation='relu', padding='same', name='conv8_1')(net['conv7_2'])
    net['conv8_2'] = Conv2D(256, kernel_size, strides=(2, 2), activation='relu', padding='same', name='conv8_2')(net['conv8_1'])
    # Last Pool
    net['pool6'] = GlobalAveragePooling2D(name='pool6')(net['conv8_2'])
    print('base network built')

    # Prediction from conv4_3
    net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])
    num_priors = 3
    x = Conv2D(num_priors * 4, kernel_size, padding='same', name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_loc'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_loc_flat')
    net['conv4_3_norm_mbox_loc_flat'] = flatten(net['conv4_3_norm_mbox_loc'])
    name = 'conv4_3_norm_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes, kernel_size, padding='same', name=name)(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_conf'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_conf_flat')
    net['conv4_3_norm_mbox_conf_flat'] = flatten(net['conv4_3_norm_mbox_conf'])
    priorbox = PriorBox(img_size, 30.0, aspect_ratios=[2],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv4_3_norm_mbox_priorbox')
    net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])
    print('conv4_3_norm_mbox_priorbox built')

    # Prediction from fc7
    num_priors = 6
    net['fc7_mbox_loc'] = Conv2D(num_priors * 4, kernel_size, padding='same', name='fc7_mbox_loc')(net['fc7'])
    flatten = Flatten(name='fc7_mbox_loc_flat')
    net['fc7_mbox_loc_flat'] = flatten(net['fc7_mbox_loc'])
    name = 'fc7_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    net['fc7_mbox_conf'] = Conv2D(num_priors * num_classes, (3, 3), padding='same', name=name)(net['fc7'])
    flatten = Flatten(name='fc7_mbox_conf_flat')
    net['fc7_mbox_conf_flat'] = flatten(net['fc7_mbox_conf'])
    priorbox = PriorBox(img_size, 60.0, max_size=114.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='fc7_mbox_priorbox')
    net['fc7_mbox_priorbox'] = priorbox(net['fc7'])
    print('fc7_mbox_priorbox built')

    # Prediction from conv6_2
    num_priors = 6
    x = Conv2D(num_priors * 4, kernel_size, padding='same', name='conv6_2_mbox_loc')(net['conv6_2'])
    net['conv6_2_mbox_loc'] = x
    flatten = Flatten(name='conv6_2_mbox_loc_flat')
    net['conv6_2_mbox_loc_flat'] = flatten(net['conv6_2_mbox_loc'])
    name = 'conv6_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes, kernel_size, padding='same', name=name)(net['conv6_2'])
    net['conv6_2_mbox_conf'] = x
    flatten = Flatten(name='conv6_2_mbox_conf_flat')
    net['conv6_2_mbox_conf_flat'] = flatten(net['conv6_2_mbox_conf'])
    priorbox = PriorBox(img_size, 114.0, max_size=168.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv6_2_mbox_priorbox')
    net['conv6_2_mbox_priorbox'] = priorbox(net['conv6_2'])
    print('conv6_2_mbox_priorbox built')

    # Prediction from conv7_2
    num_priors = 6
    x = Conv2D(num_priors * 4, kernel_size, padding='same', name='conv7_2_mbox_loc')(net['conv7_2'])
    net['conv7_2_mbox_loc'] = x
    flatten = Flatten(name='conv7_2_mbox_loc_flat')
    net['conv7_2_mbox_loc_flat'] = flatten(net['conv7_2_mbox_loc'])
    name = 'conv7_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes, kernel_size, padding='same', name=name)(net['conv7_2'])
    net['conv7_2_mbox_conf'] = x
    flatten = Flatten(name='conv7_2_mbox_conf_flat')
    net['conv7_2_mbox_conf_flat'] = flatten(net['conv7_2_mbox_conf'])
    priorbox = PriorBox(img_size, 168.0, max_size=222.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv7_2_mbox_priorbox')
    net['conv7_2_mbox_priorbox'] = priorbox(net['conv7_2'])
    print('conv7_2_mbox_priorbox built')

    # Prediction from conv8_2
    num_priors = 6
    x = Conv2D(num_priors * 4, kernel_size, padding='same', name='conv8_2_mbox_loc')(net['conv8_2'])
    net['conv8_2_mbox_loc'] = x
    flatten = Flatten(name='conv8_2_mbox_loc_flat')
    net['conv8_2_mbox_loc_flat'] = flatten(net['conv8_2_mbox_loc'])
    name = 'conv8_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes, kernel_size, padding='same', name=name)(net['conv8_2'])
    net['conv8_2_mbox_conf'] = x
    flatten = Flatten(name='conv8_2_mbox_conf_flat')
    net['conv8_2_mbox_conf_flat'] = flatten(net['conv8_2_mbox_conf'])
    priorbox = PriorBox(img_size, 222.0, max_size=276.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv8_2_mbox_priorbox')
    net['conv8_2_mbox_priorbox'] = priorbox(net['conv8_2'])
    print('conv8_2_mbox_priorbox built')

    # Prediction from pool6
    num_priors = 6
    x = Dense(num_priors * 4, name='pool6_mbox_loc_flat')(net['pool6'])
    net['pool6_mbox_loc_flat'] = x
    name = 'pool6_mbox_conf_flat'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Dense(num_priors * num_classes, name=name)(net['pool6'])
    net['pool6_mbox_conf_flat'] = x
    priorbox = PriorBox(img_size, 276.0, max_size=330.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='pool6_mbox_priorbox')
    if K.image_dim_ordering() == 'tf':
        target_shape = (1, 1, 256)
    else:
        target_shape = (256, 1, 1)
    net['pool6_reshaped'] = Reshape(target_shape, name='pool6_reshaped')(net['pool6'])
    net['pool6_mbox_priorbox'] = priorbox(net['pool6_reshaped'])
    print('pool6_mbox_priorbox built')

    # Gather all predictions
    net['mbox_loc'] = concatenate([net['conv4_3_norm_mbox_loc_flat'],
                                   net['fc7_mbox_loc_flat'],
                                   net['conv6_2_mbox_loc_flat'],
                                   net['conv7_2_mbox_loc_flat'],
                                   net['conv8_2_mbox_loc_flat'],
                                   net['pool6_mbox_loc_flat']],
                                  axis=1, name='mbox_loc')
    net['mbox_conf'] = concatenate([net['conv4_3_norm_mbox_conf_flat'],
                                    net['fc7_mbox_conf_flat'],
                                    net['conv6_2_mbox_conf_flat'],
                                    net['conv7_2_mbox_conf_flat'],
                                    net['conv8_2_mbox_conf_flat'],
                                    net['pool6_mbox_conf_flat']],
                                   axis=1, name='mbox_conf')
    net['mbox_priorbox'] = concatenate([net['conv4_3_norm_mbox_priorbox'],
                                        net['fc7_mbox_priorbox'],
                                        net['conv6_2_mbox_priorbox'],
                                        net['conv7_2_mbox_priorbox'],
                                        net['conv8_2_mbox_priorbox'],
                                        net['pool6_mbox_priorbox']],
                                       axis=1, name='mbox_priorbox')
    print('gathering all prediction layers built')

    if hasattr(net['mbox_loc'], '_keras_shape'):
        # divide by 4 for [xmin, ymin, xmax, ymax]
        num_boxes = net['mbox_loc']._keras_shape[-1] // 4
    elif hasattr(net['mbox_loc'], 'int_shape'):
        num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4
    net['mbox_loc'] = Reshape((num_boxes, 4), name='mbox_loc_final')(net['mbox_loc'])
    net['mbox_conf'] = Reshape((num_boxes, num_classes), name='mbox_conf_logits')(net['mbox_conf'])
    net['mbox_conf'] = Activation('softmax', name='mbox_conf_final')(net['mbox_conf'])
    net['predictions'] = concatenate([net['mbox_loc'], net['mbox_conf'], net['mbox_priorbox']],
                                     axis=2, name='predictions')
    print('prediction layers built')
    model = Model(net['input'], net['predictions'])
    return model
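# A minimal usage sketch for the builder above; the `ssd300` module path is
# an assumption for illustration, adjust the import to wherever SSD300 lives.
from ssd300 import SSD300  # hypothetical module name

model = SSD300(input_shape=(300, 300, 3), num_classes=21)
model.summary()
# Per image, each output row stacks 4 loc values, num_classes softmax scores,
# and the prior-box fields concatenated on axis 2 above.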
class TBPP(nn.Module):

    def __init__(self, phase, size, base, extras, head, num_classes):
        super(TBPP, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        self.cfg = {
            'num_classes': 2,
            'lr_steps': (80000, 100000, 120000),
            'max_iter': 120000,
            'feature_maps': [64, 32, 16, 8, 4, 2, 1],
            'min_dim': 512,
            'steps': [8, 16, 32, 64, 128, 256, 512],
            'min_sizes': [20, 51, 133, 215, 296, 378, 460],
            'max_sizes': [51, 133, 215, 296, 378, 460, 542],
            'aspect_ratios': [[2, 3], [2, 3, 5], [2, 3, 5], [2, 3, 5],
                              [2, 3, 5], [2, 3], [2, 3]],  # TODO
            'variance': [0.1, 0.2],
            'clip': True,
            'name': 'MINE'
        }
        # calculate the size of prior boxes, i.e. default boxes
        self.priorbox = PriorBox(self.cfg)
        self.priors = Variable(self.priorbox.forward(), volatile=True)
        self.size = size
        # TBPP network
        self.vgg = nn.ModuleList(base)
        self.L2Norm = L2Norm(512, 20)
        self.extras = nn.ModuleList(extras)
        self.loc = nn.ModuleList(head[0])
        self.conf = nn.ModuleList(head[1])
        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)

    def forward(self, x):
        sources = list()
        loc = list()
        conf = list()
        # apply vgg up to conv4_3 relu
        for k in range(23):
            x = self.vgg[k](x)
        s = self.L2Norm(x)
        sources.append(s)
        # apply vgg up to fc7
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)
        # apply extra layers and cache source layer outputs
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)
        # apply multibox head to source layers
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        if self.phase == "test":
            output = self.detect(
                loc.view(loc.size(0), -1, 4),  # loc predictions
                self.softmax(conf.view(conf.size(0), -1, self.num_classes)),  # conf predictions
                self.priors.type(type(x.data)))  # prior boxes, i.e. default boxes
        else:
            output = (loc.view(loc.size(0), -1, 4),
                      conf.view(conf.size(0), -1, self.num_classes),
                      self.priors)
        return output

    def load_weights(self, base_file):
        other, ext = os.path.splitext(base_file)
        if ext in ('.pkl', '.pth'):
            print('Loading weights into state dict ...')
            self.load_state_dict(
                torch.load(base_file, map_location=lambda storage, loc: storage))
            print('Loaded!')
        else:
            print('Sorry, only .pth and .pkl files are supported.')
class SSD(nn.Module):
    """Single Shot Multibox Architecture.

    The network is composed of a base VGG network followed by the added
    multibox conv layers. Each multibox layer branches into
        1) conv2d for class conf scores
        2) conv2d for localization predictions
        3) associated priorbox layer to produce default bounding boxes
           specific to the layer's feature map size.
    See: https://arxiv.org/pdf/1512.02325.pdf for more details.

    Args:
        phase: (string) Can be "test" or "train"
        base: VGG16 layers for input, size of either 300 or 500
        extras: extra layers that feed to multibox loc and conf layers
        head: "multibox head" consists of loc and conf conv layers
    """

    def __init__(self, config, phase, base, extras, head, num_classes, top_k=200):
        super(SSD, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        # TODO: implement __call__ in PriorBox
        self.priorbox = PriorBox(config)
        self.priors = Variable(self.priorbox.forward(), volatile=True)
        # SSD network
        self.vgg = nn.ModuleList(base)
        # Layer learns to scale the l2 normalized features from conv4_3
        self.L2Norm = L2Norm(512, 20)
        self.extras = nn.ModuleList(extras)
        self.loc = nn.ModuleList(head[0])
        self.conf = nn.ModuleList(head[1])
        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, top_k, 0.01, 0.45,
                                 variance=config['variance'])

    def forward(self, x):
        """Applies network layers and ops on input image(s) x.

        Args:
            x: input image or batch of images. Shape: [batch, 3, 300, 300].

        Return:
            Depending on phase:
            test:
                Variable(tensor) of output class label predictions,
                confidence score, and corresponding location predictions for
                each object detected. Shape: [batch, topk, 7]
            train:
                list of concat outputs from:
                    1: confidence layers, Shape: [batch*num_priors, num_classes]
                    2: localization layers, Shape: [batch, num_priors*4]
                    3: priorbox layers, Shape: [2, num_priors*4]
        """
        sources = list()
        loc = list()
        conf = list()
        # apply vgg up to conv4_3 relu
        for k in range(23):
            x = self.vgg[k](x)
        s = self.L2Norm(x)
        sources.append(s)
        # apply vgg up to fc7
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)
        # apply extra layers and cache source layer outputs
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)
        # apply multibox head to source layers
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        if self.phase == "test":
            output = self.detect(
                loc.view(loc.size(0), -1, 4),  # loc preds
                self.softmax(conf.view(-1, self.num_classes)),  # conf preds
                self.priors.type(type(x.data))  # default boxes
            )
        else:
            output = (loc.view(loc.size(0), -1, 4),
                      conf.view(conf.size(0), -1, self.num_classes),
                      self.priors)
        return output

    def load_weights(self, base_file):
        other, ext = os.path.splitext(base_file)
        if ext in ('.pkl', '.pth'):
            print('Loading weights into state dict...')
            self.load_state_dict(
                torch.load(base_file, map_location=lambda storage, loc: storage))
            print('Finished!')
        else:
            print('Sorry, only .pth and .pkl files are supported.')
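# Self-contained illustration of the permute/flatten/concat reshaping used in
# SSD.forward above; the feature-map sizes and the anchor count of 4 are
# made-up numbers, not values from any config in this code.
import torch

num_anchors = 4
feature_maps = [torch.randn(2, num_anchors * 4, s, s) for s in (38, 19, 10)]
# per level: [B, A*4, H, W] -> [B, H, W, A*4] -> [B, H*W*A*4], then concat
loc = torch.cat(
    [fm.permute(0, 2, 3, 1).contiguous().view(fm.size(0), -1) for fm in feature_maps],
    dim=1)
loc = loc.view(loc.size(0), -1, 4)
print(loc.shape)  # torch.Size([2, 7620, 4]): 4 * (38*38 + 19*19 + 10*10) boxes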
def simple_SSD(input_shape, num_classes, min_size, num_priors,
               max_size, aspect_ratios, variances):
    input_tensor = Input(shape=input_shape)
    body = Convolution2D(16, 7, 7)(input_tensor)
    body = Activation('relu')(body)
    body = MaxPooling2D(2, 2, border_mode='valid')(body)
    body = Convolution2D(32, 5, 5)(body)
    body = Activation('relu')(body)
    branch_1 = MaxPooling2D(2, 2, border_mode='valid')(body)
    body = Convolution2D(64, 3, 3)(branch_1)
    body = Activation('relu')(body)
    branch_2 = MaxPooling2D(2, 2, border_mode='valid')(body)

    # first branch
    norm_1 = Normalize(20)(branch_1)
    localization_1 = Convolution2D(num_priors * 4, 3, 3, border_mode='same')(norm_1)
    localization_1 = Flatten()(localization_1)
    classification_1 = Convolution2D(num_priors * num_classes, 3, 3,
                                     border_mode='same')(norm_1)
    classification_1 = Flatten()(classification_1)
    prior_boxes_1 = PriorBox(input_shape[0:2], min_size, max_size,
                             aspect_ratios)(norm_1)

    # second branch
    norm_2 = Normalize(20)(branch_2)
    localization_2 = Convolution2D(num_priors * 4, 3, 3, border_mode='same')(norm_2)
    localization_2 = Flatten()(localization_2)
    classification_2 = Convolution2D(num_priors * num_classes, 3, 3,
                                     border_mode='same')(norm_2)
    classification_2 = Flatten()(classification_2)
    prior_boxes_2 = PriorBox(input_shape[0:2], min_size, max_size,
                             aspect_ratios)(norm_2)

    localization_head = merge([localization_1, localization_2],
                              mode='concat', concat_axis=1)
    classification_head = merge([classification_1, classification_2],
                                mode='concat', concat_axis=1)
    prior_boxes_head = merge([prior_boxes_1, prior_boxes_2],
                             mode='concat', concat_axis=1)

    if hasattr(localization_head, '_keras_shape'):
        num_boxes = localization_head._keras_shape[-1] // 4
    elif hasattr(localization_head, 'int_shape'):
        num_boxes = K.int_shape(localization_head)[-1] // 4
    localization_head = Reshape((num_boxes, 4))(localization_head)
    classification_head = Reshape((num_boxes, num_classes))(classification_head)
    classification_head = Activation('softmax')(classification_head)
    predictions = merge([localization_head, classification_head, prior_boxes_head],
                        mode='concat', concat_axis=2)
    model = Model(input_tensor, predictions)
    return model
class SSD300(nn.Module):

    def __init__(self, num_classes, phase, pretrain=False, finetune=None):
        super(SSD300, self).__init__()
        self.num_classes = num_classes
        self.phase = phase
        self.base_net = self._base_net()
        self.extra_net = self._extra_net()
        self.loc_pred, self.cls_pred = self._predict_net()
        self.L2Norm = L2Norm(512, 20)
        self.priorbox = PriorBox(v2)
        self.priors = Variable(self.priorbox.forward(), volatile=True)
        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)
        self._init_weight()
        if pretrain:
            self._load_weight()
        if finetune is not None:
            self._finetune(finetune)

    def _base_net(self):
        """Use vgg16 as basenet.
        Refer https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py.

        Returns:
            basenet: (ModuleList)
        """
        def make_layers(cfg, batch_norm=False):
            layers = []
            in_channels = 3
            for v in cfg:
                if v == 'M':
                    layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
                elif v == 'C':
                    layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
                else:
                    conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
                    if batch_norm:
                        layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
                    else:
                        layers += [conv2d, nn.ReLU(inplace=True)]
                    in_channels = v
            pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
            conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
            conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
            layers += [pool5, conv6, nn.ReLU(inplace=True),
                       conv7, nn.ReLU(inplace=True)]
            return layers

        cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C',
               512, 512, 512, 'M', 512, 512, 512]
        return nn.ModuleList(make_layers(cfg))

    def _extra_net(self):
        """Extra layers in SSD300, conv8,9,10,11.
        Refer https://arxiv.org/pdf/1512.02325.pdf

        Returns:
            extra_net: (ModuleList)
        """
        def make_layers(cfg, batch_norm=False):
            layers = []
            in_channels = 1024
            flag = False
            for i, v in enumerate(cfg):
                if in_channels == 'S':
                    in_channels = v
                    continue
                _kernel_size = (1, 3)[flag]
                if v == 'S':
                    conv = nn.Conv2d(in_channels, cfg[i + 1], _kernel_size,
                                     stride=2, padding=1)
                else:
                    conv = nn.Conv2d(in_channels, v, _kernel_size)
                layers += [conv]
                in_channels = v
                flag = not flag
            return layers

        cfg = [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256]
        return nn.ModuleList(make_layers(cfg))

    def _predict_net(self):
        """Predict layers, cls and loc.

        Returns:
            loc_layers: [list], len=6
            conf_layers: [list], len=6
        """
        loc_layers = []
        conf_layers = []
        in_channels = [512, 1024, 512, 256, 256, 256]
        mboxes = [4, 6, 6, 6, 4, 4]  # number of boxes per feature map location
        for (in_channel, mbox) in zip(in_channels, mboxes):
            loc_layers += [nn.Conv2d(in_channel, mbox * 4,
                                     kernel_size=3, padding=1)]
            conf_layers += [nn.Conv2d(in_channel, mbox * self.num_classes,
                                      kernel_size=3, padding=1)]
        return nn.ModuleList(loc_layers), nn.ModuleList(conf_layers)

    def forward(self, x):
        """Apply network layers and ops on input image(s) x.

        Args:
            x (tensor): input image or batch of images.
                Shape: [batch, 3, 300, 300]

        Returns:
            Depending on phase:
            train: list of concat outputs from:
                1: confidence layers, Shape: [batch*num_priors, num_classes]
                2: localization layers, Shape: [batch, num_priors*4]
                3: priorbox layers, Shape: [2, num_priors*4]
            test: Variable(tensor) of output class label predictions
        """
        sources = []  # feature maps where to make predictions
        conf = []
        loc = []
        # apply vgg, without BatchNorm
        pred_index = [22, ]  # conv4_3 relu
        for k, v in enumerate(self.base_net):
            x = v(x)
            if k in pred_index:
                sources.append(self.L2Norm(x))
        sources.append(x)
        # apply extra_net and cache source layer outputs
        pred_index = [1, 3, 5, 7]
        for k, v in enumerate(self.extra_net):
            x = v(x)
            if k in pred_index:
                sources.append(x)
        # apply predict_net to source layers
        for (x, l, c) in zip(sources, self.loc_pred, self.cls_pred):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())  # [B,C,H,W] -> [B,H,W,C]
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())
        # concat predictions from the different layers
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        if self.phase == 'train':
            output = (
                loc.view(loc.size(0), -1, 4),
                conf.view(loc.size(0), -1, self.num_classes),
                self.priors
            )
        else:
            output = self.detect(
                loc.view(loc.size(0), -1, 4),
                self.softmax(conf.view(-1, self.num_classes)),
                self.priors.type(type(x.data))
            )
        return output

    def _init_weight(self):
        def weight_init(m):
            if isinstance(m, nn.Conv2d):
                init.xavier_uniform(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                if m.bias is not None:
                    m.bias.data.zero_()
        self.apply(weight_init)

    def _fetch_weight(self):
        """Fetch pretrained model using torchvision.

        Returns:
            weight_file: (str) pretrained weight file path
        """
        print('Fetching pretrained model...')
        vgg16 = models.vgg16(pretrained=True)
        model_file = os.path.join(os.environ['HOME'], '.torch/models', 'vgg16-*.pth')
        return glob.glob(model_file)[0]

    def _load_weight(self, weight_file=None):
        """Load pretrained model.

        source: features.[0-28].[weight,bias], classifier.[0,3,6].[weight,bias]
        target: base_net.[0-28].[weight,bias], base_net.[31,33].[weight,bias]
                -> (load pretrained model)
                extra_net.[0-7].[xx], loc_pred.[0-5].[xx], cls_pred.[0-5].[xx]
                -> (init)

        Kwargs:
            weight_file (str): *.pth file path

        Returns: None
        """
        if weight_file is None:
            weight_file = self._fetch_weight()
        _, ext = os.path.splitext(weight_file)

        def downsample(fc, layer):
            """Downsample weight and bias in fc6, fc7 to conv6, conv7.

            w: [512,7,7,4096] -> [512,3,3,1024]      fc6
               [4096, 4096]   -> [1024, 1, 1, 1024]  fc7
            b: [4096] -> [1024]
            """
            fc = fc.view(4, 1024, -1)[0]  # [4096, 512*7*7] -> [4, 1024, -1][0]
            if fc.size(1) > 1:  # weight
                if layer == 'fc6':
                    fc = fc.view(1024, 512, 7, 7)[:, :, 0::3, 0::3]
                elif layer == 'fc7':
                    fc = fc.view(4, 1024, 1024, 1, 1)[0]
            else:
                fc = fc[:, 0]
            return fc

        if ext in ('.pkl', '.pth'):
            source_dict = torch.load(weight_file)
            # features -> base_net, remove
            target_dict = {}
            for key in source_dict.keys():
                if 'features' in key:  # conv1-5
                    target_dict['base_net' + key[8:]] = source_dict[key]
                elif 'classifier.0' in key:  # conv6
                    target_dict['base_net.31' + key[12:]] = downsample(source_dict[key], 'fc6')
                elif 'classifier.3' in key:  # conv7
                    target_dict['base_net.33' + key[12:]] = downsample(source_dict[key], 'fc7')
            source_dict = target_dict
            # add the remaining (freshly initialized) entries
            for (key, value) in self.state_dict().items():
                if key not in target_dict.keys():
                    target_dict[key] = value
            self.load_state_dict(target_dict)
            print('Loading imagenet weight successfully!')
        else:
            print('Sorry, only .pth and .pkl')

    def _finetune(self, weight_file):
        _, ext = os.path.splitext(weight_file)
        if ext in ('.pkl', '.pth'):
            source_dict = torch.load(weight_file)
            # remove num_classes-aware layers
            target_dict = {}
            for key in source_dict.keys():
                if 'cls_pred' not in key:
                    target_dict[key] = source_dict[key]
            # add the remaining entries from the current state dict
            for (key, value) in self.state_dict().items():
                if key not in target_dict.keys():
                    target_dict[key] = value
            self.load_state_dict(target_dict)
            print('Loading finetune weight successfully!')
        else:
            print('Sorry, only .pth and .pkl')
class DSOD_64_16(nn.Module):

    def __init__(self, num_classes):
        super(DSOD_64_16, self).__init__()
        self.num_classes = num_classes
        self.extractor = DenseNet_64_16()
        self.loc_layers = nn.ModuleList()
        self.cls_layers = nn.ModuleList()
        self.cfg = cfg_320_64_16
        self.priorbox = PriorBox(self.cfg)
        self.priors = self.priorbox.forward()
        in_channels = channel_dict['6416']
        num_anchors = (4, 6, 6, 6, 4, 4)
        for inC, num_anchor in zip(in_channels, num_anchors):
            # self.loc_layers += [nn.Conv2d(inC, num_anchor*4, kernel_size=3, padding=1)]
            # self.cls_layers += [nn.Conv2d(inC, num_anchor*num_classes, kernel_size=3, padding=1)]
            self.loc_layers += [
                nn.Sequential(
                    nn.Conv2d(inC, num_anchor * 4, kernel_size=3, padding=1, bias=False),
                    nn.BatchNorm2d(num_anchor * 4))
            ]
            self.cls_layers += [
                nn.Sequential(
                    nn.Conv2d(inC, num_anchor * num_classes, kernel_size=3, padding=1, bias=False),
                    nn.BatchNorm2d(num_anchor * num_classes))
            ]
        self.normalize = nn.ModuleList([L2Norm(chan, 20) for chan in in_channels])
        self.reset_parameters()

    def forward(self, x):
        loc_preds = []
        cls_preds = []
        xs = self.extractor(x)
        for i, x in enumerate(xs):
            x = self.normalize[i](x)
            loc_pred = self.loc_layers[i](x)
            loc_pred = loc_pred.permute(0, 2, 3, 1).contiguous()
            loc_preds.append(loc_pred.view(loc_pred.size(0), -1, 4))
            cls_pred = self.cls_layers[i](x)
            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous()
            cls_preds.append(cls_pred.view(cls_pred.size(0), -1, self.num_classes))
        # loc_preds = torch.cat(loc_preds, 1)
        # cls_preds = torch.cat(cls_preds, 1)
        loc = torch.cat([o.view(o.size(0), -1) for o in loc_preds], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in cls_preds], 1)
        output = (loc.view(loc.size(0), -1, 4),
                  conf.view(conf.size(0), -1, self.num_classes),
                  self.priors)
        return output

    def reset_parameters(self):
        # named_parameters() yields plain tensors, so select weight matrices
        # by name and rank rather than by attribute
        for name, param in self.extractor.named_parameters():
            if 'weight' in name and param.dim() > 1:
                nn.init.xavier_uniform(param.data,
                                       gain=nn.init.calculate_gain('relu'))
        for name, param in self.loc_layers.named_parameters():
            if 'weight' in name and param.dim() > 1:
                nn.init.normal(param.data, std=0.01)
        for name, param in self.cls_layers.named_parameters():
            if 'weight' in name and param.dim() > 1:
                nn.init.normal(param.data, std=0.01)
cv2.rectangle(img, (p1[0] - 2 // 2, p1[1] - 2 - baseline),
              (p1[0] + text_size[0], p1[1] + text_size[1]),
              [255, 0, 0], -1)
cv2.putText(img, conf, (p1[0], p1[1] + baseline),
            cv2.FONT_HERSHEY_SIMPLEX, 0.3, (255, 255, 255), 1, 8)
t2 = time.time()
print('detect:{} timer:{}'.format(img_path, t2 - t1))
cv2.imwrite(os.path.join(args.save_dir, os.path.basename(img_path)), img)


if __name__ == '__main__':
    # load PriorBox
    with torch.no_grad():
        priorbox = PriorBox(input_size=[640, 640], cfg=cfg)
        priors = priorbox.forward()
        priors = priors.cuda()

    net = build_net('test', cfg.NUM_CLASSES)
    net.load_state_dict(torch.load(args.model))
    net.eval()

    if use_cuda:
        net.cuda()
        cudnn.benchmark = True

    img_path = './img'
    img_list = [os.path.join(img_path, x) for x in os.listdir(img_path)
                if x.endswith('jpg')]
def main():
    global args
    global minmum_loss
    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
    args.total_batch_size = args.world_size * args.batch_size

    # build dsfd network
    print("Building net...")
    pyramidbox = build_net('train', cfg.NUM_CLASSES)
    model = pyramidbox

    if args.pretrained:
        vgg_weights = torch.load(args.save_folder + args.basenet)
        print('Load base network....')
        model.vgg.load_state_dict(vgg_weights)

    # for multi gpu
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model)
    model = model.cuda()

    # optimizer and loss function
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion1 = MultiBoxLoss(cfg, True)
    criterion2 = MultiBoxLoss(cfg, True, use_head_loss=True)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            minmum_loss = checkpoint['minmum_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        print('Initializing weights...')
        pyramidbox.extras.apply(pyramidbox.weights_init)
        pyramidbox.lfpn_topdown.apply(pyramidbox.weights_init)
        pyramidbox.lfpn_later.apply(pyramidbox.weights_init)
        pyramidbox.cpm.apply(pyramidbox.weights_init)
        pyramidbox.loc_layers.apply(pyramidbox.weights_init)
        pyramidbox.conf_layers.apply(pyramidbox.weights_init)

    print('Loading wider dataset...')
    train_dataset = WIDERDetection(cfg.FACE.TRAIN_FILE, mode='train')
    val_dataset = WIDERDetection(cfg.FACE.VAL_FILE, mode='val')
    train_loader = data.DataLoader(train_dataset, args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=detection_collate,
                                   pin_memory=True)
    val_batchsize = args.batch_size // 2
    val_loader = data.DataLoader(val_dataset, val_batchsize,
                                 num_workers=args.num_workers,
                                 shuffle=False,
                                 collate_fn=detection_collate,
                                 pin_memory=True)

    print('Using the specified args:')
    print(args)

    # load PriorBox
    with torch.no_grad():
        priorbox = PriorBox(input_size=[640, 640], cfg=cfg)
        priors = priorbox.forward()
        priors = priors.cuda()

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        end = time.time()
        train_loss = train(train_loader, model, priors, criterion1,
                           criterion2, optimizer, epoch)
        val_loss = val(val_loader, model, priors, criterion1, criterion2)
        if args.local_rank == 0:
            is_best = val_loss < minmum_loss
            minmum_loss = min(val_loss, minmum_loss)
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': minmum_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, epoch)
        epoch_time = time.time() - end
        print('Epoch %s time cost %f' % (epoch, epoch_time))
def main():
    global args
    global minmum_loss
    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
    args.total_batch_size = args.world_size * args.batch_size

    # build retinaface network
    print("Building net...")
    model = RetinaFace(cfg=cfg)
    print("Printing net...")

    # for multi gpu
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model)
    model = model.cuda()

    # optimizer and loss function
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.35, True, 0, True, 7, 0.35, False)

    # dataset
    print("loading dataset")
    train_dataset = WiderFaceDetection(args.training_dataset,
                                       preproc(cfg['image_size'], cfg['rgb_mean']))
    train_loader = data.DataLoader(train_dataset, args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=detection_collate,
                                   pin_memory=True)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            minmum_loss = checkpoint['minmum_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    print('Using the specified args:')
    print(args)

    # load PriorBox
    print("Load priorbox")
    with torch.no_grad():
        priorbox = PriorBox(cfg=cfg,
                            image_size=(cfg['image_size'], cfg['image_size']))
        priors = priorbox.forward()
        priors = priors.cuda()

    print("start training")
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss = train(train_loader, model, priors, criterion, optimizer, epoch)
        if args.local_rank == 0:
            is_best = train_loss < minmum_loss
            minmum_loss = min(train_loss, minmum_loss)
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': minmum_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, epoch)
def mini_SSD300(input_shape=(300, 300, 3), num_classes=21):
    """SSD300 architecture.

    # Arguments
        input_shape: Shape of the input image, expected to be either
            (300, 300, 3) or (3, 300, 300) (not tested).
        num_classes: Number of classes including background.

    # References
        https://arxiv.org/abs/1512.02325
    """
    net = {}
    # Block 1
    input_tensor = Input(shape=input_shape)
    img_size = (input_shape[1], input_shape[0])
    net['input'] = input_tensor
    net['conv1_1'] = Convolution2D(64, 3, 3, activation='relu', border_mode='same', name='conv1_1')(net['input'])
    net['conv1_2'] = Convolution2D(64, 3, 3, activation='relu', border_mode='same', name='conv1_2')(net['conv1_1'])
    net['pool1'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same', name='pool1')(net['conv1_2'])
    # Block 2
    net['conv2_1'] = Convolution2D(128, 3, 3, activation='relu', border_mode='same', name='conv2_1')(net['pool1'])
    net['conv2_2'] = Convolution2D(128, 3, 3, activation='relu', border_mode='same', name='conv2_2')(net['conv2_1'])
    net['pool2'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same', name='pool2')(net['conv2_2'])
    # Block 3
    net['conv3_1'] = Convolution2D(256, 3, 3, activation='relu', border_mode='same', name='conv3_1')(net['pool2'])
    net['conv3_2'] = Convolution2D(256, 3, 3, activation='relu', border_mode='same', name='conv3_2')(net['conv3_1'])
    net['conv3_3'] = Convolution2D(256, 3, 3, activation='relu', border_mode='same', name='conv3_3')(net['conv3_2'])
    net['pool3'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same', name='pool3')(net['conv3_3'])
    # Block 4
    net['conv4_1'] = Convolution2D(512, 3, 3, activation='relu', border_mode='same', name='conv4_1')(net['pool3'])
    net['conv4_2'] = Convolution2D(512, 3, 3, activation='relu', border_mode='same', name='conv4_2')(net['conv4_1'])
    net['conv4_3'] = Convolution2D(512, 3, 3, activation='relu', border_mode='same', name='conv4_3')(net['conv4_2'])
    net['pool4'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same', name='pool4')(net['conv4_3'])
    # Block 5
    net['conv5_1'] = Convolution2D(512, 3, 3, activation='relu', border_mode='same', name='conv5_1')(net['pool4'])
    net['conv5_2'] = Convolution2D(512, 3, 3, activation='relu', border_mode='same', name='conv5_2')(net['conv5_1'])
    net['conv5_3'] = Convolution2D(512, 3, 3, activation='relu', border_mode='same', name='conv5_3')(net['conv5_2'])
    net['pool5'] = MaxPooling2D((3, 3), strides=(1, 1), border_mode='same', name='pool5')(net['conv5_3'])
    # FC6
    net['fc6'] = AtrousConvolution2D(1024, 3, 3, atrous_rate=(6, 6), activation='relu', border_mode='same', name='fc6')(net['pool5'])
    # x = Dropout(0.5, name='drop6')(x)
    # FC7
    net['fc7'] = Convolution2D(1024, 1, 1, activation='relu', border_mode='same', name='fc7')(net['fc6'])
    # x = Dropout(0.5, name='drop7')(x)
    # Block 6
    # deleted

    # Prediction from conv4_3
    net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])
    num_priors = 3
    x = Convolution2D(num_priors * 4, 3, 3, border_mode='same', name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_loc'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_loc_flat')
    net['conv4_3_norm_mbox_loc_flat'] = flatten(net['conv4_3_norm_mbox_loc'])
    name = 'conv4_3_norm_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Convolution2D(num_priors * num_classes, 3, 3, border_mode='same', name=name)(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_conf'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_conf_flat')
    net['conv4_3_norm_mbox_conf_flat'] = flatten(net['conv4_3_norm_mbox_conf'])
    priorbox = PriorBox(img_size, 30.0,
                        aspect_ratios=[2],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv4_3_norm_mbox_priorbox')
    net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])

    # Prediction from fc7
    num_priors = 6
    net['fc7_mbox_loc'] = Convolution2D(num_priors * 4, 3, 3, border_mode='same', name='fc7_mbox_loc')(net['fc7'])
    flatten = Flatten(name='fc7_mbox_loc_flat')
    net['fc7_mbox_loc_flat'] = flatten(net['fc7_mbox_loc'])
    name = 'fc7_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    net['fc7_mbox_conf'] = Convolution2D(num_priors * num_classes, 3, 3, border_mode='same', name=name)(net['fc7'])
    flatten = Flatten(name='fc7_mbox_conf_flat')
    net['fc7_mbox_conf_flat'] = flatten(net['fc7_mbox_conf'])
    priorbox = PriorBox(img_size, 60.0, max_size=114.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='fc7_mbox_priorbox')
    net['fc7_mbox_priorbox'] = priorbox(net['fc7'])

    # Gather all predictions
    net['mbox_loc'] = merge([net['conv4_3_norm_mbox_loc_flat'],
                             net['fc7_mbox_loc_flat']],
                            mode='concat', concat_axis=1, name='mbox_loc')
    net['mbox_conf'] = merge([net['conv4_3_norm_mbox_conf_flat'],
                              net['fc7_mbox_conf_flat']],
                             mode='concat', concat_axis=1, name='mbox_conf')
    if hasattr(net['mbox_loc'], '_keras_shape'):
        num_boxes = net['mbox_loc']._keras_shape[-1] // 4
    elif hasattr(net['mbox_loc'], 'int_shape'):
        num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4
    net['mbox_loc'] = Reshape((num_boxes, 4), name='mbox_loc_final')(net['mbox_loc'])
    net['mbox_conf'] = Reshape((num_boxes, num_classes), name='mbox_conf_logits')(net['mbox_conf'])
    net['mbox_conf'] = Activation('softmax', name='mbox_conf_final')(net['mbox_conf'])
    net['mbox_priorbox'] = merge([net['conv4_3_norm_mbox_priorbox'],
                                  net['fc7_mbox_priorbox']],
                                 mode='concat', concat_axis=1, name='mbox_priorbox')
    net['predictions'] = merge([net['mbox_loc'], net['mbox_conf'], net['mbox_priorbox']],
                               mode='concat', concat_axis=2, name='predictions')
    model = Model(net['input'], net['predictions'])
    return model
def mini_SSD(num_classes=21):
    base_kernel_size = 4 + num_classes
    aspect_ratios = (1, 2, 1 / 2)
    num_aspect_ratios = len(aspect_ratios)
    base_model = VGG16(weights='imagenet')
    base_model.layers[0].name = 'input_1'
    input_tensor = base_model.input
    # input_tensor = base_model
    # input_tensor.name = 'image_array'
    for layer in base_model.layers:
        layer.trainable = False
    body = base_model.get_layer('block4_pool').output
    body = Convolution2D((base_kernel_size * num_aspect_ratios), 3, 3,
                         border_mode='same')(body)
    branch_1 = PriorBox(aspect_ratios)(body)
    body = Convolution2D(32, 3, 3, border_mode='same')(branch_1)
    body = Activation('relu')(body)
    body = MaxPooling2D((2, 2))(body)
    body = Dropout(.5)(body)
    body = Convolution2D((base_kernel_size * num_aspect_ratios), 3, 3,
                         border_mode='same')(body)
    branch_2 = PriorBox(aspect_ratios)(body)
    body = Convolution2D(64, 3, 3, border_mode='same')(branch_2)
    body = Activation('relu')(body)
    body = MaxPooling2D((3, 3))(body)
    body = Dropout(.5)(body)
    body = Convolution2D((base_kernel_size * num_aspect_ratios), 3, 3,
                         border_mode='same')(body)
    branch_3 = PriorBox(aspect_ratios)(body)

    branch_1 = Reshape((-1, 4 + num_classes))(branch_1)
    local_1 = Lambda(lambda x: x[:, :, :4])(branch_1)
    class_1 = Lambda(lambda x: K.softmax(x[:, :, 4:]))(branch_1)
    branch_2 = Reshape((-1, 4 + num_classes))(branch_2)
    local_2 = Lambda(lambda x: x[:, :, :4])(branch_2)
    class_2 = Lambda(lambda x: K.softmax(x[:, :, 4:]))(branch_2)
    branch_3 = Reshape((-1, 4 + num_classes))(branch_3)
    local_3 = Lambda(lambda x: x[:, :, :4])(branch_3)
    class_3 = Lambda(lambda x: K.softmax(x[:, :, 4:]))(branch_3)

    classification_tensor = merge([class_1, class_2, class_3],
                                  mode='concat', concat_axis=1, name='classes')
    localization_tensor = merge([local_1, local_2, local_3],
                                mode='concat', concat_axis=1, name='encoded_box')
    output_tensor = merge([localization_tensor, classification_tensor],
                          mode='concat', concat_axis=-1, name='predictions')
    model = Model(input_tensor, output_tensor)
    return model
def mini_SSD300(input_shape=(300, 300, 3), num_classes=21):
    net = {}
    # Block 1
    input_tensor = Input(shape=input_shape)
    img_size = (input_shape[1], input_shape[0])
    net['input'] = input_tensor
    net['conv1_1'] = Convolution2D(64, 3, 3, activation='relu', border_mode='same', name='conv1_1')(net['input'])
    net['conv1_2'] = Convolution2D(64, 3, 3, activation='relu', border_mode='same', name='conv1_2')(net['conv1_1'])
    net['pool1'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same', name='pool1')(net['conv1_2'])
    # Block 2
    net['conv2_1'] = Convolution2D(128, 3, 3, activation='relu', border_mode='same', name='conv2_1')(net['pool1'])
    net['conv2_2'] = Convolution2D(128, 3, 3, activation='relu', border_mode='same', name='conv2_2')(net['conv2_1'])
    net['pool2'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same', name='pool2')(net['conv2_2'])
    # Block 3
    net['conv3_1'] = Convolution2D(256, 3, 3, activation='relu', border_mode='same', name='conv3_1')(net['pool2'])
    net['conv3_2'] = Convolution2D(256, 3, 3, activation='relu', border_mode='same', name='conv3_2')(net['conv3_1'])
    net['conv3_3'] = Convolution2D(256, 3, 3, activation='relu', border_mode='same', name='conv3_3')(net['conv3_2'])
    net['pool3'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same', name='pool3')(net['conv3_3'])

    # Prediction from conv3_3 (normalized)
    net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv3_3'])
    num_priors = 6
    x = Convolution2D(num_priors * 4, 3, 3, border_mode='same', name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_loc'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_loc_flat')
    net['conv4_3_norm_mbox_loc_flat'] = flatten(net['conv4_3_norm_mbox_loc'])
    name = 'conv4_3_norm_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Convolution2D(num_priors * num_classes, 3, 3, border_mode='same', name=name)(net['conv4_3_norm'])
    net['conv4_3_norm_mbox_conf'] = x
    flatten = Flatten(name='conv4_3_norm_mbox_conf_flat')
    net['conv4_3_norm_mbox_conf_flat'] = flatten(net['conv4_3_norm_mbox_conf'])
    priorbox = PriorBox(img_size, 30.0, max_size=60, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2],
                        name='conv4_3_norm_mbox_priorbox')
    net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])

    # Gather predictions
    if hasattr(net['conv4_3_norm_mbox_loc_flat'], '_keras_shape'):
        num_boxes = net['conv4_3_norm_mbox_loc_flat']._keras_shape[-1] // 4
    elif hasattr(net['conv4_3_norm_mbox_loc_flat'], 'int_shape'):
        num_boxes = K.int_shape(net['conv4_3_norm_mbox_loc_flat'])[-1] // 4
    net['mbox_loc'] = Reshape((num_boxes, 4), name='mbox_loc_final')(net['conv4_3_norm_mbox_loc_flat'])
    net['mbox_conf'] = Reshape((num_boxes, num_classes), name='mbox_conf_logits')(net['conv4_3_norm_mbox_conf_flat'])
    net['mbox_conf'] = Activation('softmax', name='mbox_conf_final')(net['mbox_conf'])
    net['predictions'] = merge([net['mbox_loc'], net['mbox_conf'], net['conv4_3_norm_mbox_priorbox']],
                               mode='concat', concat_axis=2, name='predictions')
    model = Model(net['input'], net['predictions'])
    return model