def __init__(self, num_classes, **kwargs):
    """Build the MobileNetV2-backed SSD512 detector.

    Args:
        num_classes: number of foreground classes; one extra background
            ("dummy") class is added internally.
        **kwargs: accepted for interface compatibility; not used here.
    """
    super(MobilenetSSD512, self).__init__()

    # Reuse the MobileNetV2 feature extractor stage by stage.
    encoder = MobileNetV2()
    for stage in range(8):
        name = 'layer%d' % stage
        setattr(self, name, getattr(encoder, name))

    # Extra stride-2 convolutions producing progressively smaller feature maps.
    self.conv6 = nn.Conv2d(320, 64, kernel_size=3, stride=2, padding=1)
    self.conv7 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1)
    self.conv8 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1)
    self.conv9 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1)

    # Top-down entry point of the FPN-style decoder.
    self.toplayer = nn.Conv2d(320, 64, kernel_size=1, stride=1, padding=0)

    # Lateral 1x1 projections from backbone stages to the common 64 channels.
    self.latlayer1 = nn.Conv2d(96, 64, kernel_size=1, stride=1, padding=0)
    self.latlayer2 = nn.Conv2d(32, 64, kernel_size=1, stride=1, padding=0)

    # 3x3 smoothing convolutions applied after the top-down additions.
    self.smooth1 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
    self.smooth2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)

    self.num_classes = num_classes + 1  # +1 for the background ("dummy") class

    self.loc_head = self._make_head(self.num_anchors * 4)
    self.cls_head = self._make_head(self.num_anchors * self.num_classes)
    self.box_coder = FPNSSDBoxCoder()
def __init__(self, num_classes, pretrained=True, **kwargs):
    """Build the FPN50-backed SSD512 detector.

    Args:
        num_classes: number of foreground classes; one extra background
            ("dummy") class is added internally.
        pretrained: forwarded to ``resnet50`` to warm-start backbone weights.
        **kwargs: accepted for interface compatibility; not used here.
    """
    super(FPNSSD512, self).__init__()
    self.fpn = FPN50()
    self.box_coder = FPNSSDBoxCoder()
    self.num_classes = num_classes + 1  # +1 for the background ("dummy") class
    self.loc_head = self._make_head(self.num_anchors * 4)
    self.cls_head = self._make_head(self.num_anchors * self.num_classes)
    # Copy matching ResNet-50 weights into the FPN; strict=False skips the
    # FPN-only parameters that have no counterpart in the ResNet state dict.
    backbone_state = resnet50(pretrained=pretrained).state_dict()
    self.fpn.load_state_dict(backbone_state, strict=False)
class BBoxMeanAP:
    """Accumulates a per-image mAP-style score over decoded SSD predictions.

    Each ``update`` decodes predicted and ground-truth SSD targets back to
    boxes and appends one ``map_iou`` score per image; ``value`` averages them.
    """

    def __init__(self, threshold=0.5):
        # threshold is stored but not referenced by this class's visible code.
        self.threshold = threshold
        self.scores_per_image = []
        self.box_coder = FPNSSDBoxCoder()

    def reset(self):
        """Drop all accumulated per-image scores."""
        self.scores_per_image = []

    def update(self, y_pred: Tensor, y_true: Tensor):
        """Decode one batch of predictions/targets and score each image."""
        loc_true = y_true[SSD_BBOXES_KEY].detach().cpu()
        loc_pred = y_pred[SSD_BBOXES_KEY].detach().cpu()
        cls_pred = y_pred[SSD_LABELS_KEY].detach().cpu().softmax(dim=2)
        cls_true = one_hot(
            y_true[SSD_LABELS_KEY].detach().cpu(),
            num_classes=cls_pred.size(2))

        for p_loc, p_cls, t_loc, t_cls in zip(loc_pred, cls_pred,
                                              loc_true, cls_true):
            p_boxes, _, p_conf = self.box_coder.decode(p_loc, p_cls)
            t_boxes, _, _ = self.box_coder.decode(t_loc, t_cls)

            # map_iou works on xywh boxes as numpy arrays.
            t_boxes = to_numpy(change_box_order(t_boxes, 'xyxy2xywh'))
            p_boxes = to_numpy(change_box_order(p_boxes, 'xyxy2xywh'))
            p_conf = to_numpy(p_conf)

            # Images without ground-truth boxes are skipped entirely.
            if len(t_boxes) == 0:
                continue
            score = map_iou(t_boxes, p_boxes, p_conf) if len(p_boxes) else 0
            self.scores_per_image.append(score)

    def __str__(self):
        return '%.4f' % self.value()

    def value(self):
        """Mean of the per-image scores; 0 when nothing was accumulated."""
        return np.mean(self.scores_per_image) if self.scores_per_image else 0

    def log_to_tensorboard(self, saver: SummaryWriter, prefix, step):
        """Write the mean value and the score histogram under ``prefix``."""
        if not self.scores_per_image:
            return
        saver.add_scalar(prefix + '/value', self.value(), step)
        saver.add_histogram(prefix + '/histogram',
                            np.array(self.scores_per_image), step)
def test_sdd_box_coder():
    """Smoke-test the encode/decode round trip of FPNSSDBoxCoder."""
    box_coder = FPNSSDBoxCoder()
    boxes = torch.tensor(
        [
            [20, 40, 80, 100],
            [200, 4, 300, 200],
            [100, 100, 160, 200],
            [50, 90, 175, 300],
        ],
        dtype=torch.float32)
    labels = torch.tensor([0, 0, 0, 0], dtype=torch.float32)

    loc_targets, cls_targets = box_coder.encode(boxes, labels)
    dec_boxes, dec_labels, dec_scores = box_coder.decode(loc_targets,
                                                         cls_targets)
    print(dec_boxes, dec_labels, dec_scores)
def test_anchors_count():
    """Print anchor-box counts for the fixed SSD512 coder and two RSSD sizes."""
    print('Total number of anchor boxes SSD512',
          len(FPNSSDBoxCoder().anchor_boxes))
    for size, message in ((512, 'Total number of anchor boxes in RSSD512'),
                          (768, 'Total number of anchor boxes in RSSD768')):
        print(message, len(RSSDBoxCoder(size, size).anchor_boxes))
def test_draw_rsdd_bboxes():
    """Visualize each feature-map level's rotated anchor shapes centered on a
    blank image (one OpenCV window per level; press a key to advance).

    Bug fix: the original computed ``n = len(FPNSSDBoxCoder().anchor_boxes)``
    but then printed ``len(box_coder.anchor_boxes)`` (the RSSD768 count) under
    the "SSD" label, so the SSD count was computed and never shown. Each print
    now reports the count of the coder it names.
    """
    box_coder = RSSDBoxCoder(768, 768)
    anchors = box_coder._get_anchor_wht()

    print('Total number of anchor boxes SSD',
          len(FPNSSDBoxCoder().anchor_boxes))
    print('Total number of anchor boxes in RSSD',
          len(box_coder.anchor_boxes))

    for i, wht_fm in enumerate(anchors):
        image = np.zeros((box_coder.image_height, box_coder.image_width, 3),
                         dtype=np.uint8)
        for wht in wht_fm:
            # Rotated box placed at the image center: (cy, cx, w, h, theta)
            # layout assumed from visualize_rbbox usage — TODO confirm order.
            rbox = [
                box_coder.image_height // 2, box_coder.image_width // 2,
                wht[0], wht[1], wht[2]
            ]
            visualize_rbbox(image, rbox, (i * 28, 255, 0), thickness=1)
        cv2.imshow("Image" + str(i), image)
        cv2.waitKey(-1)
def test_ssd_synthetic():
    """Encode/decode round trip on a synthetic instance mask, with a visual
    overlay of decoded boxes (magenta), ground truth (green) and the matched
    anchors (white). Blocks on an OpenCV window."""
    label_image = np.zeros((512, 512), dtype=np.uint8)

    # One rotated rectangle painted as instance id 11. During development many
    # other variants — rotated boxes at 0/45/90/135/17/49/165 degrees and a few
    # axis-aligned / free-form quads — were drawn the same way with additional
    # cv2.fillConvexPoly(label_image, cv2.boxPoints(...)/np.array(...), id)
    # calls; re-add such calls to exercise more shapes.
    cv2.fillConvexPoly(
        label_image,
        np.expand_dims(cv2.boxPoints(((100, 200 + 200), (100, 20), 99)),
                       1).astype(int),
        (11, 11, 11))

    image = (label2rgb(label_image, bg_label=0) * 255).astype(np.uint8)
    # To test behavior under rotation, uncomment:
    # image = np.rot90(image).copy()
    # label_image = np.rot90(label_image).copy()

    bboxes = instance_mask_to_bboxes(label_image)
    print(bboxes)
    labels = np.zeros(len(bboxes), dtype=np.intp)

    box_coder = FPNSSDBoxCoder()
    loc_targets, cls_targets, anchors = box_coder.encode(
        torch.from_numpy(bboxes).float(),
        torch.from_numpy(labels),
        return_anchors=True)
    print(loc_targets.shape, cls_targets.shape)

    # decode expects per-class scores, so one-hot the integer targets
    # (2 = background + single foreground class).
    cls_targets_one_hot = np.eye(2)[cls_targets]
    print(cls_targets_one_hot.shape)
    dec_boxes, dec_labels, dec_scores = box_coder.decode(
        loc_targets, torch.from_numpy(cls_targets_one_hot))
    print(dec_boxes)

    for box in dec_boxes.numpy():
        visualize_bbox(image, box, (255, 0, 255), thickness=3)
    for box in bboxes:
        visualize_bbox(image, box, (0, 255, 0), thickness=1)
    for box in anchors.numpy():
        visualize_bbox(image, box, (255, 255, 255), thickness=1)

    cv2.imshow('overlays', image)
    cv2.waitKey(-1)
class FPNSSD512(nn.Module):
    """SSD512 detector head on top of a ResNet-50 FPN backbone.

    ``forward`` returns raw per-anchor location and class predictions;
    ``self.box_coder`` turns them into boxes (see ``predict``).
    """

    num_anchors = 9  # anchors per feature-map cell

    def __init__(self, num_classes, pretrained=True, **kwargs):
        """num_classes foreground classes; a background class is added."""
        super(FPNSSD512, self).__init__()
        self.fpn = FPN50()
        self.num_classes = num_classes + 1  # +1 background ("dummy") class
        self.loc_head = self._make_head(self.num_anchors * 4)
        self.cls_head = self._make_head(self.num_anchors * self.num_classes)
        self.box_coder = FPNSSDBoxCoder()
        # Warm-start shared backbone weights from torchvision's ResNet-50;
        # strict=False ignores the FPN-only parameters absent from that dict.
        resnet_state = resnet50(pretrained=pretrained).state_dict()
        self.fpn.load_state_dict(resnet_state, strict=False)

    def forward(self, image):
        """Return (loc, cls): [N, A_total, 4] and [N, A_total, num_classes]."""
        batch = image.size(0)
        loc_preds, cls_preds = [], []
        for fm in self.fpn(image):
            # [N, 9*4, H, W] -> [N, H, W, 9*4] -> [N, H*W*9, 4]
            loc = (self.loc_head(fm)
                   .permute(0, 2, 3, 1)
                   .contiguous()
                   .view(batch, -1, 4))
            # [N, 9*NC, H, W] -> [N, H, W, 9*NC] -> [N, H*W*9, NC]
            cls = (self.cls_head(fm)
                   .permute(0, 2, 3, 1)
                   .contiguous()
                   .view(batch, -1, self.num_classes))
            loc_preds.append(loc)
            cls_preds.append(cls)
        return torch.cat(loc_preds, 1), torch.cat(cls_preds, 1)

    def _make_head(self, out_planes):
        """Four 256-channel conv+ReLU blocks followed by a projection conv."""
        layers = []
        for _ in range(4):
            layers += [
                nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
                nn.ReLU(True),
            ]
        layers.append(
            nn.Conv2d(256, out_planes, kernel_size=3, stride=1, padding=1))
        return nn.Sequential(*layers)

    def predict(self, image):
        """Run tiled inference over a full image and decode merged detections.

        The image is normalized, split into overlapping 512x512 patches,
        predicted in batches, and decoded jointly with each patch's offset.
        Returns (boxes, labels, scores) as numpy arrays.
        """
        import albumentations as A
        self.eval()
        image = A.Normalize()(image=image)['image']

        slicer = ImageSlicer(image.shape, 512, 512 // 2)
        patches = [
            tensor_from_rgb_image(patch)
            for patch in slicer.split(image, borderType=cv2.BORDER_CONSTANT)
        ]
        # Each patch's (x, y, x, y) offset, to shift boxes back to image space.
        offsets = torch.tensor([[crop[0], crop[1], crop[0], crop[1]]
                                for crop in slicer.bbox_crops],
                               dtype=torch.float32)

        all_bboxes, all_labels = [], []
        device = self.fpn.conv1.weight.device
        with torch.set_grad_enabled(False):
            for patch, _ in DataLoader(list(zip(patches, offsets)),
                                       batch_size=8, pin_memory=True):
                bboxes, labels = self(patch.to(device))
                all_bboxes.extend(bboxes.cpu())
                all_labels.extend(labels.cpu())

        boxes, labels, scores = self.box_coder.decode_multi(
            all_bboxes, all_labels, offsets)
        return to_numpy(boxes), to_numpy(labels), to_numpy(scores)
class MobilenetSSD512(nn.Module):
    """SSD512 detector on a MobileNetV2 backbone with a lightweight FPN top.

    ``forward`` returns raw per-anchor location and class predictions over
    seven feature levels (p3..p9); ``self.box_coder`` decodes them (see
    ``predict``).
    """

    num_anchors = 9  # anchors per feature-map cell

    def __init__(self, num_classes, **kwargs):
        """num_classes foreground classes; a background class is added."""
        super(MobilenetSSD512, self).__init__()
        # Reuse the MobileNetV2 feature extractor stage by stage.
        encoder = MobileNetV2()
        self.layer0 = encoder.layer0
        self.layer1 = encoder.layer1
        self.layer2 = encoder.layer2
        self.layer3 = encoder.layer3
        self.layer4 = encoder.layer4
        self.layer5 = encoder.layer5
        self.layer6 = encoder.layer6
        self.layer7 = encoder.layer7

        # Extra stride-2 convolutions for coarser detection levels p6..p9.
        self.conv6 = nn.Conv2d(320, 64, kernel_size=3, stride=2, padding=1)
        self.conv7 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1)
        self.conv8 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1)
        self.conv9 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1)

        # Top-down layers
        self.toplayer = nn.Conv2d(320, 64, kernel_size=1, stride=1, padding=0)
        # Lateral layers
        self.latlayer1 = nn.Conv2d(96, 64, kernel_size=1, stride=1, padding=0)
        self.latlayer2 = nn.Conv2d(32, 64, kernel_size=1, stride=1, padding=0)
        # Smooth layers
        self.smooth1 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.smooth2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)

        self.num_classes = num_classes + 1  # +1 background ("dummy") class
        self.loc_head = self._make_head(self.num_anchors * 4)
        self.cls_head = self._make_head(self.num_anchors * self.num_classes)
        self.box_coder = FPNSSDBoxCoder()

    def forward(self, image):
        """Return (loc, cls): [N, A_total, 4] and [N, A_total, num_classes]."""
        # Extract backbone features.
        c0 = self.layer0(image)
        c1 = self.layer1(c0)
        c2 = self.layer2(c1)
        c3 = self.layer3(c2)
        c4 = self.layer4(c3)
        c5 = self.layer5(c4)
        c6 = self.layer6(c5)
        c7 = self.layer7(c6)

        # Bottom-up extra levels.
        p6 = self.conv6(c7)
        p7 = self.conv7(F.relu(p6))
        p8 = self.conv8(F.relu(p7))
        p9 = self.conv9(F.relu(p8))

        # Top-down pathway with lateral connections.
        p5 = self.toplayer(c7)
        p4 = self._upsample_add(p5, self.latlayer1(c5))
        p3 = self._upsample_add(p4, self.latlayer2(c3))
        p4 = self.smooth1(p4)
        p3 = self.smooth2(p3)

        features = [p3, p4, p5, p6, p7, p8, p9]
        loc_preds = []
        cls_preds = []
        for fm in features:
            loc_pred = self.loc_head(fm)
            cls_pred = self.cls_head(fm)
            # [N, 9*4, H, W] -> [N, H, W, 9*4] -> [N, H*W*9, 4]
            loc_pred = loc_pred.permute(0, 2, 3, 1).contiguous().view(
                image.size(0), -1, 4)
            # [N, 9*NC, H, W] -> [N, H, W, 9*NC] -> [N, H*W*9, NC]
            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(
                image.size(0), -1, self.num_classes)
            loc_preds.append(loc_pred)
            cls_preds.append(cls_pred)

        bboxes = torch.cat(loc_preds, 1)
        labels = torch.cat(cls_preds, 1)
        return bboxes, labels

    def _upsample_add(self, x, y, scale_factor=2):
        """Upsample ``x`` to ``y``'s spatial size and add them.

        Args:
            x: top feature map to be upsampled.
            y: lateral feature map providing the target size.
            scale_factor: kept for backward compatibility; the target size is
                taken from ``y`` directly, so this argument is unused.

        Bug fix: the original used ``F.interpolate(x, scale_factor=2)`` and
        ignored the already-extracted ``(H, W)`` of ``y``, which breaks when
        the lateral map has an odd size (e.g. 15 -> conv -> 8 -> x2 -> 16,
        which cannot be added to 15) — exactly the case this method's original
        docstring described. Resizing to ``y``'s exact size produces the same
        result as before whenever the sizes already match.
        """
        _, _, H, W = y.size()
        return F.interpolate(x, size=(H, W), mode='nearest') + y

    def _make_head(self, out_planes):
        """Four 64-channel conv+ReLU blocks followed by a projection conv."""
        layers = []
        for _ in range(4):
            layers.append(
                nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1))
            layers.append(nn.ReLU(True))
        layers.append(
            nn.Conv2d(64, out_planes, kernel_size=3, stride=1, padding=1))
        return nn.Sequential(*layers)

    def predict(self, image):
        """Run tiled inference over a full image and decode merged detections.

        The image is normalized, split into overlapping 512x512 patches,
        predicted in batches, and decoded jointly with each patch's offset.
        Returns (boxes, labels, scores) as numpy arrays.
        """
        import albumentations as A
        self.eval()
        normalize = A.Normalize()
        image = normalize(image=image)['image']

        slicer = ImageSlicer(image.shape, 512, 512 // 2)
        patches = [
            tensor_from_rgb_image(patch)
            for patch in slicer.split(image, borderType=cv2.BORDER_CONSTANT)
        ]
        # Each patch's (x, y, x, y) offset, to shift boxes back to image space.
        offsets = torch.tensor([[crop[0], crop[1], crop[0], crop[1]]
                                for crop in slicer.bbox_crops],
                               dtype=torch.float32)

        all_bboxes = []
        all_labels = []
        with torch.set_grad_enabled(False):
            for patch, patch_loc in DataLoader(list(zip(patches, offsets)),
                                               batch_size=8, pin_memory=True):
                patch = patch.to(self.conv6.weight.device)
                bboxes, labels = self(patch)
                all_bboxes.extend(bboxes.cpu())
                all_labels.extend(labels.cpu())

        boxes, labels, scores = self.box_coder.decode_multi(
            all_bboxes, all_labels, offsets)
        return to_numpy(boxes), to_numpy(labels), to_numpy(scores)
def __init__(self, threshold=0.5):
    """Create an empty score accumulator with a fresh box coder.

    Args:
        threshold: IoU threshold; stored for callers, not read in this method.
    """
    self.threshold = threshold
    self.scores_per_image = []
    self.box_coder = FPNSSDBoxCoder()