Example #1
def decode_results(predictions):
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    ploc, plabel = [val.float() for val in predictions]
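    # criteria is the NMS IoU threshold; max_output caps detections per image.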
    results = encoder.decode_batch(ploc, plabel, criteria=0.5, max_output=20)

    return [[pred.detach().cpu().numpy() for pred in detections]
            for detections in results]
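A minimal driving sketch for decode_results, assuming the SSD and ResNet classes used in the examples below; the random input and the 0.4 score cut-off are illustrative only, not from the original snippet:

import torch

# Assumed setup mirroring the test() examples: SSD with a ResNet backbone
# and a dummy 300x300 input batch.
model = SSD(backbone=ResNet())
model.eval()
images = torch.rand(1, 3, 300, 300)
with torch.no_grad():
    predictions = model(images)

# decode_results yields one (boxes, labels, scores) triple of numpy arrays
# per input image.
for boxes, labels, scores in decode_results(predictions):
    keep = scores > 0.4  # illustrative score cut-off
    print(boxes[keep], labels[keep], scores[keep])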
Example #2
def test(opt):
    model = SSD(backbone=ResNet())
    checkpoint = torch.load(opt.pretrained_model)
    model.load_state_dict(checkpoint["model_state_dict"])
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    dboxes = generate_dboxes()
    test_set = CocoDataset(opt.data_path, 2017, "val",
                           SSDTransformer(dboxes, (300, 300), val=True))
    encoder = Encoder(dboxes)

    if os.path.isdir(opt.output):
        shutil.rmtree(opt.output)
    os.makedirs(opt.output)

    for img, img_id, img_size, _, _ in test_set:
        if img is None:
            continue
        if torch.cuda.is_available():
            img = img.cuda()
        with torch.no_grad():
            ploc, plabel = model(img.unsqueeze(dim=0))
            result = encoder.decode_batch(ploc, plabel, opt.nms_threshold,
                                          20)[0]
            loc, label, prob = [r.cpu().numpy() for r in result]
            best = np.argwhere(prob > opt.cls_threshold).squeeze(axis=1)
            loc = loc[best]
            label = label[best]
            prob = prob[best]
            if len(loc) > 0:
                path = test_set.coco.loadImgs(img_id)[0]["file_name"]
                output_img = cv2.imread(
                    os.path.join(opt.data_path, "val2017", path))
                height, width, _ = output_img.shape
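                # Decoded boxes are normalized to [0, 1]; scale to pixels.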
                loc[:, 0::2] *= width
                loc[:, 1::2] *= height
                loc = loc.astype(np.int32)
                for box, lb, pr in zip(loc, label, prob):
                    category = test_set.label_info[lb]
                    color = colors[lb]
                    xmin, ymin, xmax, ymax = box
                    cv2.rectangle(output_img, (xmin, ymin), (xmax, ymax),
                                  color, 2)
                    text_size = cv2.getTextSize(category + " : %.2f" % pr,
                                                cv2.FONT_HERSHEY_PLAIN, 1,
                                                1)[0]
                    cv2.rectangle(
                        output_img, (xmin, ymin),
                        (xmin + text_size[0] + 3, ymin + text_size[1] + 4),
                        color, -1)
                    cv2.putText(output_img, category + " : %.2f" % pr,
                                (xmin, ymin + text_size[1] + 4),
                                cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
                # Write once per image, after all boxes are drawn.
                cv2.imwrite(
                    "{}/{}_prediction.jpg".format(opt.output, path[:-4]),
                    output_img)
Example #3
def test(opt):
    model = SSD(backbone=ResNet())
    checkpoint = torch.load(opt.pretrained_model)
    model.load_state_dict(checkpoint["model_state_dict"])
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    dboxes = generate_dboxes()
    transformer = SSDTransformer(dboxes, (300, 300), val=True)
    img = Image.open(opt.input).convert("RGB")
    img, _, _, _ = transformer(img, None, torch.zeros(1, 4), torch.zeros(1))
    encoder = Encoder(dboxes)

    if torch.cuda.is_available():
        img = img.cuda()
    with torch.no_grad():
        ploc, plabel = model(img.unsqueeze(dim=0))
        result = encoder.decode_batch(ploc, plabel, opt.nms_threshold, 20)[0]
        loc, label, prob = [r.cpu().numpy() for r in result]
        best = np.argwhere(prob > opt.cls_threshold).squeeze(axis=1)
        loc = loc[best]
        label = label[best]
        prob = prob[best]
        output_img = cv2.imread(opt.input)
        if len(loc) > 0:
            height, width, _ = output_img.shape
            loc[:, 0::2] *= width
            loc[:, 1::2] *= height
            loc = loc.astype(np.int32)
            for box, lb, pr in zip(loc, label, prob):
                category = coco_classes[lb]
                color = colors[lb]
                xmin, ymin, xmax, ymax = box
                cv2.rectangle(output_img, (xmin, ymin), (xmax, ymax), color, 2)
                text_size = cv2.getTextSize(category + " : %.2f" % pr,
                                            cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
                cv2.rectangle(
                    output_img, (xmin, ymin),
                    (xmin + text_size[0] + 3, ymin + text_size[1] + 4),
                    color, -1)
                cv2.putText(
                    output_img, category + " : %.2f" % pr,
                    (xmin, ymin + text_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1,
                    (255, 255, 255), 1)
        if opt.output is None:
            output = "{}_prediction.jpg".format(opt.input[:-4])
        else:
            output = opt.output
        cv2.imwrite(output, output_img)
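None of these snippets defines the opt namespace that test() consumes; below is a minimal argparse sketch covering only the attributes the functions actually reference (the default values are illustrative assumptions):

import argparse

def get_args():
    parser = argparse.ArgumentParser("SSD inference (sketch)")
    parser.add_argument("--input", type=str, help="input image or video")
    parser.add_argument("--output", type=str, default=None)
    parser.add_argument("--data_path", type=str, default="coco",
                        help="root folder of the COCO dataset")
    parser.add_argument("--pretrained_model", type=str, required=True)
    parser.add_argument("--nms_threshold", type=float, default=0.5)
    parser.add_argument("--cls_threshold", type=float, default=0.3)
    return parser.parse_args()

if __name__ == "__main__":
    test(get_args())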
Example #4
class SSD300(nn.Module):
    def __init__(self, backbone=None, num_classes=21):
        super(SSD300, self).__init__()
        if backbone is None:
            raise ValueError("backbone is None")
        if not hasattr(backbone, "out_channels"):
            raise ValueError("backbone has no attribute: out_channels")
        self.feature_extractor = backbone

        self.num_classes = num_classes
        # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
        self._build_additional_features(self.feature_extractor.out_channels)
        self.num_defaults = [4, 6, 6, 6, 4, 4]
        location_extractors = []
        confidence_extractors = []

        # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
        for nd, oc in zip(self.num_defaults,
                          self.feature_extractor.out_channels):
            # nd is number_default_boxes, oc is output_channel
            location_extractors.append(
                nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
            confidence_extractors.append(
                nn.Conv2d(oc, nd * self.num_classes, kernel_size=3, padding=1))

        self.loc = nn.ModuleList(location_extractors)
        self.conf = nn.ModuleList(confidence_extractors)
        self._init_weights()

        default_box = dboxes300_coco()
        self.compute_loss = Loss(default_box)
        self.encoder = Encoder(default_box)

    def _build_additional_features(self, input_size):
        """
        为backbone(resnet50)添加额外的一系列卷积层,得到相应的一系列特征提取器
        :param input_size:
        :return:
        """
        additional_blocks = []
        # input_size = [1024, 512, 512, 256, 256, 256] for resnet50
        middle_channels = [256, 256, 128, 128, 128]
        for i, (input_ch, output_ch, middle_ch) in enumerate(
                zip(input_size[:-1], input_size[1:], middle_channels)):
            padding, stride = (1, 2) if i < 3 else (0, 1)
            layer = nn.Sequential(
                nn.Conv2d(input_ch, middle_ch, kernel_size=1, bias=False),
                nn.BatchNorm2d(middle_ch),
                nn.ReLU(inplace=True),
                nn.Conv2d(middle_ch,
                          output_ch,
                          kernel_size=3,
                          padding=padding,
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(output_ch),
                nn.ReLU(inplace=True),
            )
            additional_blocks.append(layer)
        self.additional_blocks = nn.ModuleList(additional_blocks)

    def _init_weights(self):
        layers = [*self.additional_blocks, *self.loc, *self.conf]
        for layer in layers:
            for param in layer.parameters():
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)

    # Reshape the regressor/classifier outputs into per-box views
    def bbox_view(self, features, loc_extractor, conf_extractor):
        locs = []
        confs = []
        for f, l, c in zip(features, loc_extractor, conf_extractor):
            # [batch, n*4, feat_size, feat_size] -> [batch, 4, -1]
            locs.append(l(f).view(f.size(0), 4, -1))
            confs.append(c(f).view(f.size(0), self.num_classes, -1))

        locs = torch.cat(locs, 2).contiguous()
        confs = torch.cat(confs, 2).contiguous()
        return locs, confs

    def forward(self, image, targets):
        x = self.feature_extractor(image)

        # Feature Map 38x38x1024, 19x19x512, 10x10x512, 5x5x256, 3x3x256, 1x1x256
        detection_features = [x]
        for layer in self.additional_blocks:
            x = layer(x)
            detection_features.append(x)

        # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
        locs, confs = self.bbox_view(detection_features, self.loc, self.conf)

        # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results
        # 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732

        if self.training:
            # bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
            bboxes_out = targets['boxes']
            bboxes_out = bboxes_out.transpose(1, 2).contiguous()
            # print(bboxes_out.is_contiguous())
            labels_out = targets['labels']
            # print(labels_out.is_contiguous())

            # ploc, plabel, gloc, glabel
            loss = self.compute_loss(locs, confs, bboxes_out, labels_out)
            return {"total_losses": loss}

        # Apply the predicted regression offsets to the default boxes to get
        # the final predicted boxes, then run non-maximum suppression to
        # filter out overlapping boxes.
        results = self.encoder.decode_batch(locs, confs)
        return results
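A hedged usage sketch for SSD300: ResNet() mirrors the backbone used in the test() examples (any module exposing out_channels works), the dummy tensors and batch size are placeholders, and the target layout follows forward() above:

import torch

model = SSD300(backbone=ResNet(), num_classes=21)

# Training: targets carry ground truth already encoded against the 8732
# default boxes of SSD300.
model.train()
targets = {"boxes": torch.rand(2, 8732, 4),
           "labels": torch.zeros(2, 8732, dtype=torch.long)}
loss_dict = model(torch.rand(2, 3, 300, 300), targets)

# Inference: targets are ignored and decoded detections are returned.
model.eval()
with torch.no_grad():
    detections = model(torch.rand(1, 3, 300, 300), None)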
Example #5
def test(opt):
    model = SSD(backbone=ResNet())
    checkpoint = torch.load(opt.pretrained_model)
    model.load_state_dict(checkpoint["model_state_dict"])
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    dboxes = generate_dboxes()
    transformer = SSDTransformer(dboxes, (300, 300), val=True)
    cap = cv2.VideoCapture(opt.input)
    if opt.output is None:
        output = "{}_prediction.mp4".format(opt.input[:-4])
    else:
        output = opt.output
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    # "mp4v" matches the default .mp4 output; "MJPG" expects an AVI container.
    out = cv2.VideoWriter(output, cv2.VideoWriter_fourcc(*"mp4v"),
                          int(cap.get(cv2.CAP_PROP_FPS)), (width, height))
    encoder = Encoder(dboxes)
    while cap.isOpened():
        flag, frame = cap.read()
        if not flag:
            break
        output_frame = np.copy(frame)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(frame)
        frame, _, _, _ = transformer(frame, None, torch.zeros(1, 4),
                                     torch.zeros(1))

        if torch.cuda.is_available():
            frame = frame.cuda()
        with torch.no_grad():
            ploc, plabel = model(frame.unsqueeze(dim=0))
            result = encoder.decode_batch(ploc, plabel, opt.nms_threshold,
                                          20)[0]
            loc, label, prob = [r.cpu().numpy() for r in result]
            best = np.argwhere(prob > opt.cls_threshold).squeeze(axis=1)
            loc = loc[best]
            label = label[best]
            prob = prob[best]
            if len(loc) > 0:
                loc[:, 0::2] *= width
                loc[:, 1::2] *= height
                loc = loc.astype(np.int32)
                for box, lb, pr in zip(loc, label, prob):
                    category = coco_classes[lb]
                    color = colors[lb]
                    xmin, ymin, xmax, ymax = box
                    cv2.rectangle(output_frame, (xmin, ymin), (xmax, ymax),
                                  color, 2)
                    text_size = cv2.getTextSize(category + " : %.2f" % pr,
                                                cv2.FONT_HERSHEY_PLAIN, 1,
                                                1)[0]
                    cv2.rectangle(
                        output_frame, (xmin, ymin),
                        (xmin + text_size[0] + 3, ymin + text_size[1] + 4),
                        color, -1)
                    cv2.putText(output_frame, category + " : %.2f" % pr,
                                (xmin, ymin + text_size[1] + 4),
                                cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)

        out.write(output_frame)
    cap.release()
    out.release()
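The same loop can be pointed at a live camera instead of a file; a sketch of the capture setup (live devices often report an FPS of 0, so a fallback rate is assumed):

import cv2

cap = cv2.VideoCapture(0)                       # default webcam
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0         # fall back when 0 is reported
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter("webcam_prediction.mp4",
                      cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))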