Example #1
    def forward(self, img_size, x, rois, roi_indices):
        """Forward the chain.

        We assume that there are :math:`N` batches.

        Args:
            img_size (tuple of ints): A tuple of the input image height and width.
            x (Variable): 4D image variable.
            rois (Tensor): A bounding box array containing coordinates of
                proposal boxes.  This is a concatenation of bounding box
                arrays from multiple images in the batch.
                Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
                RoIs from the :math:`i` th image,
                :math:`R' = \\sum _{i=1} ^ N R_i`.
            roi_indices (Tensor): An array containing indices of the images
                to which the bounding boxes correspond. Its shape is :math:`(R',)`.

        """
        img_h, img_w = img_size
        # size of each RoI in the input image, as (h, w)
        # (computed here for reference; not used further below)
        roi_size = np.concatenate(
            (np.expand_dims(at.tonumpy(rois[:, 2] - rois[:, 0]), axis=1),
             np.expand_dims(at.tonumpy(rois[:, 3] - rois[:, 1]), axis=1)),
            axis=1)
        feature_h, feature_w = x.shape[2], x.shape[3]
        # in case roi_indices arrives as an ndarray, convert it to a tensor
        roi_indices = at.totensor(roi_indices).int()
        rois = at.totensor(rois).float()
        rois[:, 0] = rois[:, 0] / img_h * feature_h
        rois[:, 2] = rois[:, 2] / img_h * feature_h
        rois[:, 1] = rois[:, 1] / img_w * feature_w
        rois[:, 3] = rois[:, 3] / img_w * feature_w
        # pool = self.roi(x, indices_and_rois)
        rois = at.tovariable(rois)
        roi_indices = at.tovariable(roi_indices)
        pool = self.roi(x, rois, roi_indices)  # (128, 512, 7, 7)
        pool = pool.view(pool.size(0), -1)
        fc7 = self.classifier(pool)
        roi_cls_locs = self.cls_loc(fc7)
        roi_scores = self.score(fc7)
        return roi_cls_locs, roi_scores
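
The tail of this forward rescales RoIs from input-image coordinates to feature-map coordinates before pooling. A quick numeric check of that mapping, assuming a stride-16 extractor so that feature_h = img_h / 16 (all numbers below are made up):

import numpy as np

img_h, img_w = 600, 800
feature_h, feature_w = img_h // 16, img_w // 16   # 37, 50 for a stride-16 extractor
roi = np.array([160., 320., 480., 640.])          # (ymin, xmin, ymax, xmax) in image space
scale = np.array([feature_h / img_h, feature_w / img_w,
                  feature_h / img_h, feature_w / img_w])
print(roi * scale)  # ~[9.87, 20.0, 29.6, 40.0], the window pooled by self.roi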
Example #2
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        features = self.faster_rcnn.extractor(imgs)

        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois, 
        # consider them as constant input
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            at.tonumpy(bbox),
            at.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE: it's all zeros because only batch size 1 is supported for now
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            features,
            sample_roi,
            sample_roi_index)

        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox),
            anchor,
            img_size)
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,
            gt_rpn_loc,
            gt_rpn_label.data,
            self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
                              at.totensor(gt_roi_label).long()]
        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
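
`_fast_rcnn_loc_loss` is called but not defined in any of these examples. A minimal sketch of the usual smooth-L1 formulation behind helpers of this shape, where only positive samples (label > 0) contribute and the sum is normalized by the count of non-ignored labels; the exact implementation may differ:

import torch as t

def _smooth_l1_loss(x, gt, in_weight, sigma):
    # elementwise smooth L1: quadratic inside |diff| < 1 / sigma**2, linear outside
    sigma2 = sigma ** 2
    diff = in_weight * (x - gt)
    abs_diff = diff.abs()
    flag = (abs_diff < (1. / sigma2)).float()
    y = flag * (sigma2 / 2.) * (diff ** 2) + (1 - flag) * (abs_diff - 0.5 / sigma2)
    return y.sum()

def fast_rcnn_loc_loss_sketch(pred_loc, gt_loc, gt_label, sigma):
    # only positive (label > 0) samples contribute to the localization loss
    in_weight = t.zeros(gt_loc.shape)
    in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight)] = 1
    loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight, sigma)
    # normalize by the number of non-ignored (label >= 0) samples
    return loss / (gt_label >= 0).sum().float()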
Example #3
    def predict(self, imgs, sizes=None, visualize=False):
        """Detect objects from images.
        This method predicts objects for each image.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in an image. \
               Each bounding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        self.eval()
        if visualize:
            self.use_preset('visualize')  # originally 'visualize'; the alternative is 'evaluate'
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()
        for img, size in zip(prepared_imgs, sizes):
            img = t.autograd.Variable(at.totensor(img).float()[None],
                                      volatile=True)
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(
                img, scale=scale)  # this calls the forward method

            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(
                at.tonumpy(roi).reshape((-1, 4)),
                at.tonumpy(roi_cls_loc).reshape((-1, 4)))

            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score),
                                        dim=1))  # the raw scores become probabilities after softmax
            # one could inspect the maximum predicted probability here and
            # return early if it is too small, skipping the work below
            # prob has shape 300 x 21, and np.sum(prob) = 300

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores
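
`loc2bbox` is imported from the utility module. A sketch of the standard Faster R-CNN decoding it is expected to perform, turning (dy, dx, dh, dw) offsets and source boxes into (ymin, xmin, ymax, xmax) boxes; the real signature may differ:

import numpy as np

def loc2bbox_sketch(src_bbox, loc):
    # recover box centers and sizes from the source boxes
    src_h = src_bbox[:, 2] - src_bbox[:, 0]
    src_w = src_bbox[:, 3] - src_bbox[:, 1]
    src_cy = src_bbox[:, 0] + 0.5 * src_h
    src_cx = src_bbox[:, 1] + 0.5 * src_w

    # apply the (dy, dx, dh, dw) corrections
    cy = loc[:, 0] * src_h + src_cy
    cx = loc[:, 1] * src_w + src_cx
    h = np.exp(loc[:, 2]) * src_h
    w = np.exp(loc[:, 3]) * src_w

    # back to corner coordinates
    dst = np.zeros_like(loc)
    dst[:, 0] = cy - 0.5 * h
    dst[:, 1] = cx - 0.5 * w
    dst[:, 2] = cy + 0.5 * h
    dst[:, 3] = cx + 0.5 * w
    return dst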
Example #4
    def forward(self, imgs, bboxes, labels, scale):
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        # feature extraction
        features = self.faster_rcnn.extractor(imgs)

        # RPN network
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and forward
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi, at.tonumpy(bbox), at.tonumpy(label), self.loc_normalize_mean,
            self.loc_normalize_std)

        # NOTE: it's all zeros because only batch size 1 is supported for now
        sample_roi_index = t.zeros(len(sample_roi))

        # Faster R-CNN head
        roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi,
                                                       sample_roi_index)

        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                              at.totensor(gt_roi_label).long()]
        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(),
                                           gt_roi_loc.float(),
                                           gt_roi_label.data, self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
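
`LossTuple` is defined outside these snippets; judging from `losses + [sum(losses)]` and the "namedtuple of 5 losses" docstrings, it is a five-field namedtuple along these lines (the field names are assumptions):

from collections import namedtuple

# four component losses plus their sum, matching `losses + [sum(losses)]`
LossTuple = namedtuple('LossTuple',
                       ['rpn_loc_loss',
                        'rpn_cls_loss',
                        'roi_loc_loss',
                        'roi_cls_loss',
                        'total_loss'])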
Example #5
    def predict(self, imgs, sizes=None, visualize=False):
        """Detect objects from images.
        从图像中检测物体

        This method predicts objects for each image.
          此方法预测每个图像的对象。
        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in an image. \
               Each bounding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        # set the module to evaluation mode; this only affects modules such
        # as Dropout or BatchNorm (a Module method)
        self.eval()
        # visualization branch
        if visualize:
            # the 'visualize' preset sets self.nms_thresh = 0.3 and
            # self.score_thresh = 0.7; the evaluation and visualization
            # modes use different NMS suppression and score thresholds
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                # print('inner img shape is ', img.shape)
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()
        # sizes is e.g. [(600, 800)]
        # print('sizes is ', sizes)
        for img, size in zip(prepared_imgs, sizes):
            # img goes from [3,600,800] to [1,3,600,800]: convert to a
            # Variable, add a batch dimension, and mark it for inference
            img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
            # scale is 1 here
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores
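
`_suppress` does the per-class score thresholding and NMS that the preset comments above refer to. A sketch of such a routine, assuming a greedy `nms(boxes, thresh)` helper that returns the indices to keep; class 0 is treated as background and skipped:

import numpy as np

def _suppress_sketch(raw_cls_bbox, raw_prob, n_class, score_thresh, nms_thresh, nms):
    # seed with empty arrays so the concatenations below always work
    bbox = [np.zeros((0, 4), dtype=np.float32)]
    label = [np.zeros((0,), dtype=np.int32)]
    score = [np.zeros((0,), dtype=np.float32)]
    # skip class 0 because it is the background class
    for l in range(1, n_class):
        cls_bbox_l = raw_cls_bbox.reshape((-1, n_class, 4))[:, l, :]
        prob_l = raw_prob[:, l]
        mask = prob_l > score_thresh
        cls_bbox_l, prob_l = cls_bbox_l[mask], prob_l[mask]
        keep = nms(cls_bbox_l, nms_thresh)  # assumed helper: greedy NMS
        bbox.append(cls_bbox_l[keep])
        # reported labels are in [0, L - 1], hence l - 1
        label.append((l - 1) * np.ones((len(keep),), dtype=np.int32))
        score.append(prob_l[keep])
    return (np.concatenate(bbox, axis=0).astype(np.float32),
            np.concatenate(label, axis=0).astype(np.int32),
            np.concatenate(score, axis=0).astype(np.float32))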
Example #6
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.
        Faster网络的前向传播、计算losses*************************
        Here are notations used.

        * :math:`N` is the batch size. `N`是批量大小
        * :math:`R` is the number of bounding boxes per image. `R`是每个图像的边界框的数量

        Currently, only :math:`N=1` is supported.
        当前模型,只有N=1可用

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
                                            batch=1的图片变量
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
                                            真实人工标注的bboxes变量
            labels (~torch.autograd..Variable): A batch of labels.
                Its shape is :math:`(N, R)`.
                 The background is excluded from the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground classes.
                 背景被排除在定义之外,这意味着值的范围。`L`是前景类的数量
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.
                预处理期间应用于原始图像的缩放量

        Returns:
            namedtuple of 5 losses
            五个损失
        """

        n = bboxes.shape[0]
        # only batch size 1 is supported
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')
        # img_size is the (height, width) of the original image
        _, _, H, W = imgs.shape
        img_size = (H, W)
        # extract features with the (pretrained) VGG16 extractor network
        features = self.faster_rcnn.extractor(imgs)
        # the RPN (region proposal network) takes the image features and
        # predicts the output RoIs
        # rpn_locs [1,17316,4]  rpn_scores [1,17316,2]  rois [2000,4]
        # roi_indices [2000,] (all zeros)  anchor [17316,4]
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        # (i.e. squeeze out the first dimension)
        # bbox becomes [R, 4]
        bbox = bboxes[0]
        label = labels[0]
        # rpn_score becomes [17316, 2] and rpn_loc becomes [17316, 4]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        # roughly 2000 RoIs
        roi = rois

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois;
        # consider them as constant input
        # proposal_target_creator takes the ~2000 candidate RoIs plus the
        # hand-annotated bboxes and generates training targets; it is only
        # used during training and picks 128 of the 2000 RoIs
        # sample_roi [128,4]  gt_roi_loc [128,4]  gt_roi_label [128,]
        # (0 for background, otherwise the foreground class index)
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi, at.tonumpy(bbox), at.tonumpy(label), self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE: it's all zeros because only batch size 1 is supported for now
        sample_roi_index = t.zeros(len(sample_roi))
        # the RoI head classifies and refines the RoIs found by the RPN:
        # it decides whether each RoI contains an object and corrects the
        # box position, pooling features from the proposed regions
        # inputs: the extracted features and the 128 sampled RoIs
        # outputs: roi_cls_loc [128,84] (box regression) and
        #          roi_score [128,21] (20 classes plus background)
        roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi,
                                                       sample_roi_index)

        # ------------------ RPN losses -------------------#
        # assign the ground-truth bbox to the predicted anchors and return
        # the matching localization targets and labels for the RPN
        # gt_rpn_loc [17316,4]  gt_rpn_label [17316,]
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        # convert to Variables; labels become long
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        # RPN localization (regression) loss, a scalar
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        # classification uses cross entropy (F is torch.nn.functional)
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)

        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        # update the RPN confusion matrix
        self.rpn_cm.add(at.totensor(_rpn_score, False),
                        _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        # n_sample is 128
        n_sample = roi_cls_loc.shape[0]
        # reshape to [n_sample, n_class, 4]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        # take the regression for the ground-truth class of each RoI
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                              at.totensor(gt_roi_label).long()]
        # gt_roi_label: ground-truth RoI labels
        # gt_roi_loc: ground-truth RoI regression targets
        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)
        # RoI localization (regression) loss
        roi_loc_loss = _fast_rcnn_loc_loss(
            # contiguous() makes the sliced tensor contiguous in memory
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)
        # RoI classification loss (cross entropy)
        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
        # update the RoI confusion matrix
        self.roi_cm.add(at.totensor(roi_score, False),
                        gt_roi_label.data.long())
        # total loss: the four losses plus their sum
        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]
        return LossTuple(*losses)
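
Both AnchorTargetCreator and ProposalTargetCreator above assign labels based on IoU overlaps with the ground-truth boxes. A minimal pairwise-IoU helper in the (ymin, xmin, ymax, xmax) convention these examples use (name and layout are assumptions):

import numpy as np

def bbox_iou_sketch(a, b):
    # pairwise IoU between (N, 4) and (K, 4) boxes, returned as (N, K)
    tl = np.maximum(a[:, None, :2], b[None, :, :2])   # intersection top-left
    br = np.minimum(a[:, None, 2:], b[None, :, 2:])   # intersection bottom-right
    area_i = np.prod(br - tl, axis=2) * (tl < br).all(axis=2)
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    return area_i / (area_a[:, None] + area_b[None, :] - area_i)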
Example #7
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        n = bboxes.shape[0]  # number of input images at a time
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape  # should be (1,3,H,W)
        img_size = (H, W)

        # need more feature maps here when you are trying to use features of different scale
        features = self.faster_rcnn.extractor(imgs)

        rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor = self.faster_rcnn.rpn(
            features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        # different parameters here:
        #     num_boxes : number of ground truth bounding boxes in an image.
        #     num_anchors : number of anchors in the image (i.e. in a feature map).
        #     num_rois : number of RoIs generated by the RPN, to be used in Fast R-CNN.
        bbox = bboxes[0]  # shape (num_boxes, 4)
        label = labels[0]  # shape (num_boxes,)
        rpn_score = rpn_scores[0]  # shape (num_anchors, 2)
        rpn_loc = rpn_locs[0]  # shape (num_anchors, 4)
        roi = rois  # shape (num_rois, 4)
        search_region = search_regions  # shape (num_rois, 4)

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois,
        # consider them as constant input
        sample_roi, sample_search_region, (
            Tx, Ty), gt_roi_label = self.proposal_target_creator(
                roi, search_region, at.tonumpy(bbox), at.tonumpy(label))

        # NOTE: it's all zeros because only batch size 1 is supported for now
        sample_roi_index = t.zeros(len(sample_roi))
        (px, py), roi_score = self.faster_rcnn.head(features, sample_roi,
                                                    sample_search_region,
                                                    sample_roi_index)

        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False),
                        _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = px.shape[0]
        # (px, py) and (Tx, Ty) are used to calculate the loss roi_loc_loss

        Tx = at.tovariable(Tx).float()
        Ty = at.tovariable(Ty).float()

        print("px is ", px)
        # print("max of px is ", t.max(px))
        # print("min of px is ", t.min(px))
        # print(t.max(Tx))
        # print(t.max(Ty))
        # print(Tx.shape, Ty.shape, px.shape, py.shape)

        # convert gt_roi_label before it is used by the losses below
        gt_roi_label = at.tovariable(gt_roi_label).long()

        roi_loc_loss = _LocNet_loss(Tx, Ty, px, py, gt_roi_label.data,
                                    self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False),
                        gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]

        print("losses", losses)

        losses = losses + [sum(losses)]

        return LossTuple(*losses)  # return a namedtuple
    def predict(self, imgs, sizes=None, visualize=False):
        """Detect objects from images.

        This method predicts objects for each image.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in an image. \
               Each bounding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        self.eval()
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()
        for img, size in zip(prepared_imgs, sizes):
            img = t.autograd.Variable(at.totensor(img).float()[None])
            scale = img.shape[3] / size[1]
            # run the forward pass without building a graph
            with t.no_grad():
                roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(
                at.tonumpy(roi).reshape((-1, 4)),
                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores
Example #9
    def predict(self, imgs, sizes=None, visualize=False, prob_thre=0.7):
        """Detect objects from images.

        This method predicts objects for each image.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in an image. \
               Each bounding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        self.eval()

        # sizes is rebuilt here when visualize is True
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]  # reshaped image size
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs

        bboxes = list()
        labels = list()
        scores = list()

        for img, size in zip(prepared_imgs, sizes):
            img = t.autograd.Variable(at.totensor(img).float()[None],
                                      volatile=True)

            # coerce to plain ints in case these values are tensors
            # (torch.Size entries and tuples do not support item assignment)
            size_w = int(size[1]) if t.is_tensor(size[1]) else size[1]
            img_w = int(img.shape[3])

            scale = img_w / size_w

            (px, py), roi_scores, rois, search_regions, _ = self(img,
                                                                 scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            px = px.data
            py = py.data

            roi = at.totensor(rois) / scale
            search_regions = at.totensor(search_regions) / scale

            # Convert to numpy array
            px = at.tonumpy(px)
            py = at.tonumpy(py)
            search_regions = at.tonumpy(search_regions)

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.

            # use px, py and search_regions to generate boxes
            cls_bbox = p2bbox(px, py, search_regions, threshold=prob_thre)
            cls_bbox = at.totensor(cls_bbox)

            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            # print("raw_cls_bbox shape : ", raw_cls_bbox.shape)
            # print("raw_prob : ", raw_prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)

            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()

        return bboxes, labels, scores
Example #10
    def forward(self, imgs, bboxes, labels, scale):
        """
        Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        features = self.faster_rcnn.extractor(imgs)

        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        #print(bboxes)
        
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois, 
        # consider them as constant input
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            at.tonumpy(bbox),
            at.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)
        #print(gt_roi_label)
        #print('got region proposals')
        # NOTE: it's all zeros because only batch size 1 is supported for now
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            features,
            sample_roi,
            sample_roi_index)

        # ------------------ RPN losses -------------------#
        # number of ground-truth boxes; an image may have none
        n_bbox = bbox.shape[0] if len(bbox.shape) > 0 else 0
        if n_bbox > 0:
            gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
                at.tonumpy(bbox),
                anchor,
                img_size)
            #print(gt_rpn_label.shape)
            #print(gt_rpn_label)
            #print(anchor.shape)
            #print(sample_roi.shape)
            #print('got anchor targets')
            gt_rpn_label = at.tovariable(gt_rpn_label).long()
            gt_rpn_loc = at.tovariable(gt_rpn_loc)
            rpn_loc_loss = _fast_rcnn_loc_loss(
                rpn_loc,
                gt_rpn_loc,
                gt_rpn_label.data,
                self.rpn_sigma)
            #print(rpn_loc_loss)
        else:  # if there are no bboxes, there should be no rpn loc loss
            rpn_loc_loss = t.tensor(0.)
            if opt.use_cuda:
                rpn_loc_loss = rpn_loc_loss.cuda()
        #print('got rpn loc loss')
        
        # if no bboxes, all region labels are 0 (background)
  
        if n_bbox == 0:
            gt_rpn_label = t.tensor([0 for i in range(anchor.shape[0])])
        # NOTE: default value of ignore_index is -100 ...
        fg_bg_count = np.unique(gt_rpn_label.detach().cpu(), return_counts=True)[1][1:]
        if opt.reduce_bg_weight:
            # Reweight foreground / background for the case we couldn't sample identical numbers
            rpn_class_weights = 1.0 / fg_bg_count
            rpn_class_weights = t.FloatTensor(rpn_class_weights / np.sum(rpn_class_weights) * 2)
        else:
            rpn_class_weights = None
        if opt.use_cuda:
            rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1,
                                            weight=rpn_class_weights.cuda() if rpn_class_weights is not None else None)
        else:
            rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label, ignore_index=-1, weight=rpn_class_weights)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())
        #print('got rpn class loss')

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        #print(n_sample, gt_roi_label.shape, sample_roi.shape)
        if opt.use_cuda:
            roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), at.totensor(gt_roi_label).long()]
        else:
            roi_loc = roi_cls_loc[t.arange(0, n_sample).long(), at.totensor(gt_roi_label).long()]
        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)

        if n_bbox > 0:
            roi_loc_loss = _fast_rcnn_loc_loss(
                roi_loc.contiguous(),
                gt_roi_loc,
                gt_roi_label.data,
                self.roi_sigma)
        else:  # no roi loc loss if there are no gt bboxes
            roi_loc_loss = t.tensor(0.)
            if opt.use_cuda:
                roi_loc_loss = roi_loc_loss.cuda()
        #print('got roi loc loss')

        if opt.reduce_bg_weight:
            bg_weight = 1.0 / gt_roi_label.size()[0]
            class_weights = t.FloatTensor(np.hstack([bg_weight, np.ones((self.n_fg_class,))]))
        else:
            class_weights = None

        if opt.use_cuda:
            roi_cls_loss = nn.CrossEntropyLoss(weight=class_weights.cuda() if 
                            class_weights is not None else None)(roi_score, gt_roi_label.cuda())
        else:
            roi_cls_loss = nn.CrossEntropyLoss(weight=class_weights)(roi_score, gt_roi_label)

        self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())
        #print('got roi class loss')
        
        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        #print(losses)
        sum_losses = sum(losses)
        #print(sum_losses.type)
        losses = losses + [sum_losses]

        return LossTuple(*losses)
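
The `reduce_bg_weight` branch above builds inverse-frequency class weights for the RPN's background/foreground cross entropy. A quick numeric illustration of that computation (the counts are made up):

import numpy as np
import torch as t

fg_bg_count = np.array([220., 36.])    # e.g. 220 background and 36 foreground labels
w = 1.0 / fg_bg_count                  # inverse frequency
w = t.FloatTensor(w / np.sum(w) * 2)   # normalized so the two weights sum to 2
print(w)                               # roughly [0.28, 1.72]: the rare class weighs more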
Example #11
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        features = self.faster_rcnn.extractor(imgs)

        rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(features, img_size, scale)
        """
        rpn_locs.shape, rpn_scores.shape, rois.shape, roi_indices.shape, anchor.shape = 
        (torch.Size([1, 18648, 4]),
         torch.Size([1, 18648, 2]),
         (1714, 4),
         (1714,),
         (18648, 4))
         rpn网络做的事情是:
         对于每张图片,利用它的feature map, 
        计算 (H/16)× (W/16)×9(大概20000)个anchor属于前景或背景的概率(rpn_scores),
        以及对应的网络预测的需要修正的位置参数(rpn_locs)。
        然后,对于每张图片,根据前面算出来的前景的概率(rpn_fg_scores),
        选取概率较大的12000个anchor,
        利用回归的位置参数(rpn_locs),修正这12000个anchor的位置,得到RoIs
        利用非极大值((Non-maximum suppression, NMS)抑制,选出概率最大的2000个RoIs
            
        注意:在inference的时候,为了提高处理速度,12000和2000分别变为6000和300.          
        """
        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois
        """
        bbox.shape,label.shape,rpn_score.shape,rpn_loc.shape,roi.shape = 
        (torch.Size([2, 4]),
         torch.Size([2]),
         torch.Size([16650, 2]),
         torch.Size([16650, 4]),
         (2000, 4))
        """

        # Sample RoIs and forward
        # it's fine to break the computation graph of rois, 
        # consider them as constant input
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            at.tonumpy(bbox),  # at = array_tool; tensor-to-numpy (no longer needed in pytorch 0.4)
            at.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)
        """
        sample_roi.shape, gt_roi_loc.shape, gt_roi_label.shape = 
        ((128, 4), (128, 4), (128,))
        proposal_target_creator的作用是:
        RPN会产生大约2000个RoIs,这2000个RoIs不是都拿去训练,
        而是利用ProposalTargetCreator 选择128个RoIs用以训练。选择的规则如下:
        RoIs和gt_bboxes 的IoU大于0.5的,选择一些(比如32个)
        选择 RoIs和gt_bboxes的IoU小于0.5,同时大于等于0(或者0.1)的选择一些(比如 128-32=96个)作为负样本
        为了便于训练,对选择出的128个RoIs,还对他们的gt_roi_loc 进行标准化处理(减去均值除以标准差)
        最终输出128个roi框及其分别对应的需要修正的[ty,tx,th,tw]和label
        """
        # NOTE: it's all zeros because only batch size 1 is supported for now
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            features,
            sample_roi,
            sample_roi_index)
        """
        x.shape, rois.shape, roi_indices.shape = (torch.Size([1, 512, 37, 56]), (128, 4), torch.Size([128]))
        ROIHEAD做的事情是根据前面得到的128个roi框,
        去feature上分别做roi pool,
        得到[128,512,7,7]的最终信息
        相当于每一个roi框,不管他有多大,
        统统roi pool到[512,7,7]
        再然后就是几个linear layer,
        从512*7*7 = 25088 得到 21维的class score 和 84维的roi_cls_locs
        最终输出是[128, 84],[128, 21]
        """
        # ------------------ RPN losses -------------------#
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox),
            anchor,
            img_size)
        """
        所以总的来说,AnchorTargetCreator做的事情是:
        根据每一个预先设定的anchor和这张图片的gt_bbox去计算iou,
        再用求得的iou来给每一个anchor打标签,
        1是正样本,0是负样本,-1表示不关心,不参与后续计算
        打标签是通过
        正负样本之和应该是self.n_sample,比例是self.pos_ratio
        打标签的依据是:
        1. iou < 0.3的都算负样本
        2. 对每一个gt_object,标记和它iou最高的的anchor为正样本
            可能同时有多个anchor同时iou最高(相等)
        3. 剩下的anchor里面,iou大于0.7的也算正样本
        4. 还要平衡一下正负样本的数量和比例
        
        它不但打标签,还会计算每一个anchor和它最匹配的gt_bbox的loc,
        用于后续的bbox回归loss计算
        最后,返回的是loc和label # ((16650,), (16650, 4))
        """
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        # gt_rpn_loc.shape, gt_rpn_label.shape : ((18648, 4), (18648,))
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,
            gt_rpn_loc,
            gt_rpn_label.data,
            self.rpn_sigma) # loss value

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
        """
        rpn_score.shape,gt_rpn_label.shape : (torch.Size([15318, 2]), torch.Size([15318]))
        
         ignore_index (int, optional): Specifies a target value that is ignored
            and does not contribute to the input gradient. When :attr:`size_average` is
            ``True``, the loss is averaged over non-ignored targets. Default: -100
        """
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        # _gt_rpn_label.shape,_rpn_score.shape : (torch.Size([256]), (256, 2))
        
#         self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0] # 128
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4) 
        #torch.Size([128, 84]) to torch.Size([128, 21, 4])
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
                              at.totensor(gt_roi_label).long()]
        # of the 21 classes' locs, take the one for the class the gt
        # specifies, i.e. gt_roi_label
        # torch.Size([128, 4])
        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

#         self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
    def predict(self, imgs, sizes=None, visualize=False):
        """Detect objects from images.

        This method predicts objects for each image.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in an image. \
               Each bounding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        self.eval()
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()
        for img, size in zip(prepared_imgs, sizes):
            img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True)
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores
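
The RPN walkthrough in this example mentions trimming the 12000 score-sorted proposals down to 2000 with non-maximum suppression. A plain-NumPy sketch of that greedy NMS step over (ymin, xmin, ymax, xmax) boxes:

import numpy as np

def nms_sketch(boxes, scores, thresh):
    # greedy NMS: repeatedly keep the highest-scoring box and drop
    # every remaining box that overlaps it with IoU above thresh
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        tl = np.maximum(boxes[i, :2], boxes[order[1:], :2])
        br = np.minimum(boxes[i, 2:], boxes[order[1:], 2:])
        inter = np.prod(np.clip(br - tl, 0, None), axis=1)
        area_i = np.prod(boxes[i, 2:] - boxes[i, :2])
        areas = np.prod(boxes[order[1:], 2:] - boxes[order[1:], :2], axis=1)
        iou = inter / (area_i + areas - inter)
        order = order[1:][iou <= thresh]
    return np.array(keep)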
Example #13
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.
        """
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        features = self.faster_rcnn.extractor(imgs)

        # 20000->12000->2000 rois
        rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
            features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # ProposalTargetCreator
        # when training the RoIHead/Fast R-CNN, select 128 of the 2000 rois for training.
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi, at.tonumpy(bbox), at.tonumpy(label), self.loc_normalize_mean,
            self.loc_normalize_std)

        # NOTE: it's all zeros because only batch size 1 is supported for now
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi,
                                                       sample_roi_index)

        # ------------------ RPN losses -------------------#
        #        AnchorTargetCreator
        #        when training the RPN, select 256 of the 20000 anchors so that
        #        the positive/negative ratio is roughly 1:1, and also produce
        #        the localization targets used for training.

        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)

        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)

        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # NOTE: default value of ignore_index is -100 ...
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label,
                                       ignore_index=-1)

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)

        roi_loc = roi_cls_loc[t.arange(0, n_sample).long(),
                              at.totensor(gt_roi_label).long()]

        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(), gt_roi_loc,
                                           gt_roi_label.data, self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label)

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]

        # the printed values are not averaged losses
        print('rpnLoc:', float(rpn_loc_loss.data.numpy()), ' rpnCls:',
              float(rpn_cls_loss.data.numpy()), ' roiLoc:',
              float(roi_loc_loss.data.numpy()), ' roiCls:',
              float(roi_cls_loss.data.numpy()))
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
Example #14
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores

    def get_optimizer(self):
        """
Example #15
    def predict(self, imgs, sizes=None, visualize=False):
        """

        Detect objects from images.
        This method predicts objects for each image.

        """
        self.eval()
        self.use_preset('evaluate')
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
        bboxes = list()
        labels = list()
        scores = list()
        for img, size in zip(prepared_imgs, sizes):
            img = t.autograd.Variable(at.totensor(img).float()[None],
                                      volatile=True)
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean).cuda(). \
                repeat(self.n_class)[None]
            std = t.Tensor(self.loc_normalize_std).cuda(). \
                repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(
                at.tonumpy(roi).reshape((-1, 4)),
                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)

        self.use_preset('evaluate')
        self.train()
        return bboxes, labels, scores
Example #16
    def forward(self, imgs, bboxes, labels, scale):
        """Forward Faster R-CNN and calculate losses.

        Here are notations used.

        * :math:`N` is the batch size.
        * :math:`R` is the number of bounding boxes per image.

        Currently, only :math:`N=1` is supported.

        Args:
            imgs (~torch.autograd.Variable): A variable with a batch of images.
            bboxes (~torch.autograd.Variable): A batch of bounding boxes.
                Its shape is :math:`(N, R, 4)`.
            labels (~torch.autograd.Variable): A batch of labels.
                Its shape is :math:`(N, R)`. The background is excluded from
                the definition, which means that the range of the value
                is :math:`[0, L - 1]`. :math:`L` is the number of foreground
                classes.
            scale (float): Amount of scaling applied to
                the raw image during preprocessing.

        Returns:
            namedtuple of 5 losses
        """
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        # the extractor here is the first 10 layers of VGG16; it produces the feature map
        features = self.faster_rcnn.extractor(imgs)

        # ------------------ RPN Network -------------------#
        # ------------------ RPN prediction -------------------#
        # extract RoIs with the RPN network
        # rpn_locs: correction for each anchor, [1, 9*hh*ww, 4]
        # rpn_scores: binary (object or not) score for each anchor, [1, 9*hh*ww, 2]
        # rois: RoIs (candidate regions) from the RPN, about 2000 at training time, [2000, 4]
        # roi_indices: [0, 0, ..., 0]? same length as rois, never used afterwards
        # -- answer --: all zeros because only batch size 1 is supported;
        #    the index is the position within the batch
        # rpn_locs and rpn_scores feed the training loss; the rois are handed
        # to the rcnn network below for classification
        # note that location and class are predicted for every anchor,
        # i.e. for all 9*hh*ww of them
        rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
            features, img_size, scale)

        # Since batch size is one, convert variables to singular form
        # since only batch size 1 is supported, index out the first element directly
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]  # [n_anchor,2]
        rpn_loc = rpn_locs[0]  # [n_anchor,4]
        roi = rois

        # ------------------ RPN labeling -------------------#
        # since the RPN predicts for all (9*hh*ww) anchors, gt_rpn_loc and
        # gt_rpn_label should contain the matching values for every anchor,
        # but in practice only 256 sampled positive/negative anchors enter the loss
        # convention: positive samples get label=1, negatives label=0, and
        # invalid or ignored samples label=-1, weighted apart in the loss
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox), anchor, img_size)
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)

        # ------------------ RPN loss computation -------------------#
        # loc loss (location regression loss)
        # the loc loss is computed over positive samples only
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc,
                                           gt_rpn_label.data, self.rpn_sigma)

        # cls loss (classification loss; only two classes here)
        # samples with label=-1 are ignored
        rpn_cls_loss = F.cross_entropy(rpn_score,
                                       gt_rpn_label.cuda(),
                                       ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False),
                        _gt_rpn_label.data.long())

        # ------------------ RoI network -------------------#
        # ------------------ RoI labeling -------------------#
        # Sample RoIs and forward
        # it's fine to break the computation graph of rois,
        # consider them as constant input
        # sample some positive/negative RoIs for training the RoIHead (rcnn)
        # classifier
        # gt_roi_loc: location corrections; this is the second round of box regression
        # gt_roi_label: N+1 classes, one extra for background (object or not)
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi, at.tonumpy(bbox), at.tonumpy(label), self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE: it's all zeros because only batch size 1 is supported for now
        # (this answers the roi_indices question above)
        sample_roi_index = t.zeros(len(sample_roi))

        # ------------------ RoI prediction -------------------#
        # predictions are not needed for every RoI, so prediction happens
        # after the samples are fixed in the labeling stage, yielding the
        # predicted classes roi_score and the predicted location
        # corrections roi_cls_loc for the sampled regions sample_roi
        roi_cls_loc, roi_score = self.faster_rcnn.head(features, sample_roi,
                                                       sample_roi_index)

        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1,
                                       4)  # [n_sample, n_class + 1, 4]
        # roi_cls_loc predicts coordinates for every class, but the loss only
        # needs the prediction for the ground-truth class;
        # roi_loc is that per-ground-truth-class location prediction
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                              at.totensor(gt_roi_label).long()]  # [n_sample, 4]
        gt_roi_label = at.tovariable(gt_roi_label).long()
        gt_roi_loc = at.tovariable(gt_roi_loc)

        # loc loss (location regression loss)
        roi_loc_loss = _fast_rcnn_loc_loss(roi_loc.contiguous(), gt_roi_loc,
                                           gt_roi_label.data, self.roi_sigma)

        # cls loss (classification loss; 21 classes here)
        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False),
                        gt_roi_label.data.long())

        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)
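
The indexing `roi_cls_loc[t.arange(0, n_sample), gt_roi_label]` used throughout these forwards picks, for every sampled RoI, the four regression values belonging to its ground-truth class. A tiny standalone illustration:

import torch as t

n_sample, n_class = 4, 3
# fake per-class regressions: shape (n_sample, n_class, 4)
roi_cls_loc = t.arange(n_sample * n_class * 4, dtype=t.float32).view(n_sample, n_class, 4)
gt_roi_label = t.tensor([2, 0, 1, 2])
# row i of the result is roi_cls_loc[i, gt_roi_label[i]]
roi_loc = roi_cls_loc[t.arange(n_sample), gt_roi_label]
print(roi_loc.shape)  # torch.Size([4, 4])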