def forward(self, img_size, x, rois, roi_indices):
    """Run the RoI head on a feature map and a set of proposal boxes.

    The boxes are rescaled from image coordinates to feature-map
    coordinates, pooled by ``self.roi``, flattened, passed through
    ``self.classifier`` and then through ``self.cls_loc`` and
    ``self.score``.

    We assume that there are N images in the batch.

    Args:
        img_size (tuple): ``(height, width)`` of the input image, used to
            rescale the boxes to the feature map.
        x (Variable): 4D feature variable. Axes 2 and 3 are read as the
            feature-map height and width.
        rois (Tensor): Proposal boxes of all images concatenated, shape
            ``(R', 4)`` where ``R'`` is the total number of RoIs.
            Columns 0 and 2 are scaled by the height ratio and columns 1
            and 3 by the width ratio. The caller's array is not modified.
        roi_indices (Tensor): Index of the image each box belongs to,
            shape ``(R',)``.

    Returns:
        tuple: ``(roi_cls_locs, roi_scores)``, the outputs of
        ``self.cls_loc`` and ``self.score``.
    """
    img_h, img_w = img_size
    feature_h, feature_w = x.shape[2], x.shape[3]

    # roi_indices may arrive as an ndarray, so normalise it to a tensor.
    roi_indices = at.totensor(roi_indices).int()

    # Work on a private copy: the converted tensor may share memory with
    # the caller's ``rois`` (a tensor built from a numpy array, or a tensor
    # that is already float), and the rescaling below writes in place.
    rois = at.totensor(rois).float().clone()
    rois[:, 0] = rois[:, 0] / img_h * feature_h
    rois[:, 2] = rois[:, 2] / img_h * feature_h
    rois[:, 1] = rois[:, 1] / img_w * feature_w
    rois[:, 3] = rois[:, 3] / img_w * feature_w

    rois = at.tovariable(rois)
    roi_indices = at.tovariable(roi_indices)
    pool = self.roi(x, rois, roi_indices)
    # Flatten everything but the RoI axis before the fully connected layers.
    pool = pool.view(pool.size(0), -1)
    fc7 = self.classifier(pool)
    roi_cls_locs = self.cls_loc(fc7)
    roi_scores = self.score(fc7)
    return roi_cls_locs, roi_scores
def forward(self, imgs, bboxes, labels, scale):
    """Run a Faster R-CNN training forward pass and compute the losses.

    Only a batch size of 1 is supported.

    Args:
        imgs (~torch.autograd.Variable): A batch of images, unpacked as a
            4D shape whose last two axes are height and width.
        bboxes (~torch.autograd.Variable): A batch of bounding boxes with
            shape ``(N, R, 4)``.
        labels (~torch.autograd.Variable): A batch of labels with shape
            ``(N, R)``. Background is excluded, so values lie in
            ``[0, L - 1]`` where ``L`` is the number of foreground classes.
        scale (float): Amount of scaling applied to the raw image during
            preprocessing.

    Returns:
        LossTuple: the RPN localisation loss, RPN classification loss,
        RoI localisation loss, RoI classification loss and their sum.

    Raises:
        ValueError: if the batch size is not 1.
    """
    batch_size = bboxes.shape[0]
    if batch_size != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, height, width = imgs.shape
    img_size = (height, width)

    features = self.faster_rcnn.extractor(imgs)
    rpn_output = self.faster_rcnn.rpn(features, img_size, scale)
    rpn_locs, rpn_scores, rois, _roi_indices, anchor = rpn_output

    # Batch size is one, so drop the batch axis.
    gt_bbox = bboxes[0]
    gt_label = labels[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]

    # Sample RoIs for the head. The proposals are treated as constant
    # inputs, so breaking their computation graph is fine.
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        rois,
        at.tonumpy(gt_bbox),
        at.tonumpy(gt_label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # All zeros because only batch size 1 is supported.
    sample_roi_index = t.zeros(len(sample_roi))
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        features, sample_roi, sample_roi_index)

    # ------------------ RPN losses -------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(gt_bbox), anchor, img_size)
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_loc, gt_rpn_loc, gt_rpn_label.data, self.rpn_sigma)

    # Labels of -1 are ignored (the default ignore_index would be -100).
    rpn_cls_loss = F.cross_entropy(
        rpn_score, gt_rpn_label.cuda(), ignore_index=-1)

    # Feed only the non-ignored anchors to the RPN confusion meter.
    kept_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    kept_rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(kept_rpn_score, False),
                    kept_rpn_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # For every sampled RoI pick the 4 offsets predicted for its target class.
    sample_axis = t.arange(0, n_sample).long().cuda()
    class_axis = at.totensor(gt_roi_label).long()
    roi_loc = roi_cls_loc[sample_axis, class_axis]

    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    roi_loc_loss = _fast_rcnn_loc_loss(
        roi_loc.contiguous(), gt_roi_loc, gt_roi_label.data, self.roi_sigma)

    roi_criterion = nn.CrossEntropyLoss()
    roi_cls_loss = roi_criterion(roi_score, gt_roi_label.cuda())

    self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

    partial_losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    total_loss = sum(partial_losses)
    return LossTuple(*(partial_losses + [total_loss]))
def predict(self, imgs, sizes=None, visualize=False):
    """Detect objects from images.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images, with
            values in ``[0, 255]``. When ``visualize`` is true they are
            passed through ``preprocess``; otherwise they are used as given.
        sizes (iterable): Per-image size whose index 0 is used as the
            vertical clip bound and index 1 as the horizontal clip bound
            and scale reference. Required when ``visualize`` is false;
            ignored and recomputed from ``img.shape[1:]`` when it is true.
        visualize (bool): If true, switch to the ``'visualize'`` preset and
            preprocess the images here.

    Returns:
        tuple of lists: ``(bboxes, labels, scores)``, one entry per image,
        as produced by ``self._suppress``.

    Raises:
        TypeError: if ``sizes`` is ``None`` while ``visualize`` is false.
    """
    # Fail before touching the model state. Without this check the default
    # arguments reach zip(prepared_imgs, None) and die with an opaque
    # TypeError after the model has already been switched to eval mode.
    if not visualize and sizes is None:
        raise TypeError(
            "predict() requires 'sizes' when visualize is False")

    self.eval()
    if visualize:
        # Original author's note: "originally 'visualize' / 'evaluate'".
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs

    bboxes = list()
    labels = list()
    scores = list()
    for img, size in zip(prepared_imgs, sizes):
        # NOTE(review): `volatile` is a pre-0.4 PyTorch flag; on newer
        # versions use torch.no_grad() instead. Confirm the target version.
        img = t.autograd.Variable(at.totensor(img).float()[None],
                                  volatile=True)
        scale = img.shape[3] / size[1]
        # Calling self(...) dispatches to the forward method.
        roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
        # We are assuming that batch size is 1.
        roi_score = roi_scores.data
        roi_cls_loc = roi_cls_loc.data
        roi = at.totensor(rois) / scale

        # Convert predictions to bounding boxes in image coordinates.
        # Bounding boxes are scaled to the scale of the input images.
        mean = t.Tensor(self.loc_normalize_mean).repeat(self.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).repeat(self.n_class)[None]

        # Undo the normalisation of the predicted offsets.
        roi_cls_loc = (roi_cls_loc * std + mean)
        roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
        roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
        cls_bbox = loc2bbox(
            at.tonumpy(roi).reshape((-1, 4)),
            at.tonumpy(roi_cls_loc).reshape((-1, 4)))
        cls_bbox = at.totensor(cls_bbox)
        cls_bbox = cls_bbox.view(-1, self.n_class * 4)
        # Clip bounding boxes to the image.
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

        # Softmax turns the raw class scores into probabilities. One could
        # inspect the largest probability here and return early when it is
        # too small. Original author's note: prob was 300 x 21 with
        # np.sum(prob) = 300.
        prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)

        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')
    self.train()
    return bboxes, labels, scores
def forward(self, imgs, bboxes, labels, scale):
    """Compute the Faster R-CNN training losses for one image.

    Args:
        imgs: A batch of images, unpacked as a 4D shape whose last two
            axes are height and width.
        bboxes: A batch of ground-truth boxes; axis 0 is the batch axis
            and must have length 1.
        labels: A batch of ground-truth labels; only ``labels[0]`` is used.
        scale (float): Forwarded unchanged to the RPN.

    Returns:
        LossTuple: the RPN localisation loss, RPN classification loss,
        RoI localisation loss, RoI classification loss and their sum.

    Raises:
        ValueError: if the batch size is not 1.
    """
    if bboxes.shape[0] != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, img_h, img_w = imgs.shape
    img_size = (img_h, img_w)

    # Feature extraction followed by the region proposal network.
    features = self.faster_rcnn.extractor(imgs)
    rpn_locs, rpn_scores, rois, _roi_indices, anchor = self.faster_rcnn.rpn(
        features, img_size, scale)

    # Batch size is one, so work on the single sample directly.
    bbox, label = bboxes[0], labels[0]
    rpn_score, rpn_loc = rpn_scores[0], rpn_locs[0]

    # Sample the RoIs used to train the head.
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        rois,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # Every sampled RoI belongs to image 0 because only batch size 1 is
    # supported.
    sample_roi_index = t.zeros(len(sample_roi))

    # Faster R-CNN head.
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        features, sample_roi, sample_roi_index)

    # ------------------ RPN losses -------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox), anchor, img_size)
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_loc, gt_rpn_loc, gt_rpn_label.data, self.rpn_sigma)

    # Labels of -1 are ignored (the default ignore_index would be -100).
    rpn_cls_loss = F.cross_entropy(
        rpn_score, gt_rpn_label.cuda(), ignore_index=-1)

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # Select, per sampled RoI, the 4 offsets of its target class.
    rows = t.arange(0, n_sample).long().cuda()
    cols = at.totensor(gt_roi_label).long()
    roi_loc = roi_cls_loc[rows, cols]

    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    roi_loc_loss = _fast_rcnn_loc_loss(
        roi_loc.contiguous(),
        gt_roi_loc.float(),
        gt_roi_label.data,
        self.roi_sigma)

    criterion = nn.CrossEntropyLoss()
    roi_cls_loss = criterion(roi_score, gt_roi_label.cuda())

    loss_terms = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    loss_terms = loss_terms + [sum(loss_terms)]
    return LossTuple(*loss_terms)
def predict(self, imgs, sizes=None, visualize=False):
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images. All
            images are in CHW and RGB format with values in ``[0, 255]``.
            When ``visualize`` is true they are passed through
            ``preprocess``; otherwise they are used as given.
        sizes (iterable): Per-image size whose index 0 is used as the
            vertical clip bound and index 1 as the horizontal clip bound
            and scale reference. Required when ``visualize`` is false;
            ignored and recomputed from ``img.shape[1:]`` when it is true.
        visualize (bool): If true, switch to the ``'visualize'`` preset and
            preprocess the images here.

    Returns:
        tuple of lists: ``(bboxes, labels, scores)``.

        * **bboxes**: A list of float arrays of shape ``(R, 4)``, where
          ``R`` is the number of bounding boxes in an image. Each box is
          ``(y_min, x_min, y_max, x_max)`` along the second axis.
        * **labels**: A list of integer arrays of shape ``(R,)`` with
          values in ``[0, L - 1]``, ``L`` being the number of foreground
          classes.
        * **scores**: A list of float arrays of shape ``(R,)`` giving the
          confidence of each prediction.

    Raises:
        TypeError: if ``sizes`` is ``None`` while ``visualize`` is false.
    """
    # Fail before touching the model state. Without this check the default
    # arguments reach zip(prepared_imgs, None) and die with an opaque
    # TypeError after the model has already been switched to eval mode.
    if not visualize and sizes is None:
        raise TypeError(
            "predict() requires 'sizes' when visualize is False")

    # Evaluation mode only affects modules such as Dropout or BatchNorm.
    self.eval()
    if visualize:
        # Evaluation and visualisation use different NMS and score
        # thresholds. Original author's note: the 'visualize' preset sets
        # self.nms_thresh = 0.3 and self.score_thresh = 0.7.
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs

    bboxes = list()
    labels = list()
    scores = list()
    # Original author's note: size was e.g. [600, 800].
    for img, size in zip(prepared_imgs, sizes):
        # Add a leading batch axis (e.g. [3, 600, 800] -> [1, 3, 600, 800]
        # per the original notes) and wrap the tensor as a Variable.
        # NOTE(review): `volatile` is a pre-0.4 PyTorch flag; on newer
        # versions use torch.no_grad() instead. Confirm the target version.
        img = t.autograd.Variable(at.totensor(img).float()[None],
                                  volatile=True)
        # Ratio between the prepared image width and the reference width
        # (1 in the original author's run).
        scale = img.shape[3] / size[1]
        roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
        # We are assuming that batch size is 1.
        roi_score = roi_scores.data
        roi_cls_loc = roi_cls_loc.data
        roi = at.totensor(rois) / scale

        # Convert predictions to bounding boxes in image coordinates.
        # Bounding boxes are scaled to the scale of the input images.
        mean = t.Tensor(self.loc_normalize_mean).cuda(). \
            repeat(self.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).cuda(). \
            repeat(self.n_class)[None]

        # Undo the normalisation of the predicted offsets.
        roi_cls_loc = (roi_cls_loc * std + mean)
        roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
        roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
        cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                            at.tonumpy(roi_cls_loc).reshape((-1, 4)))
        cls_bbox = at.totensor(cls_bbox)
        cls_bbox = cls_bbox.view(-1, self.n_class * 4)
        # Clip bounding boxes to the image.
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

        prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)

        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')
    self.train()
    return bboxes, labels, scores
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate the training losses.

    Notation: ``N`` is the batch size and ``R`` the number of bounding
    boxes per image. Currently only ``N = 1`` is supported.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images.
        bboxes (~torch.autograd.Variable): A batch of ground-truth
            (hand-annotated) bounding boxes with shape ``(N, R, 4)``.
        labels (~torch.autograd.Variable): A batch of labels with shape
            ``(N, R)``. Background is excluded, so values lie in
            ``[0, L - 1]`` where ``L`` is the number of foreground classes.
        scale (float): Amount of scaling applied to the raw image during
            preprocessing.

    Returns:
        LossTuple: the RPN localisation loss, RPN classification loss,
        RoI localisation loss, RoI classification loss and their sum.

    Raises:
        ValueError: if the batch size is not 1.
    """
    # Only a batch of one image is supported.
    batch_size = bboxes.shape[0]
    if batch_size != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    # Height and width of the input image.
    _, _, img_h, img_w = imgs.shape
    img_size = (img_h, img_w)

    # Feature extraction (the original notes mention a pre-trained VGG16).
    features = self.faster_rcnn.extractor(imgs)

    # Region proposal network. Example shapes from the original author's
    # notes: rpn_locs [1, 17316, 4], rpn_scores [1, 17316, 2],
    # rois [2000, 4], roi_indices [2000] (all zero), anchor [17316, 4].
    rpn_locs, rpn_scores, rois, _roi_indices, anchor = self.faster_rcnn.rpn(
        features, img_size, scale)

    # Batch size is one, so drop the leading batch axis.
    bbox, label = bboxes[0], labels[0]
    rpn_score, rpn_loc = rpn_scores[0], rpn_locs[0]

    # Sample RoIs and forward. The proposals are treated as constant
    # inputs, so breaking their computation graph is fine. Per the original
    # notes, about 2000 proposals are reduced to 128 training samples.
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        rois,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # All zeros because only batch size 1 is supported.
    sample_roi_index = t.zeros(len(sample_roi))

    # RoI head: classifies each sampled RoI and refines its box. Example
    # shapes from the original notes: roi_cls_loc [128, 84],
    # roi_score [128, 21] (20 classes plus background).
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        features, sample_roi, sample_roi_index)

    # ------------------ RPN losses -------------------#
    # Assign the ground-truth boxes to the anchors to obtain per-anchor
    # regression targets and labels.
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox), anchor, img_size)
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)

    # RPN localisation loss.
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_loc, gt_rpn_loc, gt_rpn_label.data, self.rpn_sigma)

    # RPN classification loss (cross entropy). Labels of -1 are ignored;
    # the default ignore_index would be -100.
    rpn_cls_loss = F.cross_entropy(
        rpn_score, gt_rpn_label.cuda(), ignore_index=-1)

    # Record the non-ignored anchors in the RPN confusion matrix.
    kept_label = gt_rpn_label[gt_rpn_label > -1]
    kept_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(kept_score, False), kept_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]
    # Regroup the flat regression output into 4 offsets per class.
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # Keep, for each sample, the offsets of its target class.
    sample_axis = t.arange(0, n_sample).long().cuda()
    class_axis = at.totensor(gt_roi_label).long()
    roi_loc = roi_cls_loc[sample_axis, class_axis]

    # Ground-truth label and regression target of each sampled RoI.
    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    # RoI localisation loss; contiguous() makes the indexed tensor
    # contiguous in memory.
    roi_loc_loss = _fast_rcnn_loc_loss(
        roi_loc.contiguous(), gt_roi_loc, gt_roi_label.data, self.roi_sigma)

    # RoI classification loss (cross entropy).
    roi_criterion = nn.CrossEntropyLoss()
    roi_cls_loss = roi_criterion(roi_score, gt_roi_label.cuda())

    # Record the RoI predictions in the RoI confusion matrix.
    self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

    # Four individual losses followed by their sum.
    loss_terms = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    total_loss = sum(loss_terms)
    return LossTuple(*(loss_terms + [total_loss]))
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN (search-region variant) and calculate losses.

    Notation: ``N`` is the batch size and ``R`` the number of bounding
    boxes per image. Currently only ``N = 1`` is supported.

    Compared with the plain trainer, the RPN here also returns search
    regions, the head returns a ``(px, py)`` pair instead of box offsets,
    and the RoI localisation loss is computed by ``_LocNet_loss`` against
    the ``(Tx, Ty)`` targets from ``self.proposal_target_creator``.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images.
        bboxes (~torch.autograd.Variable): A batch of bounding boxes with
            shape ``(N, R, 4)``.
        labels (~torch.autograd.Variable): A batch of labels with shape
            ``(N, R)``. Background is excluded, so values lie in
            ``[0, L - 1]`` where ``L`` is the number of foreground classes.
        scale (float): Amount of scaling applied to the raw image during
            preprocessing.

    Returns:
        LossTuple: the RPN localisation loss, RPN classification loss,
        RoI localisation loss, RoI classification loss and their sum.

    Raises:
        ValueError: if the batch size is not 1.
    """
    n = bboxes.shape[0]  # number of images fed at once
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    # More feature maps would be needed here to use multi-scale features.
    features = self.faster_rcnn.extractor(imgs)

    rpn_locs, rpn_scores, rois, search_regions, roi_indices, anchor = \
        self.faster_rcnn.rpn(features, img_size, scale)

    # Since batch size is one, convert variables to singular form.
    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]
    roi = rois
    search_region = search_regions

    # Sample RoIs and forward. It is fine to break the computation graph
    # of rois; they are treated as constant input.
    sample_roi, sample_search_region, (Tx, Ty), gt_roi_label = \
        self.proposal_target_creator(
            roi, search_region, at.tonumpy(bbox), at.tonumpy(label))

    # All zeros because only batch size 1 is supported.
    sample_roi_index = t.zeros(len(sample_roi))
    (px, py), roi_score = self.faster_rcnn.head(
        features, sample_roi, sample_search_region, sample_roi_index)

    # ------------------ RPN losses -------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox), anchor, img_size)
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_loc, gt_rpn_loc, gt_rpn_label.data, self.rpn_sigma)

    # Labels of -1 are ignored (the default ignore_index would be -100).
    rpn_cls_loss = F.cross_entropy(
        rpn_score, gt_rpn_label.cuda(), ignore_index=-1)

    # Feed only the non-ignored anchors to the RPN confusion meter.
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(_rpn_score, False),
                    _gt_rpn_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    # (px, py) and (Tx, Ty) are the inputs of the RoI localisation loss.
    Tx = at.tovariable(Tx).float()
    Ty = at.tovariable(Ty).float()

    # NOTE(review): gt_roi_label.data is read before gt_roi_label is
    # converted with at.tovariable. If the target creator returns a numpy
    # array this passes a raw buffer to _LocNet_loss; verify this is intended.
    roi_loc_loss = _LocNet_loss(
        Tx, Ty, px, py, gt_roi_label.data, self.roi_sigma)

    gt_roi_label = at.tovariable(gt_roi_label).long()
    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

    self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]

    return LossTuple(*losses)
def predict(self, imgs, sizes=None, visualize=False):
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images. All
            images are in CHW and RGB format with values in ``[0, 255]``.
            When ``visualize`` is true they are passed through
            ``preprocess``; otherwise they are used as given.
        sizes (iterable): Per-image size whose index 0 is used as the
            vertical clip bound and index 1 as the horizontal clip bound
            and scale reference. Required when ``visualize`` is false;
            ignored and recomputed from ``img.shape[1:]`` when it is true.
        visualize (bool): If true, switch to the ``'visualize'`` preset and
            preprocess the images here.

    Returns:
        tuple of lists: ``(bboxes, labels, scores)``.

        * **bboxes**: A list of float arrays of shape ``(R, 4)``, where
          ``R`` is the number of bounding boxes in an image. Each box is
          ``(y_min, x_min, y_max, x_max)`` along the second axis.
        * **labels**: A list of integer arrays of shape ``(R,)`` with
          values in ``[0, L - 1]``, ``L`` being the number of foreground
          classes.
        * **scores**: A list of float arrays of shape ``(R,)`` giving the
          confidence of each prediction.

    Raises:
        TypeError: if ``sizes`` is ``None`` while ``visualize`` is false.
    """
    # Fail before touching the model state. Without this check the default
    # arguments reach zip(prepared_imgs, None) and die with an opaque
    # TypeError after the model has already been switched to eval mode.
    if not visualize and sizes is None:
        raise TypeError(
            "predict() requires 'sizes' when visualize is False")

    self.eval()
    if visualize:
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs

    bboxes = list()
    labels = list()
    scores = list()
    for img, size in zip(prepared_imgs, sizes):
        # [None] adds the leading batch axis.
        img = t.autograd.Variable(at.totensor(img).float()[None],
                                  requires_grad=True)
        # Inference only: no autograd graph is recorded for the forward pass.
        with t.no_grad():
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
        # We are assuming that batch size is 1.
        roi_score = roi_scores.data
        roi_cls_loc = roi_cls_loc.data
        roi = at.totensor(rois) / scale

        # Convert predictions to bounding boxes in image coordinates.
        # Bounding boxes are scaled to the scale of the input images.
        mean = t.Tensor(self.loc_normalize_mean).cuda(). \
            repeat(self.n_class)[None]
        std = t.Tensor(self.loc_normalize_std).cuda(). \
            repeat(self.n_class)[None]

        # Undo the normalisation of the predicted offsets.
        roi_cls_loc = (roi_cls_loc * std + mean)
        roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
        roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
        cls_bbox = loc2bbox(
            at.tonumpy(roi).reshape((-1, 4)),
            at.tonumpy(roi_cls_loc).reshape((-1, 4)))
        cls_bbox = at.totensor(cls_bbox)
        cls_bbox = cls_bbox.view(-1, self.n_class * 4)
        # Clip bounding boxes to the image.
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

        prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)

        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')
    self.train()
    return bboxes, labels, scores
def predict(self, imgs, sizes=None, visualize=False, prob_thre=0.7):
    """Detect objects from images (search-region variant).

    This method predicts objects for each image. Boxes are decoded from
    the ``(px, py)`` head outputs and the search regions by ``p2bbox``.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images. All
            images are in CHW and RGB format with values in ``[0, 255]``.
            When ``visualize`` is true they are passed through
            ``preprocess``; otherwise they are used as given.
        sizes (iterable): Per-image size whose index 0 is used as the
            vertical clip bound and index 1 as the horizontal clip bound
            and scale reference. Entries may be Python numbers or tensors.
            Required when ``visualize`` is false; ignored and recomputed
            from ``img.shape[1:]`` when it is true. The caller's
            containers are not modified.
        visualize (bool): If true, switch to the ``'visualize'`` preset and
            preprocess the images here.
        prob_thre (float): Forwarded to ``p2bbox`` as ``threshold``.

    Returns:
        tuple of lists: ``(bboxes, labels, scores)``.

        * **bboxes**: A list of float arrays of shape ``(R, 4)``, where
          ``R`` is the number of bounding boxes in an image. Each box is
          ``(y_min, x_min, y_max, x_max)`` along the second axis.
        * **labels**: A list of integer arrays of shape ``(R,)`` with
          values in ``[0, L - 1]``, ``L`` being the number of foreground
          classes.
        * **scores**: A list of float arrays of shape ``(R,)`` giving the
          confidence of each prediction.

    Raises:
        TypeError: if ``sizes`` is ``None`` while ``visualize`` is false.
    """
    # Fail before touching the model state. Without this check the default
    # arguments reach zip(prepared_imgs, None) and die with an opaque
    # TypeError after the model has already been switched to eval mode.
    if not visualize and sizes is None:
        raise TypeError(
            "predict() requires 'sizes' when visualize is False")

    def _as_number(value):
        # Tensors become Python ints; plain numbers pass through unchanged.
        return int(value) if t.is_tensor(value) else value

    self.eval()
    # `sizes` is rebuilt from the raw images when visualize is set.
    if visualize:
        self.use_preset('visualize')
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]
            img = preprocess(at.tonumpy(img))
            prepared_imgs.append(img)
            sizes.append(size)
    else:
        prepared_imgs = imgs

    bboxes = list()
    labels = list()
    scores = list()
    for img, size in zip(prepared_imgs, sizes):
        # NOTE(review): `volatile` is a pre-0.4 PyTorch flag; on newer
        # versions use torch.no_grad() instead. Confirm the target version.
        img = t.autograd.Variable(at.totensor(img).float()[None],
                                  volatile=True)

        # Normalise the size entries into locals instead of assigning back
        # into `size` or `img.shape`, which may be immutable or owned by
        # the caller.
        ref_h = _as_number(size[0])
        ref_w = _as_number(size[1])
        scale = _as_number(img.shape[3]) / ref_w

        (px, py), roi_scores, rois, search_regions, _ = self(img, scale=scale)
        # We are assuming that batch size is 1.
        roi_score = roi_scores.data
        px = at.tonumpy(px.data)
        py = at.tonumpy(py.data)
        search_regions = at.tonumpy(at.totensor(search_regions) / scale)

        # Convert predictions to bounding boxes in image coordinates,
        # using px, py and the search regions to generate the boxes.
        cls_bbox = p2bbox(px, py, search_regions, threshold=prob_thre)
        cls_bbox = at.totensor(cls_bbox)
        # Clip bounding boxes to the image.
        cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=ref_h)
        cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=ref_w)

        prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

        raw_cls_bbox = at.tonumpy(cls_bbox)
        raw_prob = at.tonumpy(prob)

        bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
        bboxes.append(bbox)
        labels.append(label)
        scores.append(score)

    self.use_preset('evaluate')
    self.train()
    return bboxes, labels, scores
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate losses.

    Notation: ``N`` is the batch size and ``R`` the number of bounding
    boxes per image. Currently only ``N = 1`` is supported.

    Images without any ground-truth box are supported: both localisation
    losses are then zero and every anchor is labelled background. Tensors
    are moved to the GPU only when ``opt.use_cuda`` is set, and class
    re-weighting is applied only when ``opt.reduce_bg_weight`` is set.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images.
        bboxes (~torch.autograd.Variable): A batch of bounding boxes with
            shape ``(N, R, 4)``.
        labels (~torch.autograd.Variable): A batch of labels with shape
            ``(N, R)``. Background is excluded, so values lie in
            ``[0, L - 1]`` where ``L`` is the number of foreground classes.
        scale (float): Amount of scaling applied to the raw image during
            preprocessing.

    Returns:
        LossTuple: the RPN localisation loss, RPN classification loss,
        RoI localisation loss, RoI classification loss and their sum.

    Raises:
        ValueError: if the batch size is not 1.
    """
    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    features = self.faster_rcnn.extractor(imgs)

    rpn_locs, rpn_scores, rois, roi_indices, anchor = \
        self.faster_rcnn.rpn(features, img_size, scale)

    # Since batch size is one, convert variables to singular form.
    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]
    roi = rois

    # Sample RoIs and forward. It is fine to break the computation graph
    # of rois; they are treated as constant input.
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # All zeros because only batch size 1 is supported.
    sample_roi_index = t.zeros(len(sample_roi))
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        features,
        sample_roi,
        sample_roi_index)

    # ------------------ RPN losses -------------------#
    # Number of ground-truth boxes; a shapeless bbox counts as zero boxes
    # so the comparisons below always see an int.
    n_bbox = bbox.shape[0] if len(bbox.shape) > 0 else 0

    if n_bbox > 0:
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox),
            anchor,
            img_size)
        gt_rpn_label = at.tovariable(gt_rpn_label).long()
        gt_rpn_loc = at.tovariable(gt_rpn_loc)
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,
            gt_rpn_loc,
            gt_rpn_label.data,
            self.rpn_sigma)
    else:
        # No boxes: there is no RPN localisation loss and every anchor is
        # labelled 0 (background).
        rpn_loc_loss = t.tensor(0.)
        if opt.use_cuda:
            rpn_loc_loss = rpn_loc_loss.cuda()
        gt_rpn_label = t.zeros(anchor.shape[0], dtype=t.long)

    if opt.reduce_bg_weight:
        # Re-weight foreground / background for the case where identical
        # numbers could not be sampled. Count labels 0 and 1 explicitly:
        # the ignore label (-1) or one of the classes may be absent, and
        # the weight vector must always have one entry per class.
        rpn_label_np = gt_rpn_label.detach().cpu().numpy()
        fg_bg_count = np.bincount(
            rpn_label_np[rpn_label_np > -1], minlength=2)[:2]
        # An absent class never contributes to the loss; clamping its
        # count only avoids a division by zero.
        fg_bg_count = np.maximum(fg_bg_count, 1)
        rpn_class_weights = 1.0 / fg_bg_count
        rpn_class_weights = t.FloatTensor(
            rpn_class_weights / np.sum(rpn_class_weights) * 2)
    else:
        rpn_class_weights = None

    # Labels of -1 are ignored (the default ignore_index would be -100).
    if opt.use_cuda:
        rpn_cls_loss = F.cross_entropy(
            rpn_score,
            gt_rpn_label.cuda(),
            ignore_index=-1,
            weight=(rpn_class_weights.cuda()
                    if rpn_class_weights is not None else None))
    else:
        rpn_cls_loss = F.cross_entropy(
            rpn_score,
            gt_rpn_label,
            ignore_index=-1,
            weight=rpn_class_weights)

    # Feed only the non-ignored anchors to the RPN confusion meter.
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(_rpn_score, False),
                    _gt_rpn_label.data.long())

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # For every sampled RoI pick the 4 offsets predicted for its target
    # class.
    sample_axis = t.arange(0, n_sample).long()
    if opt.use_cuda:
        sample_axis = sample_axis.cuda()
    roi_loc = roi_cls_loc[sample_axis, at.totensor(gt_roi_label).long()]

    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    if n_bbox > 0:
        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)
    else:
        # No ground-truth boxes: no RoI localisation loss.
        roi_loc_loss = t.tensor(0.)
        if opt.use_cuda:
            roi_loc_loss = roi_loc_loss.cuda()

    if opt.reduce_bg_weight:
        # Background gets weight 1 / n_sample; each foreground class gets 1.
        bg_weight = 1.0 / gt_roi_label.size()[0]
        class_weights = t.FloatTensor(
            np.hstack([bg_weight, np.ones((self.n_fg_class,))]))
    else:
        class_weights = None

    if opt.use_cuda:
        roi_criterion = nn.CrossEntropyLoss(
            weight=class_weights.cuda() if class_weights is not None else None)
        roi_cls_loss = roi_criterion(roi_score, gt_roi_label.cuda())
    else:
        roi_criterion = nn.CrossEntropyLoss(weight=class_weights)
        roi_cls_loss = roi_criterion(roi_score, gt_roi_label)

    self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    sum_losses = sum(losses)
    losses = losses + [sum_losses]

    return LossTuple(*losses)
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate losses.

    Here are notations used.

    * :math:`N` is the batch size.
    * :math:`R` is the number of bounding boxes per image.

    Currently, only :math:`N=1` is supported.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images.
        bboxes (~torch.autograd.Variable): A batch of bounding boxes.
            Its shape is :math:`(N, R, 4)`.
        labels (~torch.autograd.Variable): A batch of labels.
            Its shape is :math:`(N, R)`. The background is excluded from
            the definition, which means that the range of the value
            is :math:`[0, L - 1]`. :math:`L` is the number of foreground
            classes.
        scale (float): Amount of scaling applied to
            the raw image during preprocessing.

    Returns:
        namedtuple of 5 losses: a ``LossTuple`` built from the RPN
        localization loss, the RPN classification loss, the RoI
        localization loss, the RoI classification loss and their sum,
        in that order.

    Raises:
        ValueError: If ``bboxes.shape[0]`` is not 1.
    """
    def _on_device_of(tensor, reference):
        # Follow the device of ``reference`` instead of hard-coding CUDA so
        # the same code also runs when the model lives on the CPU.
        return tensor.cuda() if reference.is_cuda else tensor.cpu()

    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    features = self.faster_rcnn.extractor(imgs)

    rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
        features, img_size, scale)
    # Author's notes (translated). Example shapes of
    # rpn_locs, rpn_scores, rois, roi_indices, anchor:
    #   (torch.Size([1, 18648, 4]), torch.Size([1, 18648, 2]),
    #    (1714, 4), (1714,), (18648, 4))
    # What the RPN does: for every image it uses the feature map to compute,
    # for each of the (H/16) x (W/16) x 9 (about 20000) anchors, the
    # probability of being foreground or background (rpn_scores) and the
    # predicted location correction (rpn_locs). It then keeps the 12000
    # anchors with the highest foreground probability, applies the regressed
    # corrections to obtain RoIs, and uses non-maximum suppression (NMS) to
    # keep the 2000 best RoIs. At inference time 12000 and 2000 become 6000
    # and 300 to speed things up.

    # Since batch size is one, convert variables to singular form
    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]
    roi = rois
    # Example shapes of bbox, label, rpn_score, rpn_loc, roi:
    #   (torch.Size([2, 4]), torch.Size([2]), torch.Size([16650, 2]),
    #    torch.Size([16650, 4]), (2000, 4))

    # Sample RoIs and forward
    # it's fine to break the computation graph of rois,
    # consider them as constant input
    # (`at` is array_tools; per the author's note the tensor-to-numpy
    # conversion is no longer needed as of PyTorch 0.4.)
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # Author's notes (translated). Example shapes of
    # sample_roi, gt_roi_loc, gt_roi_label: ((128, 4), (128, 4), (128,))
    # The RPN yields about 2000 RoIs, but not all of them are trained on:
    # ProposalTargetCreator picks 128 of them. RoIs whose IoU with a
    # ground-truth box is above 0.5 supply the positives (e.g. 32); RoIs
    # whose IoU is below 0.5 and at least 0 (or 0.1) supply the negatives
    # (e.g. 128 - 32 = 96). The gt_roi_loc of the selected RoIs is
    # normalized (mean subtracted, divided by std). The result is 128 RoIs
    # with their [ty, tx, th, tw] regression targets and labels.

    # NOTE it's all zero because now it only support for batch=1 now
    sample_roi_index = t.zeros(len(sample_roi))
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        features,
        sample_roi,
        sample_roi_index)
    # Author's notes (translated). Example shapes of the head inputs
    # x, rois, roi_indices:
    #   (torch.Size([1, 512, 37, 56]), (128, 4), torch.Size([128]))
    # The RoI head pools every one of the 128 RoIs, whatever its size, to
    # [512, 7, 7] (giving [128, 512, 7, 7]) and then applies linear layers
    # that map 512*7*7 = 25088 features to a 21-way class score and an
    # 84-way roi_cls_loc. Outputs are [128, 84] and [128, 21].

    # ------------------ RPN losses -------------------#
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox),
        anchor,
        img_size)
    # Author's notes (translated). AnchorTargetCreator computes the IoU
    # between every predefined anchor and the ground-truth boxes and labels
    # each anchor: 1 = positive, 0 = negative, -1 = ignored (excluded from
    # the loss). Positives plus negatives add up to self.n_sample with ratio
    # self.pos_ratio. Labelling rules:
    #   1. IoU < 0.3 -> negative.
    #   2. For every ground-truth object, the anchor(s) with the highest IoU
    #      are positive (several anchors may tie).
    #   3. Remaining anchors with IoU > 0.7 are positive as well.
    #   4. Positive/negative counts are then balanced.
    # It also computes the regression target of each anchor with respect to
    # its best-matching ground-truth box. Example shapes of label and loc:
    #   ((16650,), (16650, 4))
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)
    # Example shapes of gt_rpn_loc, gt_rpn_label: ((18648, 4), (18648,))
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_loc,
        gt_rpn_loc,
        gt_rpn_label.data,
        self.rpn_sigma)

    # NOTE: default value of ignore_index is -100 ...
    # ignore_index=-1 makes anchors labelled -1 contribute nothing to the
    # loss or its gradient.
    rpn_cls_loss = F.cross_entropy(
        rpn_score,
        _on_device_of(gt_rpn_label, rpn_score),
        ignore_index=-1)
    # Example shapes of rpn_score, gt_rpn_label:
    #   (torch.Size([15318, 2]), torch.Size([15318]))
    # The RPN confusion-meter update (self.rpn_cm) is disabled in this
    # variant, so the filtered score/label arrays that only fed it are no
    # longer computed.

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]  # e.g. 128
    # e.g. torch.Size([128, 84]) -> torch.Size([128, 21, 4])
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # Out of the per-class box regressions keep, for every sample, the one
    # that belongs to its ground-truth label (e.g. torch.Size([128, 4])).
    row_index = _on_device_of(t.arange(0, n_sample).long(), roi_cls_loc)
    class_index = _on_device_of(
        at.totensor(gt_roi_label).long(), roi_cls_loc)
    roi_loc = roi_cls_loc[row_index, class_index]
    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    roi_loc_loss = _fast_rcnn_loc_loss(
        roi_loc.contiguous(),
        gt_roi_loc,
        gt_roi_label.data,
        self.roi_sigma)

    roi_cls_loss = nn.CrossEntropyLoss()(
        roi_score, _on_device_of(gt_roi_label, roi_score))
    # The RoI confusion-meter update (self.roi_cm) is disabled in this
    # variant as well.

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]

    return LossTuple(*losses)
def predict(self, imgs, sizes=None, visualize=False):
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images.
            All images are in CHW and RGB format
            and the range of their value is :math:`[0, 255]`.
        sizes (iterable of (height, width), optional): Size of each image
            before preprocessing; used to compute the scale passed to the
            network and to clip the predicted boxes. Ignored when
            ``visualize`` is true. When omitted (and ``visualize`` is
            false) the size of each given image is used.
        visualize (bool): When true, switch to the ``'visualize'`` preset,
            run every image through ``preprocess`` and take the sizes from
            the raw images instead of ``sizes``.

    Returns:
        tuple of lists:
        This method returns a tuple of three lists,
        :obj:`(bboxes, labels, scores)`.

        * **bboxes**: A list of float arrays of shape :math:`(R, 4)`,
          where :math:`R` is the number of bounding boxes in a image.
          Each bounding box is organized by
          :math:`(y_{min}, x_{min}, y_{max}, x_{max})`
          in the second axis.
        * **labels** : A list of integer arrays of shape :math:`(R,)`.
          Each value indicates the class of the bounding box.
          Values are in range :math:`[0, L - 1]`, where :math:`L` is the
          number of the foreground classes.
        * **scores** : A list of float arrays of shape :math:`(R,)`.
          Each value indicates how confident the prediction is.

    Note:
        The model is switched to eval mode for the duration of the call.
        On exit (including when an exception is raised) the ``'evaluate'``
        preset is selected and the model is put back into training mode.
    """
    self.eval()
    bboxes = list()
    labels = list()
    scores = list()
    try:
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
            if sizes is None:
                # Without explicit sizes fall back to the size of the images
                # as given (scale == 1) instead of failing inside zip().
                prepared_imgs = list(imgs)
                sizes = [img.shape[1:] for img in prepared_imgs]

        for img, size in zip(prepared_imgs, sizes):
            # NOTE(review): `volatile=True` only disables autograd on
            # PyTorch < 0.4 and is ignored by later releases -- consider
            # `torch.no_grad()` once the minimum version is confirmed.
            img = t.autograd.Variable(at.totensor(img).float()[None],
                                      volatile=True)
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean)
            std = t.Tensor(self.loc_normalize_std)
            if roi_cls_loc.is_cuda:
                # Follow the network output instead of assuming CUDA.
                mean, std = mean.cuda(), std.cuda()
            mean = mean.repeat(self.n_class)[None]
            std = std.repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
                                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)
    finally:
        # Always leave the model in the state callers expect after predict(),
        # even when a prediction step raised.
        self.use_preset('evaluate')
        self.train()
    return bboxes, labels, scores
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate losses.

    Args:
        imgs: Batch of images; its shape is unpacked as ``(_, _, H, W)``.
        bboxes: Batch of ground-truth boxes; ``bboxes.shape[0]`` must be 1.
        labels: Batch of ground-truth labels, indexed like ``bboxes``.
        scale: Scale value forwarded to the region proposal network.

    Returns:
        ``LossTuple`` built from the RPN localization loss, the RPN
        classification loss, the RoI localization loss, the RoI
        classification loss and their sum, in that order.

    Raises:
        ValueError: If ``bboxes.shape[0]`` is not 1.
    """
    def _as_float(loss):
        # `.cpu()` keeps the debug print working for losses on a GPU.
        return float(loss.data.cpu().numpy())

    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    features = self.faster_rcnn.extractor(imgs)

    # Author's note: about 20000 anchors -> 12000 -> 2000 rois
    rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
        features, img_size, scale)

    # Since batch size is one, convert variables to singular form
    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]
    rpn_loc = rpn_locs[0]
    roi = rois

    # ProposalTargetCreator
    # Author's note (translated): when training the RoI head / Fast R-CNN,
    # 128 of the 2000 rois are selected for training.
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)

    # NOTE it's all zero because now it only support for batch=1 now
    sample_roi_index = t.zeros(len(sample_roi))
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        features, sample_roi, sample_roi_index)

    # ------------------ RPN losses -------------------#
    # AnchorTargetCreator
    # Author's note (translated): when training the RPN, 256 of the ~20000
    # anchors are selected so that positives and negatives are roughly 1:1;
    # the location regression targets are produced at the same time.
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox), anchor, img_size)

    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_loc, gt_rpn_loc, gt_rpn_label.data, self.rpn_sigma)

    # NOTE: default value of ignore_index is -100 ...
    rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label, ignore_index=-1)

    # ------------------ ROI losses (fast rcnn loss) -------------------#
    n_sample = roi_cls_loc.shape[0]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # For every sample keep the regression of its ground-truth class.
    roi_loc = roi_cls_loc[t.arange(0, n_sample).long(),
                          at.totensor(gt_roi_label).long()]

    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    roi_loc_loss = _fast_rcnn_loc_loss(
        roi_loc.contiguous(), gt_roi_loc, gt_roi_label.data, self.roi_sigma)

    roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label)

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]

    # Author's note (translated): the averaged error is not shown here.
    # The 'roiLoc' entry reports roi_loc_loss (it used to repeat
    # rpn_loc_loss under the wrong label).
    print('rpnLoc:', _as_float(rpn_loc_loss),
          ' rpnCls:', _as_float(rpn_cls_loss),
          ' roiLoc:', _as_float(roi_loc_loss),
          ' roiCls:', _as_float(roi_cls_loss))

    losses = losses + [sum(losses)]

    return LossTuple(*losses)
repeat(self.n_class)[None] std = t.Tensor(self.loc_normalize_std).cuda(). \ repeat(self.n_class)[None] roi_cls_loc = (roi_cls_loc * std + mean) roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4) roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc) cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)), at.tonumpy(roi_cls_loc).reshape((-1, 4))) cls_bbox = at.totensor(cls_bbox) cls_bbox = cls_bbox.view(-1, self.n_class * 4) # clip bounding box cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0]) cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1]) prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1)) raw_cls_bbox = at.tonumpy(cls_bbox) raw_prob = at.tonumpy(prob) bbox, label, score = self._suppress(raw_cls_bbox, raw_prob) bboxes.append(bbox) labels.append(label) scores.append(score) self.use_preset('evaluate') self.train() return bboxes, labels, scores def get_optimizer(self): """
def predict(self, imgs, sizes=None, visualize=False):
    """Detect objects from images.

    This method predicts objects for each image.

    Args:
        imgs: Iterable of images. With ``visualize`` set they are treated
            as raw images and passed through ``preprocess``; otherwise
            they are used as given.
        sizes (iterable of (height, width), optional): Size of each image
            before preprocessing; used to compute the scale passed to the
            network and to clip the predicted boxes. Ignored when
            ``visualize`` is true. When omitted (and ``visualize`` is
            false) the size of each given image is used.
        visualize (bool): When true, switch from the ``'evaluate'`` preset
            to the ``'visualize'`` preset and derive ``sizes`` from the raw
            images.

    Returns:
        tuple of lists: ``(bboxes, labels, scores)`` with one entry per
        image, each entry being what ``self._suppress`` returned for that
        image.

    Note:
        The model is switched to eval mode for the duration of the call.
        On exit (including when an exception is raised) the ``'evaluate'``
        preset is selected and the model is put back into training mode.
    """
    self.eval()
    bboxes = list()
    labels = list()
    scores = list()
    try:
        self.use_preset('evaluate')
        if visualize:
            self.use_preset('visualize')
            prepared_imgs = list()
            sizes = list()
            for img in imgs:
                size = img.shape[1:]
                img = preprocess(at.tonumpy(img))
                prepared_imgs.append(img)
                sizes.append(size)
        else:
            prepared_imgs = imgs
            if sizes is None:
                # Without explicit sizes fall back to the size of the images
                # as given (scale == 1) instead of failing inside zip().
                prepared_imgs = list(imgs)
                sizes = [img.shape[1:] for img in prepared_imgs]

        for img, size in zip(prepared_imgs, sizes):
            # NOTE(review): `volatile=True` only disables autograd on
            # PyTorch < 0.4 and is ignored by later releases -- consider
            # `torch.no_grad()` once the minimum version is confirmed.
            img = t.autograd.Variable(at.totensor(img).float()[None],
                                      volatile=True)
            scale = img.shape[3] / size[1]
            roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
            # We are assuming that batch size is 1.
            roi_score = roi_scores.data
            roi_cls_loc = roi_cls_loc.data
            roi = at.totensor(rois) / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            mean = t.Tensor(self.loc_normalize_mean)
            std = t.Tensor(self.loc_normalize_std)
            if roi_cls_loc.is_cuda:
                # Follow the network output instead of assuming CUDA.
                mean, std = mean.cuda(), std.cuda()
            mean = mean.repeat(self.n_class)[None]
            std = std.repeat(self.n_class)[None]

            roi_cls_loc = (roi_cls_loc * std + mean)
            roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
            roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(
                at.tonumpy(roi).reshape((-1, 4)),
                at.tonumpy(roi_cls_loc).reshape((-1, 4)))
            cls_bbox = at.totensor(cls_bbox)
            cls_bbox = cls_bbox.view(-1, self.n_class * 4)
            # clip bounding box
            cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
            cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])

            prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1))

            raw_cls_bbox = at.tonumpy(cls_bbox)
            raw_prob = at.tonumpy(prob)

            bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(score)
    finally:
        # Always leave the model in the state callers expect after predict(),
        # even when a prediction step raised.
        self.use_preset('evaluate')
        self.train()
    return bboxes, labels, scores
def forward(self, imgs, bboxes, labels, scale):
    """Forward Faster R-CNN and calculate losses.

    Here are notations used.

    * :math:`N` is the batch size.
    * :math:`R` is the number of bounding boxes per image.

    Currently, only :math:`N=1` is supported.

    Args:
        imgs (~torch.autograd.Variable): A variable with a batch of images.
        bboxes (~torch.autograd.Variable): A batch of bounding boxes.
            Its shape is :math:`(N, R, 4)`.
        labels (~torch.autograd.Variable): A batch of labels.
            Its shape is :math:`(N, R)`. The background is excluded from
            the definition, which means that the range of the value
            is :math:`[0, L - 1]`. :math:`L` is the number of foreground
            classes.
        scale (float): Amount of scaling applied to
            the raw image during preprocessing.

    Returns:
        namedtuple of 5 losses: a ``LossTuple`` built from the RPN
        localization loss, the RPN classification loss, the RoI
        localization loss, the RoI classification loss and their sum,
        in that order.

    Raises:
        ValueError: If ``bboxes.shape[0]`` is not 1.
    """
    def _on_device_of(tensor, reference):
        # Follow the device of ``reference`` instead of hard-coding CUDA so
        # the same code also runs when the model lives on the CPU.
        return tensor.cuda() if reference.is_cuda else tensor.cpu()

    n = bboxes.shape[0]
    if n != 1:
        raise ValueError('Currently only batch size 1 is supported.')

    _, _, H, W = imgs.shape
    img_size = (H, W)

    # Author's note (translated): the extractor here is the first 10 layers
    # of VGG16; it produces the feature map.
    features = self.faster_rcnn.extractor(imgs)

    # ------------------ RPN network -------------------#
    # ------------------ RPN prediction -------------------#
    # Author's notes (translated). The RPN proposes the RoIs:
    #   rpn_locs:    per-anchor box corrections, [1, 9*hh*ww, 4]
    #   rpn_scores:  per-anchor object / not-object scores, [1, 9*hh*ww, 2]
    #   rois:        proposals from the RPN, about 2000 while training,
    #                [2000, 4]
    #   roi_indices: index of the image each roi belongs to within the
    #                batch; all zeros because only batch size 1 is
    #                supported, and not used below
    # rpn_locs and rpn_scores feed the RPN losses; rois are handed to the
    # head for classification. Location and class are predicted for every
    # one of the 9*hh*ww anchors.
    rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(
        features, img_size, scale)

    # Since batch size is one, convert variables to singular form
    bbox = bboxes[0]
    label = labels[0]
    rpn_score = rpn_scores[0]  # [n_anchor, 2]
    rpn_loc = rpn_locs[0]  # [n_anchor, 4]
    roi = rois

    # ------------------ RPN targets -------------------#
    # Author's notes (translated): because the RPN predicts for all
    # 9*hh*ww anchors, gt_rpn_loc and gt_rpn_label hold a value for every
    # anchor, but only 256 sampled positives/negatives take part in the
    # loss. Positives are labelled 1, negatives 0, and invalid or ignored
    # anchors -1; the loss computation tells them apart.
    gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
        at.tonumpy(bbox), anchor, img_size)
    gt_rpn_label = at.tovariable(gt_rpn_label).long()
    gt_rpn_loc = at.tovariable(gt_rpn_loc)

    # ------------------ RPN losses -------------------#
    # Localization (regression) loss. Author's note: only positive samples
    # contribute to it.
    rpn_loc_loss = _fast_rcnn_loc_loss(
        rpn_loc, gt_rpn_loc, gt_rpn_label.data, self.rpn_sigma)

    # Classification loss (two classes); anchors labelled -1 are skipped
    # through ignore_index.
    rpn_cls_loss = F.cross_entropy(
        rpn_score,
        _on_device_of(gt_rpn_label, rpn_score),
        ignore_index=-1)
    # Feed only the non-ignored anchors to the RPN confusion meter.
    _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
    _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
    self.rpn_cm.add(at.totensor(_rpn_score, False),
                    _gt_rpn_label.data.long())

    # ------------------ ROI network -------------------#
    # ------------------ ROI targets -------------------#
    # Sample RoIs and forward
    # it's fine to break the computation graph of rois,
    # consider them as constant input
    # Author's notes (translated): a number of positive and negative rois
    # are sampled to train the RoI head (R-CNN) classifier.
    #   gt_roi_loc:   box corrections, i.e. the second regression of the
    #                 box position
    #   gt_roi_label: N + 1 classes, the extra one being background
    sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
        roi,
        at.tonumpy(bbox),
        at.tonumpy(label),
        self.loc_normalize_mean,
        self.loc_normalize_std)
    # NOTE it's all zero because now it only support for batch=1 now
    sample_roi_index = t.zeros(len(sample_roi))

    # ------------------ ROI prediction -------------------#
    # Not every roi needs a prediction, so the head only runs on the rois
    # sampled above; it returns their class scores (roi_score) and box
    # corrections (roi_cls_loc).
    roi_cls_loc, roi_score = self.faster_rcnn.head(
        features, sample_roi, sample_roi_index)

    n_sample = roi_cls_loc.shape[0]
    # Author's note: [n_sample, n_class + 1, 4]
    roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
    # roi_cls_loc holds a box prediction per class, but the loss only needs
    # the prediction for each sample's ground-truth class: [n_sample, 4].
    row_index = _on_device_of(t.arange(0, n_sample).long(), roi_cls_loc)
    class_index = _on_device_of(
        at.totensor(gt_roi_label).long(), roi_cls_loc)
    roi_loc = roi_cls_loc[row_index, class_index]
    gt_roi_label = at.tovariable(gt_roi_label).long()
    gt_roi_loc = at.tovariable(gt_roi_loc)

    # Localization (regression) loss
    roi_loc_loss = _fast_rcnn_loc_loss(
        roi_loc.contiguous(), gt_roi_loc, gt_roi_label.data, self.roi_sigma)

    # Classification loss (author's note: 21 classes here)
    roi_cls_loss = nn.CrossEntropyLoss()(
        roi_score, _on_device_of(gt_roi_label, roi_score))

    self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

    losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
    losses = losses + [sum(losses)]

    return LossTuple(*losses)