Example #1
    def forward(self, im_data, im_info, gt_boxes, num_boxes):
        batch_size = im_data.size(0)

        im_info = im_info.data
        gt_boxes = gt_boxes.data
        num_boxes = num_boxes.data

        # feed image data to base model to obtain base feature map
        base_feat = self.RCNN_base(im_data)

        # feed base feature map to RPN to obtain rois
        rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
            base_feat, im_info, gt_boxes, num_boxes)

        # if in the training phase, use ground-truth bboxes for refining
        if self.training:
            roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
            rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

            rois_label = Variable(rois_label.view(-1).long())
            rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
            rois_inside_ws = Variable(
                rois_inside_ws.view(-1, rois_inside_ws.size(2)))
            rois_outside_ws = Variable(
                rois_outside_ws.view(-1, rois_outside_ws.size(2)))
        else:
            rois_label = None
            rois_target = None
            rois_inside_ws = None
            rois_outside_ws = None
            rpn_loss_cls = 0
            rpn_loss_bbox = 0

        rois = Variable(rois)  # shape: (batch_size, num_rois, 5), e.g. (5, 128, 5)
        # do roi pooling based on predicted rois

        if cfg.POOLING_MODE == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            grid_xy = _affine_grid_gen(rois.view(-1, 5),
                                       base_feat.size()[2:], self.grid_size)
            grid_yx = torch.stack(
                [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
                3).contiguous()
            pooled_feat = self.RCNN_roi_crop(base_feat,
                                             Variable(grid_yx).detach())
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
        elif cfg.POOLING_MODE == 'align':
            pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
        elif cfg.POOLING_MODE == 'pool':
            pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

        # pooled_feat: (batch_size*num_rois, channels, pooled_h, pooled_w), e.g. (640, 1024, 7, 7)
        # after _head_to_tail below: (batch_size*num_rois, channels), e.g. (640, 2048)

        # feed pooled features to top model
        pooled_feat = self._head_to_tail(pooled_feat)

        # compute bbox offset
        bbox_pred = self.RCNN_bbox_pred(pooled_feat)
        if self.training and not self.class_agnostic:
            # select the corresponding columns according to roi labels
            bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                            int(bbox_pred.size(1) / 4), 4)
            bbox_pred_select = torch.gather(
                bbox_pred_view, 1,
                rois_label.view(rois_label.size(0), 1,
                                1).expand(rois_label.size(0), 1, 4))
            bbox_pred = bbox_pred_select.squeeze(1)

        # compute object classification probability
        cls_score = self.RCNN_cls_score(pooled_feat)
        cls_prob = F.softmax(cls_score, 1)

        RCNN_loss_cls = 0
        RCNN_loss_bbox = 0

        if self.training:
            # classification loss
            RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

            # bounding box regression L1 loss
            RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                             rois_inside_ws, rois_outside_ws)

        cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
        bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

        return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
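
The torch.gather step above is the subtle part of this forward pass: with class-specific regression the head predicts 21*4 deltas per RoI, and only the 4 belonging to the RoI's target class are kept. A minimal runnable sketch with toy sizes (names invented, not from the repo):

import torch

N, num_classes = 3, 21                       # 3 RoIs, 21 classes (VOC + background)
bbox_pred = torch.randn(N, num_classes * 4)  # class-specific deltas, shape (3, 84)
rois_label = torch.tensor([2, 0, 7])         # target class of each RoI

view = bbox_pred.view(N, num_classes, 4)            # (3, 21, 4)
idx = rois_label.view(N, 1, 1).expand(N, 1, 4)      # (3, 1, 4) index tensor
selected = torch.gather(view, 1, idx).squeeze(1)    # (3, 4)

assert torch.equal(selected[0], view[0, 2])  # RoI 0 keeps its class-2 deltas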
Example #2
    def _PyramidRoI_Feat(self, feat_maps, rois, im_info):
        ''' roi pool on pyramid feature maps'''
        # do roi pooling based on predicted rois
        img_area = im_info[0][0] * im_info[0][1]
        h = rois.data[:, 4] - rois.data[:, 2] + 1
        w = rois.data[:, 3] - rois.data[:, 1] + 1
        roi_level = torch.log(torch.sqrt(h * w) / 224.0) / np.log(2)
        roi_level = torch.floor(roi_level + 4)
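        # This is Eq. (1) of the FPN paper: k = floor(k0 + log2(sqrt(w*h)/224))
        # with k0 = 4, so a 224x224 RoI maps to P4 and larger RoIs go to coarser
        # levels; the clamp below restricts k to the available levels [2, 5].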
        # --------
        # roi_level = torch.log(torch.sqrt(h * w) / 224.0)
        # roi_level = torch.round(roi_level + 4)
        # ------
        roi_level[roi_level < 2] = 2
        roi_level[roi_level > 5] = 5
        # roi_level.fill_(5)
        if cfg.POOLING_MODE == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            # NOTE: need to add pyramid
            grid_xy = _affine_grid_gen(rois,
                                       feat_maps.size()[2:],
                                       self.grid_size)  ##
            grid_yx = torch.stack(
                [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
                3).contiguous()
            roi_pool_feat = self.RCNN_roi_crop(feat_maps,
                                               Variable(grid_yx).detach())  ##
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                roi_pool_feat = F.max_pool2d(roi_pool_feat, 2, 2)

        elif cfg.POOLING_MODE == 'align':
            roi_pool_feats = []
            box_to_levels = []
            for i, l in enumerate(range(2, 6)):
                if (roi_level == l).sum() == 0:
                    continue
                idx_l = (roi_level == l).nonzero().squeeze()
                box_to_levels.append(idx_l)
                scale = feat_maps[i].size(2) / im_info[0][0]
                feat = self.RCNN_roi_align(feat_maps[i], rois[idx_l], scale)
                roi_pool_feats.append(feat)
            roi_pool_feat = torch.cat(roi_pool_feats, 0)
            box_to_level = torch.cat(box_to_levels, 0)
            idx_sorted, order = torch.sort(box_to_level)
            roi_pool_feat = roi_pool_feat[order]

        elif cfg.POOLING_MODE == 'pool':
            roi_pool_feats = []
            box_to_levels = []
            for i, l in enumerate(range(2, 6)):
                if (roi_level == l).sum() == 0:
                    continue
                idx_l = (roi_level == l).nonzero().squeeze()
                box_to_levels.append(idx_l)
                scale = feat_maps[i].size(2) / im_info[0][0]
                feat = self.RCNN_roi_pool(feat_maps[i], rois[idx_l], scale)
                roi_pool_feats.append(feat)
            roi_pool_feat = torch.cat(roi_pool_feats, 0)
            box_to_level = torch.cat(box_to_levels, 0)
            idx_sorted, order = torch.sort(box_to_level)
            roi_pool_feat = roi_pool_feat[order]

        return roi_pool_feat
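
The 'align' and 'pool' branches above share a scatter/gather pattern worth spelling out: RoIs are pooled level by level, so the concatenated result is grouped by level, and torch.sort over the remembered RoI indices restores the original order. A runnable toy sketch (tensor contents invented, the RoIAlign call replaced by a stand-in):

import torch

roi_level = torch.tensor([3, 5, 2, 3])        # pyramid level assigned to 4 RoIs
feats, box_to_levels = [], []
for l in range(2, 6):
    idx_l = (roi_level == l).nonzero().view(-1)
    if idx_l.numel() == 0:
        continue
    box_to_levels.append(idx_l)
    # stand-in for self.RCNN_roi_align(feat_maps[i], rois[idx_l], scale):
    feats.append(idx_l.float().view(-1, 1))   # one fake "feature" per RoI
pooled = torch.cat(feats, 0)                  # grouped by level, order lost
box_to_level = torch.cat(box_to_levels, 0)    # original index of each row
_, order = torch.sort(box_to_level)
pooled = pooled[order]                        # back in original RoI order

assert pooled.view(-1).tolist() == [0.0, 1.0, 2.0, 3.0]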
Example #3
    def forward(self, im_data, im_info, gt_boxes, num_boxes):
        '''
        :param im_data: shape=(b,3,W,H)
        :param im_info: shape=(b,3), 3=[W,H,2.2901]; the meaning of the last value (2.2901) is still not entirely clear to me (probably the image resize scale)
        :param gt_boxes: shape=(b,20,5). Not every image really has 20 objects; 20 is just a generously fixed upper bound, so not all 20 rows hold gt data: for an image with n gt boxes the first n rows are real and the remaining 20-n rows are all zeros
        :param num_boxes: shape=(b), e.g. [k,j,...], meaning the first image in the batch has k gt boxes, the second has j, and so on
        :return:
        '''
        batch_size = im_data.size(0)

        im_info = im_info.data
        gt_boxes = gt_boxes.data
        num_boxes = num_boxes.data

        # feed image data to base model to obtain base feature map
        base_feat = self.RCNN_base(im_data)
        # base_feat.shape=(b,1024,w,h), where w and h are 1/16 of the input size
        '''At this point we have the output of the backbone feature extractor.'''

        # feed base feature map to RPN to obtain rois
        rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)
        '''
        rois = output, shape=(b,2000,5), 5 = [index of the image within the batch (0..b-1), x1, y1, x2, y2]
        rpn_loss_cls: the RPN classification loss, computed only over the sampled positives and negatives
            (from the matched labels, take the indices of the positive and negative samples, pull out
            their predicted scores and their matched labels (1/0), and compute the cross-entropy between them)
        rpn_loss_bbox: a scalar (e.g. 2.36), the regression loss averaged over the images in the batch

        At this point the RPN is done: it has produced 2000 proposals and the
        classification and regression losses over all anchors.
        '''

        # if in the training phase, use ground-truth bboxes for refining
        if self.training:
            roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
            # rois.shape=(b,2000,5), 5 = [index of the image within the batch, x1, y1, x2, y2]
            # gt_boxes.shape=(b,20,5); 20 is a fixed upper bound, not every image really has 20 objects
            # num_boxes.shape=(b): number of ground-truth boxes in each image

            rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data
            '''
            rois.shape=(b,128,5): the 128 RoIs kept per image (sampled from the 2000 RPN proposals);
                the first of the 5 values is the index i of the image within the batch
            rois_label.shape=(b,128): class target of each sampled RoI, i.e. the class of the gt box
                with the highest IoU (the actual class, not just foreground/background)
            rois_target.shape=(b,128,4): regression targets for positives and negatives (the negatives'
                targets are zeroed in self._get_bbox_regression_labels_pytorch)
            rois_inside_ws.shape=(b,128,4): [1,1,1,1] for positives, [0,0,0,0] for negatives
            rois_outside_ws.shape=(b,128,4): [1,1,1,1] for positives, [0,0,0,0] for negatives
            '''

            rois_label = Variable(rois_label.view(-1).long())  # shape=(b*128): class label of each RoI fed to the RCNN head
            rois_target = Variable(rois_target.view(-1, rois_target.size(2)))  # shape=(b*128,4): regression target of each RoI
            rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))  # shape=(b*128,4): inside weights (see the loss formula); [1,1,1,1] for positives, [0,0,0,0] for negatives
            rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))  # shape=(b*128,4): outside weights (see the loss formula); same convention
        else:
            rois_label = None
            rois_target = None
            rois_inside_ws = None
            rois_outside_ws = None
            rpn_loss_cls = 0
            rpn_loss_bbox = 0

        rois = Variable(rois)
        # train: rois.shape=(b,128,5), the 128 RoIs sampled from the 2000 RPN proposals; the first value is the image index within the batch
        # test:  rois.shape=(b,2000,5), 5 = [image index within the batch, x1, y1, x2, y2]


        # do roi pooling based on predicted rois
        '''
        The RoI features are actually extracted with 'align' here (cfg.POOLING_MODE defaults to
        'crop', but it is apparently overridden to 'align' somewhere earlier).
        '''
        if cfg.POOLING_MODE == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size)
            # base_feat.shape=(b,1024,w,h), w and h are 1/16 of the input size; base_feat.size()[2:] = [w,h]
            grid_yx = torch.stack([grid_xy.data[:,:,:,1], grid_xy.data[:,:,:,0]], 3).contiguous()
            pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
        elif cfg.POOLING_MODE == 'align':  # this is the branch that actually runs
            # base_feat.shape=(b,1024,w,h), w and h are 1/16 of the input size
            # train: rois.view(-1,5).shape=(b*128,5)   test: rois.view(-1,5).shape=(b*2000,5)
            pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
            # train: pooled_feat.shape=(b*128,1024,7,7)   test: pooled_feat.shape=(b*2000,1024,7,7)
        elif cfg.POOLING_MODE == 'pool':
            pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1,5))

        # feed pooled features to top model
        '''Considering only training here: pooled_feat.shape=(b*128,1024,7,7)'''
        pooled_feat = self._head_to_tail(pooled_feat)  # the parent class (faster rcnn) calls the subclass's (class VGG) head function here
        # pooled_feat.shape=(b*128,2048)

        # compute bbox offset
        bbox_pred = self.RCNN_bbox_pred(pooled_feat)  # bbox_pred.shape=(b*128,21*4)
        # during training self.class_agnostic is False, i.e. regression values are predicted per class
        if self.training and not self.class_agnostic:
            # select the corresponding columns according to roi labels
            bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)  # (b*128,21,4)
            # rois_label.shape=(b*128): class label of each RoI fed to the RCNN head
            bbox_pred_select = torch.gather(bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
            # bbox_pred_select.shape=(b*128,1,4): from the per-class predictions (21,4), pick the regression values of each RoI's target class
            bbox_pred = bbox_pred_select.squeeze(1)
            # bbox_pred.shape=(b*128,4)

        '''
        train: bbox_pred.shape=(b*128,4)
        test:  bbox_pred.shape=(b*2000,21*4)

        At this point we have, for each of the 128 RoIs, the regression values predicted by the
        RCNN head for its target class (i.e. if a RoI's class target is "car", the car slice is
        taken out of the 21*4 outputs).
        '''

        # compute object classification probability

        # pooled_feat.shape=(b*128,2048)
        cls_score = self.RCNN_cls_score(pooled_feat)  # shape=(b*128,21): predicted score for each class
        cls_prob = F.softmax(cls_score, 1)  # shape=(b*128,21): predicted probability of each class

        RCNN_loss_cls = 0
        RCNN_loss_bbox = 0

        if self.training:
            # classification loss
            RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)

            # bounding box regression L1 loss
            RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)
            # bbox_pred.shape=(b*128,4): predicted regression values of each RoI (for its target class)
            # rois_target.shape=(b*128,4): regression target of each RoI
            # rois_inside_ws.shape=(b*128,4): inside weights (see the loss formula); [1,1,1,1] for positives, [0,0,0,0] for negatives
            # rois_outside_ws.shape=(b*128,4): outside weights (see the loss formula); same convention


        cls_prob = cls_prob.view(batch_size, rois.size(1), -1)  # train: (b,128,21); test: (b,2000,21)
        bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)  # train: (b,128,4); test: (b,2000,21*4)

        return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
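
The comments above keep pointing at "the loss formula". Here is a hedged sketch of the weighted smooth L1 loss that _smooth_l1_loss presumably implements (the usual faster-rcnn.pytorch form with sigma=1 is assumed): the inside weights zero out the negatives' coordinate differences, and the outside weights scale each RoI's contribution.

import torch

def smooth_l1_loss(bbox_pred, bbox_targets, inside_w, outside_w, sigma=1.0):
    # inside_w is [1,1,1,1] for positives and [0,0,0,0] for negatives, so
    # negative RoIs contribute nothing to the regression loss
    sigma2 = sigma ** 2
    diff = inside_w * (bbox_pred - bbox_targets)
    abs_diff = diff.abs()
    quad = (abs_diff < 1.0 / sigma2).float()  # 1 inside the quadratic zone
    per_coord = quad * 0.5 * sigma2 * diff ** 2 \
        + (1.0 - quad) * (abs_diff - 0.5 / sigma2)
    return (outside_w * per_coord).sum(1).mean()

# toy check: rows 2 and 3 act as negatives and are masked out by the weights
pred, target = torch.randn(4, 4), torch.zeros(4, 4)
w = torch.tensor([[1.] * 4, [1.] * 4, [0.] * 4, [0.] * 4])
print(smooth_l1_loss(pred, target, w, w))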
Example #4
    def forward(self, x, targets):
        """Applies network layers and ops on input image(s) x.

        Args:
            x: input image or batch of images. Shape: [batch,3,300,300].

        Return:
            Depending on phase:
            test:
                Variable(tensor) of output class label predictions,
                confidence score, and corresponding location predictions for
                each object detected. Shape: [batch,topk,7]

            train:
                list of concat outputs from:
                    1: confidence layers, Shape: [batch*num_priors,num_classes]
                    2: localization layers, Shape: [batch,num_priors*4]
                    3: priorbox layers, Shape: [2,num_priors*4]
        """
        sources = list()
        loc = list()
        conf = list()
        batch_size = x.data.size(0)
        print('input image size: ', x.size())
        display_img = x[0].clone().cpu().numpy().transpose((1, 2, 0))
        print('display_img size: ', display_img.shape)

        # apply vgg up to conv4_3 relu
        for k in range(23):
            x = self.vgg[k](x)

        s = self.L2Norm(x)
        sources.append(s)

        # apply vgg up to fc7
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # apply extra layers and cache source layer outputs
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # apply multibox head to source layers
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

        roi_feat = sources[1]
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        if self.phase == "vid_train":
            """
            output = self.detect(
                loc.view(loc.size(0), -1, 4),                   # loc preds
                self.softmax(conf.view(conf.size(0), -1, self.num_classes)),                # conf preds
                #conf.view(conf.size(0), -1, self.num_classes),
                self.priors.type(type(x.data)),                 # default boxes
                targets
            )
            """
            output = self.detect(
                loc.view(loc.size(0), -1, 4),  # loc preds
                self.softmax(conf.view(conf.size(0), -1,
                                       self.num_classes)),  # conf preds
                #conf.view(conf.size(0), -1, self.num_classes),
                self.priors.type(type(x.data)),  # default boxes
            )
        else:
            output = (loc.view(loc.size(0), -1,
                               4), conf.view(conf.size(0), -1,
                                             self.num_classes), self.priors)
            # print('output size;', output.size())
            # print('output value: ', output[0], output[1])
            return output  # temporary return

        #rois, loc, conf, loc_t, conf_t = output  # rois size: batchsize, top_k, 5
        rois, loc, conf, priors = output
        # print('after transform conf_t: ',conf_t)

        #   print('display rois values: \n',rois[0,0:4,:])
        img_scale = torch.Tensor([
            self.img_shape[1], self.img_shape[0], self.img_shape[1],
            self.img_shape[0]
        ])

        #bboxes = rois[0,:,1:].clone()*img_scale
        #  print('display scaled rois values: \n',bboxes[0:4,:])
        #  ax1 = None
        #  ax2 = None
        #  plot_image(display_img, ax=ax1, reverse_rgb=False)
        #  plot_bbox(display_img, bboxes[:5,:].cpu(), ax=ax2)
        #  plt.show()
        rois[:, :, 1:] = rois[:, :, 1:] * img_scale
        # print('display scaled rois values: \n',rois.size(),rois[:,0:4,:])
        if self.cfg['POOLING_MODE'] == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            grid_xy = _affine_grid_gen(rois.view(-1, 5),
                                       roi_feat.size()[2:], self.grid_size)
            grid_yx = torch.stack(
                [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
                3).contiguous()
            pooled_feat = self.roi_crop(roi_feat, Variable(grid_yx).detach())
            if self.cfg['CROP_RESIZE_WITH_MAX_POOL']:
                pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
        elif self.cfg['POOLING_MODE'] == 'align':
            pooled_feat = self.roi_align(roi_feat, rois.view(-1, 5))
        elif self.cfg['POOLING_MODE'] == 'pool':
            pooled_feat = self.roi_pool(roi_feat, rois.view(-1, 5))

        #  print('after roi feature size: ',pooled_feat.size())
        # NOTE: building these layers inside forward() re-initializes their weights
        # on every call and hides their parameters from the optimizer and .cuda();
        # they should be created once in __init__ (see the sketch after this example).
        Con1_1 = nn.Conv2d(1024, 1, kernel_size=1, padding=0, dilation=1)
        pooled_feat = Con1_1(pooled_feat)
        #  print('after conv1_1 feature size: ',pooled_feat.size())
        scale = L2Norm(1, 20)
        normlize_feat = scale(pooled_feat)
        #  print('normlize feature size: ', normlize_feat.size())
        feat = normlize_feat.view(normlize_feat.size(0), normlize_feat.size(1),
                                  -1)
        #feat = pooled_feat.view(pooled_feat.size(0), pooled_feat.size(1), -1)
        #  print('after reshape feat size: ',feat.size())
        # use integer division: feat.size(0)/batch_size is a float in Python 3
        feat = feat.squeeze().view(batch_size, feat.size(0) // batch_size, -1)
        #  print('after slice feat size: ',feat.size())

        #   print('loc size: ',loc.size(), '\nconf size: ',conf.size())
        stacked_tensor = torch.cat((conf, loc, feat), -1)
        #  print('stacked_tensor size: ',stacked_tensor.size(),'\n',stacked_tensor[:,:2,:])
        o1, _ = self.bnlstm1(stacked_tensor)
        print('output1 size: ', o1.size())
        #print('hidden1 size: ',len(h1),h1[0].size(),h1[1].size())
        o2, _ = self.bnlstm2(o1)
        print('output2 size: ', o2.size())
        #print('hidden2 size: ',len(h2),h2[0].size(),h2[1].size())
        cls_pred = self.cls_pred(o2)
        print('cls_pred size: ', cls_pred.size())
        bbox_pred = self.bbox_pred(o2)
        print('bbox_pred size: ', bbox_pred.size())
        association_pred = self.association_pred(o2)
        print('association_pred size: ', association_pred.size())
        #loc_t, conf_t
        #print('loc_t size: ', loc_t.size())
        # print('conf_t size: ', conf_t.size())
        print('conf size: ', conf.size())
        # loc_loss, cls_loss = self.MultiProjectLoss(cls_pred, bbox_pred, association_pred, loc_t, conf_t)
        ## print('loc_loss size: ',loc_loss.size())
        ## print('cls_loss size: ',cls_loss.size())
        #   pooled_feat = pooled_feat.view(pooled_feat.size(0), pooled_feat.size(1), -1)
        print('output priors size: ', priors.size())
        return bbox_pred, cls_pred, self.priors
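
As flagged in the NOTE inside this forward(), Con1_1 and the L2Norm scale are rebuilt on every call, so their weights are re-initialized each forward pass and never reach the optimizer. A minimal sketch of the idiomatic fix (the module and the L2Norm stand-in are invented here, not the repo's code): declare the layers once in __init__.

import torch
import torch.nn as nn
import torch.nn.functional as F

class L2Norm(nn.Module):
    # minimal stand-in for the SSD-style L2Norm layer: scale * x / ||x||_2 over channels
    def __init__(self, n_channels, scale):
        super(L2Norm, self).__init__()
        self.weight = nn.Parameter(torch.full((n_channels,), float(scale)))

    def forward(self, x):
        return self.weight.view(1, -1, 1, 1) * F.normalize(x, p=2, dim=1)

class RoIFeatSqueeze(nn.Module):
    # hypothetical module holding the 1x1 conv + L2Norm used after RoI pooling,
    # created once so the parameters are registered, trained, and moved by .cuda()
    def __init__(self):
        super(RoIFeatSqueeze, self).__init__()
        self.conv1_1 = nn.Conv2d(1024, 1, kernel_size=1)
        self.scale = L2Norm(1, 20)

    def forward(self, pooled_feat):
        return self.scale(self.conv1_1(pooled_feat))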
Example #5
    def ext_feat(self, im_data, im_info, gt_boxes, num_boxes, use_rpn=True):
        batch_size = im_data.size(0)

        im_info = im_info.data
        gt_boxes = gt_boxes.data
        num_boxes = num_boxes.data

        # feed image data to base model to obtain base feature map
        base_feat = self.RCNN_base(im_data)

        if use_rpn:
            # feed base feature map to RPN to obtain rois
            rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
                base_feat, im_info, gt_boxes, num_boxes)

            # if in the training phase, use ground-truth bboxes for refining
            if self.training:
                roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
                rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

                rois_label = Variable(rois_label.view(-1).long())
                rois_target = Variable(
                    rois_target.view(-1, rois_target.size(2)))
                rois_inside_ws = Variable(
                    rois_inside_ws.view(-1, rois_inside_ws.size(2)))
                rois_outside_ws = Variable(
                    rois_outside_ws.view(-1, rois_outside_ws.size(2)))
            else:
                rois_label = None
                rois_target = None
                rois_inside_ws = None
                rois_outside_ws = None
                rpn_loss_cls = 0
                rpn_loss_bbox = 0

            rois = Variable(rois)
            # do roi pooling based on predicted rois
        else:
            rois_label = None
            rois_target = None
            rois_inside_ws = None
            rois_outside_ws = None
            rpn_loss_cls = 0
            rpn_loss_bbox = 0
            raw_rois = torch.zeros(gt_boxes.size())
            # use the ground-truth boxes directly as rois; only image index 0
            # is filled in, so this branch assumes a batch size of 1
            raw_rois[0, :, 1:] = gt_boxes[0, :, :4]
            rois = Variable(raw_rois).cuda()

        if cfg.POOLING_MODE == 'crop':
            # pdb.set_trace()
            # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
            grid_xy = _affine_grid_gen(rois.view(-1, 5),
                                       base_feat.size()[2:], self.grid_size)
            grid_yx = torch.stack(
                [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
                3).contiguous()
            pooled_feat = self.RCNN_roi_crop(base_feat,
                                             Variable(grid_yx).detach())
            if cfg.CROP_RESIZE_WITH_MAX_POOL:
                pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
        elif cfg.POOLING_MODE == 'align':
            pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
        elif cfg.POOLING_MODE == 'pool':
            pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

        # feed pooled features to top model
        fc7 = self._head_to_tail(pooled_feat)

        return fc7
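
A toy illustration of the use_rpn=False branch above: the ground-truth boxes are recast as rois of the form [image_index, x1, y1, x2, y2] (the image index stays 0, which is why that branch assumes a single-image batch). Runnable stand-alone:

import torch

gt_boxes = torch.zeros(1, 20, 5)                           # (b=1, 20, 5) as in Example #3
gt_boxes[0, 0] = torch.tensor([48., 32., 224., 160., 1.])  # x1, y1, x2, y2, class
raw_rois = torch.zeros(gt_boxes.size())
raw_rois[0, :, 1:] = gt_boxes[0, :, :4]                    # drop the class, prepend index 0
print(raw_rois[0, 0])                                      # tensor([  0.,  48.,  32., 224., 160.])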