def test_fast_rcnn_empty_batch(self):
    """Exercise the Fast R-CNN output layers with zero proposals.

    With empty prediction tensors every loss must be all zeros while
    gradients still reach both inputs, and inference must yield an empty
    list of per-image results.
    """
    predictor = FastRCNNOutputLayers(
        ShapeSpec(channels=10),
        Box2BoxTransform(weights=(10, 10, 5, 5)),
        8,
    )

    # Zero rows in both tensors: there is not a single proposal.
    logits = torch.randn(0, 100, requires_grad=True)
    deltas = torch.randn(0, 4, requires_grad=True)

    losses = predictor.losses([logits, deltas], [])
    for loss_value in losses.values():
        expected = torch.zeros_like(loss_value)
        self.assertTrue(torch.allclose(loss_value, expected))

    # Backward through the (zero) total loss must still populate .grad.
    total_loss = sum(losses.values())
    total_loss.backward()
    for leaf in (logits, deltas):
        self.assertTrue(leaf.grad is not None)

    results, _ = predictor.inference([logits, deltas], [])
    self.assertEqual(len(results), 0)
class RelationROIHeads(Res5ROIHeads):
    """Res5 ROI heads combined with a learnable NMS module (``LearnNMSModule``).

    Box classification/regression is done by ``FastRCNNOutputLayers`` on the
    spatially averaged res5 features; the same pooled features are projected
    by ``fc_feat`` and handed to ``nms_module`` together with the predictions.
    """

    def __init__(self, cfg, input_shape):
        """
        Args:
            cfg: config node. Reads ``MODEL.RPN.PRE_NMS_TOPK_*``,
                ``MODEL.RELATIONNET.*`` (including ``NUM_RELATION``, the
                number of relation modules, each with separate parameters),
                ``MODEL.ROI_BOX_HEAD.*``, ``MODEL.ROI_HEADS.NUM_CLASSES`` and
                ``MODEL.DEVICE``.
            input_shape: forwarded unchanged to the parent constructor.
        """
        super().__init__(cfg, input_shape)

        # ---------------------------- parameters ----------------------------
        # NOTE(review): ``self.training`` is read here, at construction time,
        # so later ``.eval()`` / ``.train()`` calls do not update
        # ``pre_nms_dim`` or ``first_n`` -- confirm this is intended.
        if self.training:
            self.pre_nms_dim = cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN
        else:
            self.pre_nms_dim = cfg.MODEL.RPN.PRE_NMS_TOPK_TEST

        self.num_relation = cfg.MODEL.RELATIONNET.NUM_RELATION
        self.pos_emb_dim = cfg.MODEL.RELATIONNET.POS_EMB_DIM
        self.feat_dim = cfg.MODEL.RELATIONNET.FEAT_DIM
        self.att_fc_dim = cfg.MODEL.RELATIONNET.ATT_FC_DIM
        self.att_groups = cfg.MODEL.RELATIONNET.ATT_GROUPS
        self.att_dim = cfg.MODEL.RELATIONNET.ATT_DIM

        self.pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        self.num_reg_classes = self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES
        if cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG:
            self.num_reg_classes = 2

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.learn_nms_train = cfg.MODEL.RELATIONNET.LEARN_NMS_TRAIN
        self.learn_nms_test = cfg.MODEL.RELATIONNET.LEARN_NMS_TEST
        self.first_n = cfg.MODEL.RELATIONNET.FIRST_N_TEST
        # Overwritten by ``label_proposals`` on every training step.
        self.num_boxes = self.batch_size_per_image
        if self.training:
            self.first_n = cfg.MODEL.RELATIONNET.FIRST_N_TRAIN

        # ----------------------------- modules ------------------------------
        self.res5, self.res5_out_channels = self._build_res5_block(cfg)
        self.box_predictor = FastRCNNOutputLayers(
            cfg, ShapeSpec(channels=self.res5_out_channels, height=1, width=1))

        self.fc_feat = nn.Linear(self.res5_out_channels,
                                 self.feat_dim).to(self.device)
        # NOTE(review): a plain Python list is not registered by nn.Module,
        # so these two layers are absent from ``parameters()`` and
        # ``state_dict()``. They are not used by ``forward`` below; left
        # unchanged to keep checkpoints and optimizer contents as they are.
        self.fc = [
            nn.Linear(self.feat_dim, self.feat_dim).to(self.device)
            for i in range(2)
        ]

        self.nms_module = LearnNMSModule(cfg)

        # ------------------------ freeze parameters -------------------------
        # for block in self.res5:
        #     block.freeze()
        # for p in self.box_predictor.parameters():
        #     p.requires_grad = False

        # -------------------------- initialization --------------------------
        mean, std = 0.0, 0.01
        nn.init.normal_(self.fc_feat.weight, mean, std)
        nn.init.constant_(self.fc_feat.bias, mean)
        for i in range(2):
            nn.init.normal_(self.fc[i].weight, mean, std)
            nn.init.constant_(self.fc[i].bias, mean)

    def _build_attention_module_multi_head(self):
        """Build an ``AttentionModule`` from the dimensions stored on ``self``."""
        attention_module_multi_head = AttentionModule(
            self.att_fc_dim, self.pos_emb_dim, self.feat_dim, self.att_dim,
            self.att_groups, self.num_reg_classes, self.device)
        return attention_module_multi_head

    @torch.no_grad()
    def label_proposals(self, proposals, targets):
        """Select the top-scoring proposals per image and attach GT fields.

        Every image keeps the same number of proposals: the smallest proposal
        count found in the batch (stored in ``self.num_boxes``). The kept
        proposals are the ones with the highest ``objectness_logits``.

        Args:
            proposals (list[Instances]): per-image proposals carrying
                ``proposal_boxes`` and ``objectness_logits``.
            targets (list[Instances]): per-image ground truth carrying
                ``gt_boxes`` and ``gt_classes``.

        Returns:
            list[Instances]: the selected proposals, each with ``gt_classes``
            and the remaining ``gt_*`` fields of its matched target.
        """
        proposals_with_gt = []
        self.num_boxes = np.min([len(x.proposal_boxes) for x in proposals])

        for proposals_per_image, targets_per_image in zip(proposals, targets):
            has_gt = len(targets_per_image) > 0

            # Keep the ``num_boxes`` proposals with the highest objectness.
            _, indices = torch.sort(proposals_per_image.objectness_logits,
                                    descending=True)
            sampled_idxs = indices[:self.num_boxes]
            proposals_per_image = proposals_per_image[sampled_idxs]

            match_quality_matrix = pairwise_iou(
                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
            matched_idxs, matched_labels = self.proposal_matcher(
                match_quality_matrix)
            gt_classes = self._label_proposals(matched_idxs, matched_labels,
                                               targets_per_image.gt_classes)
            proposals_per_image.gt_classes = gt_classes

            if has_gt:
                # Copy every other "gt_*" field of the matched targets.
                for (trg_name,
                     trg_value) in targets_per_image.get_fields().items():
                    if trg_name.startswith(
                            "gt_") and not proposals_per_image.has(trg_name):
                        proposals_per_image.set(trg_name,
                                                trg_value[matched_idxs])
            else:
                # No ground truth in this image: use all-zero placeholders.
                gt_boxes = Boxes(
                    targets_per_image.gt_boxes.tensor.new_zeros(
                        (len(sampled_idxs), 4)))
                proposals_per_image.gt_boxes = gt_boxes

            proposals_with_gt.append(proposals_per_image)

        return proposals_with_gt

    def _label_proposals(self, matched_idxs, matched_labels, gt_classes):
        """Turn matcher output into one class label per proposal.

        Proposals whose matched label is ``<= 0`` get the background index
        ``self.num_classes``; those labelled ``-1`` are then set to ``-1``.
        When there is no ground truth every proposal is background.
        """
        has_gt = gt_classes.numel() > 0
        if has_gt:
            gt_classes = gt_classes[matched_idxs]
            gt_classes[matched_labels <= 0] = self.num_classes
            gt_classes[matched_labels == -1] = -1
        else:
            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
        return gt_classes

    def forward(self, images, features, proposals, targets=None):
        """
        Args:
            images (ImageList): unused, deleted immediately.
            features (dict[str, Tensor]): feature maps keyed by name, e.g.
                ["p2", "p3", "p4", "p5"] or ["res4"]; each of shape
                (N, C, H, W).
            proposals (list[Instances]): per-image proposals. Used fields:
                - proposal_boxes: proposed boxes as `Boxes`
                - objectness_logits: objectness score of each box
            targets (list[Instances], optional): length `N` list of
                ground-truth `Instances`; required during training. Used
                fields:
                - gt_boxes: the bounding box of each instance.
                - gt_classes: the label of each instance, in [0, #class].

        Returns:
            pred_instances (list[Instances]): detected instances per image.
                Returned during inference only; ``[]`` during training.
            loss (dict[str->Tensor]): named losses. Filled during training
                only; ``{}`` during inference.

        Raises:
            NotImplementedError: when training with ``LEARN_NMS_TRAIN``
                disabled.
        """
        # TODO: index the nms_multi_target to get the corresponding "first_n"
        #       complete the binary cross_entropy loss
        del images

        if self.training:
            assert targets
            proposals = self.label_proposals(proposals, targets)

        # proposal_boxes: List[Boxes]
        proposal_boxes = [x.proposal_boxes for x in proposals]

        # (all_valid_boxes, channels, outshape1, outshape2)
        box_features = self._shared_roi_transform(
            [features[f] for f in self.in_features], proposal_boxes)

        # Spatial average, shared by the relation branch and the box predictor.
        pooled_features = box_features.mean(dim=[2, 3])

        # TODO: add ground truth boxes in query
        fc_out = self.fc_feat(pooled_features)

        # ---------------------------- learn nms -----------------------------
        # Input is a set of detected objects: each has its final feature, a
        # classification score s0 and bounding boxes.
        #
        # The network has three steps.
        # 1. The feature and classification score are fused to generate the
        #    appearance feature.
        # 2. A relation module transforms the appearance features of all
        #    objects.
        # 3. The transformed features of each object pass a linear classifier
        #    and a sigmoid to output a probability in [0, 1].

        # predictions: (cls_score, bbox_pred)
        #  - scores (Tensor): (all_valid_boxes, num_classes + 1); index
        #    num_classes is background
        #  - proposal_deltas (Tensor): (all_valid_boxes, num_reg_classes * 4)
        predictions = self.box_predictor(pooled_features)

        if self.training and not self.learn_nms_train:
            # The original raised the undefined name ``NoImplementationError``
            # (which would surface as a NameError); use the builtin instead.
            raise NotImplementedError(
                "training should set learn_nms == True!")
        if not self.training and not self.learn_nms_test:
            # Plain inference path without the learned NMS.
            pred_instances, _ = self.box_predictor.inference(
                predictions, proposals)
            pred_instances = self.forward_with_given_boxes(
                features, pred_instances)
            return pred_instances, {}

        # nms_multi_score: (batch_images, first_n, num_classes, num_thresh)
        # sorted_boxes:    (batch_images, first_n, num_classes, 4)
        # sorted_score:    (batch_images, first_n, num_classes)
        nms_multi_score, sorted_boxes, sorted_score = self.nms_module(
            fc_out, predictions, proposal_boxes, self.num_boxes)

        # (batch_images, first_n, num_classes, num_thresh)
        # NOTE(review): ``targets`` is None at inference time; whether
        # ``get_multi_target`` accepts that cannot be seen from here.
        nms_multi_target = self.nms_module.get_multi_target(
            sorted_boxes, targets, sorted_score)
        nms_multi_target = nms_multi_target.detach()
        del targets

        # ------------------------- construct losses -------------------------
        if self.training:
            del features
            losses = self.box_predictor.losses(predictions, proposals)
            losses["loss_relation"] = self.nms_module.nms_relation_loss(
                nms_multi_score, nms_multi_target)
            return [], losses

        pred_instances = self.nms_module.relationnet_inference(
            sorted_boxes,
            nms_multi_score,
            nms_multi_target,
            image_shapes=[x.image_size for x in proposals],
        )
        pred_instances = self.forward_with_given_boxes(
            features, pred_instances)
        return pred_instances, {}
class GraphConnection(nn.Module):
    """Graph head linking region (box) features with stuff (semantic) features.

    Region features and per-class stuff centres are projected into a common
    space, passed through a fully connected ``GAT`` graph per image, and the
    refined nodes feed a new box predictor, a new mask head and a new
    semantic-segmentation scoring layer.
    """

    def __init__(
        self,
        cfg,
        input_shape,
    ):
        """
        Args:
            cfg: config node; reads ``GRAPH.*``, ``MODEL.SEM_SEG_HEAD.*``,
                ``MODEL.ROI_HEADS.NUM_CLASSES`` and ``MODEL.ROI_BOX_HEAD.FC_DIM``.
            input_shape: per-feature shape specs, used to build the mask pooler.
        """
        super(GraphConnection, self).__init__()
        self.cfg = cfg.clone()
        self.graph_channel = cfg.GRAPH.CHANNEL
        self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
        self.heads = cfg.GRAPH.HEADS
        self.stuff_out_channel = cfg.GRAPH.STUFF_OUT_CHANNEL
        self.loss_weight_stuff = cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT
        self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES

        # Input projections into the graph space.
        self.region_in_proj = nn.Linear(cfg.MODEL.ROI_BOX_HEAD.FC_DIM,
                                        self.graph_channel)
        self.stuff_in_proj = nn.Linear(cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM * 4,
                                       self.graph_channel)
        weight_init.c2_xavier_fill(self.region_in_proj)
        weight_init.c2_xavier_fill(self.stuff_in_proj)

        self.graph = GAT(nfeat=self.graph_channel,
                         nhid=self.graph_channel // self.heads,
                         nclass=self.graph_channel,
                         dropout=0.1,
                         alpha=0.4,
                         nheads=self.heads)

        # New box head
        self.region_out_proj = nn.Linear(self.graph_channel,
                                         self.graph_channel)
        weight_init.c2_xavier_fill(self.region_out_proj)
        # TODO: hard code in the channels
        box_output_shape = ShapeSpec(channels=cfg.MODEL.ROI_BOX_HEAD.FC_DIM +
                                     self.graph_channel)
        self.new_box_predictor = FastRCNNOutputLayers(cfg, box_output_shape)

        # New mask head
        # NOTE(review): ``_init_mask_head`` returns ``{}`` when
        # cfg.MODEL.MASK_ON is false, in which case the lookups below raise
        # KeyError -- this module effectively requires MASK_ON.
        ret_dict = self._init_mask_head(cfg, input_shape)
        self.mask_in_features = ret_dict["mask_in_features"]
        self.new_mask_pooler = ret_dict["mask_pooler"]
        self.new_mask_head = ret_dict["mask_head"]

        # New segment head
        self.stuff_out_proj = nn.Linear(self.graph_channel,
                                        self.stuff_out_channel)
        self.seg_score = nn.Conv2d(
            cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM * 4 + self.stuff_out_channel,
            cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 1)
        self.upsample_rate = 4
        weight_init.c2_xavier_fill(self.stuff_out_proj)
        weight_init.c2_xavier_fill(self.seg_score)

    @classmethod
    def _init_mask_head(cls, cfg, input_shape):
        """Build the mask pooler and mask head described by ``cfg``.

        Returns:
            dict: empty when ``cfg.MODEL.MASK_ON`` is false; otherwise holds
            "mask_in_features", "mask_pooler" (``None`` when no pooler type is
            configured) and "mask_head".
        """
        if not cfg.MODEL.MASK_ON:
            return {}
        # fmt: off
        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
        pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
        sampling_ratio    = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
        pooler_type       = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE
        # fmt: on

        # Only the channel count of the first input feature is used.
        in_channels = [input_shape[f].channels for f in in_features][0]

        ret = {"mask_in_features": in_features}
        ret["mask_pooler"] = (ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        ) if pooler_type else None)
        if pooler_type:
            shape = ShapeSpec(channels=in_channels,
                              width=pooler_resolution,
                              height=pooler_resolution)
        else:
            shape = {f: input_shape[f] for f in in_features}
        ret["mask_head"] = build_mask_head(cfg, shape)
        return ret

    def forward(
        self,
        region_f,
        proposals,
        features,
        stuff_f,
        semseg_score,
        semseg_targets,
        images=None,
        seg_result=None,
        c2d=None,
        ori_sizes=None,
        img_ids=None,
    ):
        """Refine box, mask and semantic predictions through the graph.

        Args:
            region_f: region features, one row per proposal over the batch.
            proposals: per-image proposals (containing the gt in training).
            features: FPN features, indexed by ``self.mask_in_features``.
            stuff_f: stuff features, 4-D with the batch in dim 0.
            semseg_score: predicted semantic scores, (bs, classes, h, w).
            semseg_targets: semantic segmentation ground truth (training).
            images, seg_result, c2d, ori_sizes, img_ids: accepted but unused
                by this method.

        Returns:
            ``(None, None, losses)`` in training, where every loss key is
            prefixed with "new_"; ``(instances, segments, None)`` otherwise.
        """
        assert len(proposals) == len(stuff_f)
        bs, _, h, w = semseg_score.shape
        proposals_num = [len(p) for p in proposals]
        assert sum(proposals_num) == len(region_f)

        region_nodes = self.region_in_proj(region_f)
        # Per-class centre: spatial-softmax-weighted average of stuff features.
        class_center = torch.matmul(
            F.softmax(semseg_score.flatten(start_dim=2),
                      dim=-1),  # softmax along hw
            stuff_f.flatten(start_dim=2).transpose(1, 2))  # bs x cls x channels
        class_nodes = self.stuff_in_proj(class_center)

        region_nodes_split = region_nodes.split(proposals_num)
        new_region_nodes, new_class_nodes = [], []
        for i in range(bs):
            region_node_per_img = region_nodes_split[i]
            stuff_node_per_img = class_nodes[i]
            nodes_num = len(region_node_per_img) + len(stuff_node_per_img)
            # Fully connected graph. Created on the same device as the node
            # features instead of a hard-coded ``.cuda()``, so the module
            # also runs on CPU or on a non-default GPU.
            adj = torch.ones(nodes_num,
                             nodes_num,
                             device=region_node_per_img.device)
            graph_nodes = self.graph(
                torch.cat([region_node_per_img, stuff_node_per_img]), adj)
            new_region_f_per_img, new_stuff_f_per_img = graph_nodes.split(
                [len(region_node_per_img),
                 len(stuff_node_per_img)])
            new_region_nodes.append(new_region_f_per_img)
            new_class_nodes.append(new_stuff_f_per_img)

        # Box branch: original region features + graph-refined features.
        new_region_f = torch.cat(
            [region_f,
             self.region_out_proj(torch.cat(new_region_nodes))],
            dim=-1)
        new_prediction = self.new_box_predictor(new_region_f)

        if self.training:
            losses_box = self.new_box_predictor.losses(new_prediction,
                                                       proposals)
            # The mask branch only sees the foreground proposals.
            instances, _ = select_foreground_proposals(proposals,
                                                       self.num_classes)
            features = [features[f] for f in self.mask_in_features]
            boxes = [x.proposal_boxes for x in instances]
            features = self.new_mask_pooler(features, boxes)
            losses_mask = self.new_mask_head(features, instances)
        else:
            pred_instances, _ = self.new_box_predictor.inference(
                new_prediction, proposals)
            assert pred_instances[0].has(
                "pred_boxes") and pred_instances[0].has("pred_classes")
            features = [features[f] for f in self.mask_in_features]
            boxes = [x.pred_boxes for x in pred_instances]
            features = self.new_mask_pooler(features, boxes)
            instances = self.new_mask_head(features, pred_instances)

        # Stuff branch: scatter the refined class nodes back onto the pixels.
        new_class_nodes = self.stuff_out_proj(torch.stack(new_class_nodes))
        new_stuff_f = torch.matmul(
            F.softmax(semseg_score.flatten(start_dim=2),
                      dim=1).permute(0, 2, 1),  # softmax along nodes
            new_class_nodes).permute(0, 2, 1).view(bs, self.stuff_out_channel,
                                                   h, w)
        semseg_score = self.seg_score(torch.cat([stuff_f, new_stuff_f], dim=1))
        semseg_score = semseg_score.float()
        segments = F.interpolate(semseg_score,
                                 None,
                                 self.upsample_rate,
                                 mode='bilinear',
                                 align_corners=False)

        if not self.training:
            return instances, segments, None

        loss = F.cross_entropy(segments,
                               semseg_targets,
                               reduction="mean",
                               ignore_index=self.ignore_value)
        losses_sem = {"new_loss_sem_seg": loss * self.loss_weight_stuff}

        losses_box.update(losses_mask)
        losses_box.update(losses_sem)
        # Prefix every remaining key with "new_".
        key_list = list(losses_box.keys())
        for key in key_list:
            if 'new' not in key:
                losses_box["new_" + key] = losses_box.pop(key)
        return None, None, losses_box


'''
###############################
'''
class AttributeStandardROIHeads(AttributeROIHeads, StandardROIHeads):
    """StandardROIHeads variant that additionally predicts attributes."""

    def __init__(self, cfg, input_shape):
        # Start the MRO lookup after StandardROIHeads, so its own __init__
        # is skipped; the three heads are then built explicitly below.
        super(StandardROIHeads, self).__init__(cfg, input_shape)
        for build_head in (
            self._init_box_head,
            self._init_mask_head,
            self._init_keypoint_head,
        ):
            build_head(cfg, input_shape)

    def _init_box_head(self, cfg, input_shape):
        """Create the box pooler, box head, box predictor and, when
        ``cfg.MODEL.ATTRIBUTE_ON`` is set, the attribute predictor."""
        box_cfg = cfg.MODEL.ROI_BOX_HEAD
        resolution = box_cfg.POOLER_RESOLUTION
        scales = tuple(1.0 / input_shape[name].stride for name in self.in_features)
        ratio = box_cfg.POOLER_SAMPLING_RATIO
        kind = box_cfg.POOLER_TYPE
        self.train_on_pred_boxes = box_cfg.TRAIN_ON_PRED_BOXES
        self.attribute_on = cfg.MODEL.ATTRIBUTE_ON

        # Every input feature map must have the same channel count.
        channel_counts = [input_shape[name].channels for name in self.in_features]
        assert len(set(channel_counts)) == 1, channel_counts
        channels = channel_counts[0]

        self.box_pooler = ROIPooler(
            output_size=resolution,
            scales=scales,
            sampling_ratio=ratio,
            pooler_type=kind,
        )
        pooled_shape = ShapeSpec(channels=channels, height=resolution, width=resolution)
        self.box_head = build_box_head(cfg, pooled_shape)
        self.box_predictor = FastRCNNOutputLayers(cfg, self.box_head.output_shape)

        if self.attribute_on:
            self.attribute_predictor = AttributePredictor(
                cfg, self.box_head.output_shape.channels
            )

    def _forward_box(self, features, proposals):
        """Run the box branch.

        Returns:
            In training, the dict of box losses (plus attribute losses when
            enabled). Otherwise the predicted instances and the kept indices
            of the first image only.
        """
        feature_maps = [features[name] for name in self.in_features]
        boxes = [p.proposal_boxes for p in proposals]
        box_features = self.box_pooler(feature_maps, boxes)
        # The box head returns a pair here; only the first element is used.
        box_features, _ = self.box_head(box_features)
        predictions = self.box_predictor(box_features)

        if not self.training:
            pred_instances, r_indices = self.box_predictor.inference(
                predictions, proposals
            )
            return pred_instances[0], r_indices[0]

        if self.train_on_pred_boxes:
            # Replace the proposal boxes by the predicted boxes, no gradient.
            with torch.no_grad():
                pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
                    predictions, proposals
                )
                for per_image, boxes_per_image in zip(proposals, pred_boxes):
                    per_image.proposal_boxes = Boxes(boxes_per_image)

        losses = self.box_predictor.losses(predictions, proposals)
        if self.attribute_on:
            attribute_losses = self.forward_attribute_loss(proposals, box_features)
            losses.update(attribute_losses)
        del box_features
        return losses

    def get_conv5_features(self, features):
        """Return the single input feature map (exactly one must be configured)."""
        assert len(self.in_features) == 1
        selected = [features[name] for name in self.in_features]
        return selected[0]

    def get_roi_features(self, features, proposals):
        """Return the pooled box features and the two outputs of the box head."""
        feature_maps = [features[name] for name in self.in_features]
        boxes = [p.proposal_boxes for p in proposals]
        box_features = self.box_pooler(feature_maps, boxes)
        fc7, fc6 = self.box_head(box_features)
        return box_features, fc7, fc6
class AttributeRes5ROIHeads(AttributeROIHeads, Res5ROIHeads):
    """
    An extension of Res5ROIHeads to include attribute prediction and an
    optional feature-extraction mode (``cfg.MODEL.BUA.EXTRACT_FEATS``).
    """

    def __init__(self, cfg, input_shape):
        # super(Res5ROIHeads, self).__init__(cfg, input_shape) # d2 0.1.1
        super(Res5ROIHeads, self).__init__(cfg)  # d2 0.2.1
        # added to fit d2 0.2.1
        self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES

        assert len(self.in_features) == 1

        # fmt: off
        pooler_resolution   = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_type         = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        pooler_scales       = (1.0 / input_shape[self.in_features[0]].stride, )
        sampling_ratio      = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        self.mask_on        = cfg.MODEL.MASK_ON
        self.attribute_on   = cfg.MODEL.BUA.ATTRIBUTE_ON
        self.extract_on     = cfg.MODEL.BUA.EXTRACT_FEATS
        self.extractor_mode = cfg.MODEL.BUA.EXTRACTOR.MODE
        # fmt: on
        assert not cfg.MODEL.KEYPOINT_ON

        self.pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )
        self.box2box_transform = BUABox2BoxTransform(
            weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)

        self.res5, out_channels = self._build_res5_block(cfg)
        self.box_predictor = FastRCNNOutputLayers(
            cfg, ShapeSpec(channels=out_channels, height=1, width=1)
        )

        if self.mask_on:
            self.mask_head = build_mask_head(
                cfg,
                ShapeSpec(channels=out_channels, width=pooler_resolution,
                          height=pooler_resolution),
            )

        if self.attribute_on:
            self.attribute_predictor = AttributePredictor(cfg, out_channels)

    def forward(self, images, features, proposals, targets=None):
        """Run the heads.

        Returns:
            * training: ``([], losses)``.
            * extraction (``extract_on``), mode 1 or 3: ``(proposal_boxes,
              probs, pooled_features)`` plus ``attr_scores`` as a fourth
              element when attributes are enabled; mode 2: ``(boxes, probs)``.
            * otherwise: ``(pred_instances, {})``.

        Raises:
            ValueError: in extraction mode with an unknown extractor mode.
        """
        del images

        if self.training:
            assert targets
            proposals = self.label_and_sample_proposals(proposals, targets)
        del targets

        proposal_boxes = [x.proposal_boxes for x in proposals]
        box_features = self._shared_roi_transform(
            [features[f] for f in self.in_features], proposal_boxes
        )
        # One pooled row per entry of ``proposals``.
        feature_pooled = box_features.mean(dim=[2, 3])
        predictions = self.box_predictor(feature_pooled)

        if self.training:
            del features
            losses = self.box_predictor.losses(predictions, proposals)
            if self.mask_on:
                # Keep the foreground subset in its own variable: the
                # attribute loss below is fed ``feature_pooled``, which has
                # one row per *unfiltered* proposal, so ``proposals`` must
                # not be overwritten here.
                fg_proposals, fg_selection_masks = select_foreground_proposals(
                    proposals, self.num_classes
                )
                mask_features = box_features[torch.cat(fg_selection_masks, dim=0)]
                del box_features
                losses.update(self.mask_head(mask_features, fg_proposals))
            if self.attribute_on:
                losses.update(self.forward_attribute_loss(proposals, feature_pooled))
            return [], losses

        if self.extract_on:
            pred_class_logits, pred_proposal_deltas = predictions
            # pred_class_logits = pred_class_logits[:, :-1]  # background is last
            cls_lables = torch.argmax(pred_class_logits, dim=1)
            num_preds_per_image = [len(p) for p in proposals]

            if self.extractor_mode in (1, 3):
                if self.attribute_on:
                    attr_scores = self.forward_attribute_score(feature_pooled, cls_lables)
                    return (
                        proposal_boxes,
                        self.predict_probs(pred_class_logits, num_preds_per_image),
                        feature_pooled.split(num_preds_per_image, dim=0),
                        attr_scores.split(num_preds_per_image, dim=0),
                    )
                return (
                    proposal_boxes,
                    self.predict_probs(pred_class_logits, num_preds_per_image),
                    feature_pooled.split(num_preds_per_image, dim=0),
                )
            if self.extractor_mode == 2:
                return (
                    self.predict_boxes(proposals, pred_proposal_deltas,
                                       num_preds_per_image),
                    self.predict_probs(pred_class_logits, num_preds_per_image),
                )
            raise ValueError('BUA.EXTRATOR.MODE ERROR')

        pred_instances, _ = self.box_predictor.inference(predictions, proposals)
        pred_instances = self.forward_with_given_boxes(features, pred_instances)
        return pred_instances, {}

    def get_conv5_features(self, features):
        """Apply the res5 block to the first configured input feature map."""
        features = [features[f] for f in self.in_features]
        return self.res5(features[0])

    def get_roi_features(self, features, proposals):
        """Return ``(box_features, spatially averaged box_features, None)``."""
        assert len(self.in_features) == 1

        features = [features[f] for f in self.in_features]
        box_features = self._shared_roi_transform(
            features, [x.proposal_boxes for x in proposals]
        )
        pooled_features = box_features.mean(dim=[2, 3])
        return box_features, pooled_features, None

    def predict_boxes(self, proposals, pred_proposal_deltas, num_preds_per_image):
        """
        Returns:
            list[Tensor]: A list of Tensors of predicted class-specific or
                class-agnostic boxes for each image. Element i has shape
                (Ri, K * B) or (Ri, B), where Ri is the number of predicted
                objects for image i and B is the box dimension (4 or 5)
        """
        # Always use 1 image per worker during inference since this is the
        # standard when reporting inference time in papers.
        box_type = type(proposals[0].proposal_boxes)
        # cat(..., dim=0) concatenates over all images in the batch
        proposals = box_type.cat([p.proposal_boxes for p in proposals])

        num_pred = len(proposals)
        B = proposals.tensor.shape[1]
        K = pred_proposal_deltas.shape[1] // B
        boxes = self.box2box_transform.apply_deltas(
            pred_proposal_deltas,
            proposals.tensor,
        )
        return boxes.view(num_pred, K * B).split(num_preds_per_image, dim=0)

    def predict_probs(self, pred_class_logits, num_preds_per_image):
        """
        Returns:
            list[Tensor]: A list of Tensors of predicted class probabilities
                for each image, with the last (background) column dropped.
                Element i has Ri rows, where Ri is the number of predicted
                objects for image i.
        """
        probs = F.softmax(pred_class_logits, dim=-1)
        probs = probs[:, :-1]  # background is last
        return probs.split(num_preds_per_image, dim=0)
class AttributeRes5ROIHeads(AttributeROIHeads, Res5ROIHeads):
    """
    An extension of Res5ROIHeads to include attribute prediction.
    """

    def __init__(self, cfg, input_shape):
        super(Res5ROIHeads, self).__init__(cfg, input_shape)

        assert len(self.in_features) == 1

        # fmt: off
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        pooler_scales     = (1.0 / input_shape[self.in_features[0]].stride, )
        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        self.mask_on      = cfg.MODEL.MASK_ON
        self.attribute_on = cfg.MODEL.ATTRIBUTE_ON
        if self.attribute_on:
            self.attribute_thre = cfg.MODEL.ATTRIBUTE_THRE
        # fmt: on
        assert not cfg.MODEL.KEYPOINT_ON

        self.pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )

        self.res5, out_channels = self._build_res5_block(cfg)
        self.box_predictor = FastRCNNOutputLayers(
            cfg, ShapeSpec(channels=out_channels, height=1, width=1)
        )

        if self.mask_on:
            self.mask_head = build_mask_head(
                cfg,
                ShapeSpec(channels=out_channels, width=pooler_resolution,
                          height=pooler_resolution),
            )

        if self.attribute_on:
            self.attribute_predictor = AttributePredictor(cfg, out_channels)

    def forward(self, images, features, proposals, targets=None):
        """Run the heads.

        Returns:
            ``([], losses)`` in training; ``(pred_instances, {})`` otherwise.
            At inference only the first image's instances receive the
            ``pred_attributes`` and ``attr_scores`` fields.
        """
        del images

        if self.training:
            assert targets
            proposals = self.label_and_sample_proposals(proposals, targets)
        del targets

        proposal_boxes = [x.proposal_boxes for x in proposals]
        box_features = self._shared_roi_transform(
            [features[f] for f in self.in_features], proposal_boxes
        )
        # One pooled row per entry of ``proposals``.
        feature_pooled = box_features.mean(dim=[2, 3])
        predictions = self.box_predictor(feature_pooled)

        if self.training:
            del features
            losses = self.box_predictor.losses(predictions, proposals)
            if self.mask_on:
                # Keep the foreground subset in its own variable: the
                # attribute loss below is fed ``feature_pooled``, which has
                # one row per *unfiltered* proposal, so ``proposals`` must
                # not be overwritten here.
                fg_proposals, fg_selection_masks = select_foreground_proposals(
                    proposals, self.num_classes
                )
                mask_features = box_features[torch.cat(fg_selection_masks, dim=0)]
                del box_features
                losses.update(self.mask_head(mask_features, fg_proposals))
            if self.attribute_on:
                losses.update(self.forward_attribute_loss(proposals, feature_pooled))
            return [], losses

        pred_instances, chose_indices = self.box_predictor.inference(
            predictions, proposals)
        pred_instances = self.forward_with_given_boxes(features, pred_instances)

        # Only the first image is handled. Index with its tensor explicitly
        # rather than with the enclosing Python list, whose use as a tensor
        # index relies on deprecated sequence-as-tuple indexing.
        keep = chose_indices[0]
        num_kept = keep.size(0)

        # Defaults: one empty list per kept detection.
        pred_instances[0].pred_attributes = [[] for _ in range(num_kept)]
        pred_instances[0].attr_scores = [[] for _ in range(num_kept)]
        if self.attribute_on and num_kept != 0:
            attr_labels, attr_scores = self.predict_attrs(
                feature_pooled[keep],
                predictions[0][keep],
                self.attribute_thre
            )
            pred_instances[0].pred_attributes = attr_labels
            pred_instances[0].attr_scores = attr_scores
        return pred_instances, {}

    def predict_attrs(self, features, obj_probs, score_thresh=0.5):
        """Pick the best-scoring attribute for every detection.

        Args:
            features: per-detection features given to the attribute predictor.
            obj_probs: per-detection class scores; the argmax is used as the
                object label.
            score_thresh: accepted for interface compatibility; not used by
                this method.

        Returns:
            ``(attr_labels, attr_scores)``: the argmax attribute index per
            detection and, gathered along dim 1, its score.
        """
        obj_labels = torch.argmax(obj_probs, dim=1)
        attribute_scores = self.attribute_predictor(features, obj_labels)
        attr_labels = torch.argmax(attribute_scores, dim=1)
        attr_scores = attribute_scores.gather(1, attr_labels.unsqueeze(1))
        return attr_labels, attr_scores

    def get_conv5_features(self, features):
        """Apply the res5 block to the first configured input feature map."""
        features = [features[f] for f in self.in_features]
        return self.res5(features[0])
class WSRes5ROIHeads(ROIHeads):
    """
    ROI heads for a "C4"-style R-CNN: boxes and masks share one ROI crop
    and one Res5 block that computes the per-region features.
    """

    def __init__(self, cfg, input_shape):
        super().__init__(cfg)

        box_cfg = cfg.MODEL.ROI_BOX_HEAD
        self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
        resolution = box_cfg.POOLER_RESOLUTION
        kind = box_cfg.POOLER_TYPE
        scales = (1.0 / input_shape[self.in_features[0]].stride, )
        ratio = box_cfg.POOLER_SAMPLING_RATIO
        self.mask_on = cfg.MODEL.MASK_ON
        assert not cfg.MODEL.KEYPOINT_ON
        assert len(self.in_features) == 1

        self.pooler = ROIPooler(
            output_size=resolution,
            scales=scales,
            sampling_ratio=ratio,
            pooler_type=kind,
        )

        self.res5, res5_channels = self._build_res5_block(cfg)
        predictor_shape = ShapeSpec(channels=res5_channels, height=1, width=1)
        self.box_predictor = FastRCNNOutputLayers(cfg, predictor_shape)

        if self.mask_on:
            mask_shape = ShapeSpec(channels=res5_channels,
                                   width=resolution,
                                   height=resolution)
            self.mask_head = build_mask_head(cfg, mask_shape)

    def _build_res5_block(self, cfg):
        """Build the three-block res5 stage.

        Returns:
            tuple: ``(nn.Sequential of the blocks, number of output channels)``.
        """
        resnet_cfg = cfg.MODEL.RESNETS
        factor = 2**3  # res5 is 8x res2
        groups = resnet_cfg.NUM_GROUPS
        group_width = resnet_cfg.WIDTH_PER_GROUP
        bottleneck = groups * group_width * factor
        out_channels = resnet_cfg.RES2_OUT_CHANNELS * factor
        stride_in_1x1 = resnet_cfg.STRIDE_IN_1X1
        norm = resnet_cfg.NORM
        assert not resnet_cfg.DEFORM_ON_PER_STAGE[-1], \
            "Deformable conv is not yet supported in res5 head."

        stage = make_stage(
            BottleneckBlock,
            3,
            stride_per_block=[1, 1, 2],
            has_pool_per_block=[False, False, True],
            in_channels=out_channels // 2,
            bottleneck_channels=bottleneck,
            out_channels=out_channels,
            num_groups=groups,
            norm=norm,
            stride_in_1x1=stride_in_1x1,
        )
        return nn.Sequential(*stage), out_channels

    def _shared_roi_transform(self, features, boxes):
        """Pool the boxes from the features, then apply the res5 block."""
        pooled = self.pooler(features, boxes)
        return self.res5(pooled)

    def forward(self, images, features, proposals, targets=None):
        """
        See :meth:`ROIHeads.forward`.
        """
        del images

        if self.training:
            assert targets
            proposals = self.label_and_sample_proposals(proposals, targets)
        del targets

        feature_list = [features[name] for name in self.in_features]
        boxes = [p.proposal_boxes for p in proposals]
        box_features = self._shared_roi_transform(feature_list, boxes)
        predictions = self.box_predictor(box_features.mean(dim=[2, 3]))

        if not self.training:
            pred_instances, _ = self.box_predictor.inference(
                predictions, proposals)
            pred_instances = self.forward_with_given_boxes(
                features, pred_instances)
            return pred_instances, {}

        del features
        losses = self.box_predictor.losses(predictions, proposals)
        if self.mask_on:
            proposals, fg_masks = select_foreground_proposals(
                proposals, self.num_classes)
            # The ROI transform is shared between boxes and masks, so the
            # features are not recomputed; the mask loss is defined on
            # foreground proposals only, hence the selection.
            fg_rows = torch.cat(fg_masks, dim=0)
            mask_features = box_features[fg_rows]
            del box_features
            losses.update(self.mask_head(mask_features, proposals))
        return [], losses

    def forward_with_given_boxes(self, features, instances):
        """
        Use the given boxes in `instances` to produce other (non-box)
        per-ROI outputs.

        Args:
            features: same as in `forward()`
            instances (list[Instances]): instances to predict other outputs.
                Expect the keys "pred_boxes" and "pred_classes" to exist.

        Returns:
            The output of the mask head when masks are enabled, otherwise
            `instances` unchanged.
        """
        assert not self.training
        assert instances[0].has("pred_boxes") and instances[0].has(
            "pred_classes")

        if not self.mask_on:
            return instances

        feature_list = [features[name] for name in self.in_features]
        pred_boxes = [inst.pred_boxes for inst in instances]
        roi_features = self._shared_roi_transform(feature_list, pred_boxes)
        return self.mask_head(roi_features, instances)