def _init_test_generalized_rcnn_transform(self):
    min_size = 100
    max_size = 200
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
    return transform
def test_not_float_normalize(self):
    transform = GeneralizedRCNNTransform(300, 500, torch.zeros(3), torch.ones(3))
    image = [torch.randint(0, 255, (3, 200, 300), dtype=torch.uint8)]
    targets = [{"boxes": torch.rand(3, 4)}]
    with pytest.raises(TypeError):
        out = transform(image, targets)  # noqa: F841
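# A minimal sketch of the behaviour the test above pins down: the transform
# normalizes in floating point, so integer images must be converted first.
# (This snippet is illustrative and not part of the test suite.)
import torch
from torchvision.models.detection.transform import GeneralizedRCNNTransform

transform = GeneralizedRCNNTransform(300, 500, torch.zeros(3), torch.ones(3))
image_uint8 = torch.randint(0, 255, (3, 200, 300), dtype=torch.uint8)
image_float = image_uint8.float() / 255.0  # scale into the expected 0-1 range
image_list, _ = transform([image_float], None)
print(image_list.tensors.shape)  # batched, padded tensor, e.g. [1, 3, 320, 480]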
def __init__(self_module):
    super(TransformModule, self_module).__init__()
    min_size = 800
    max_size = 1333
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    self_module.transform = GeneralizedRCNNTransform(
        min_size, max_size, image_mean, image_std)
def test_transform_copy_targets(self):
    transform = GeneralizedRCNNTransform(300, 500, torch.zeros(3), torch.ones(3))
    image = [torch.rand(3, 200, 300), torch.rand(3, 200, 200)]
    targets = [{'boxes': torch.rand(3, 4)}, {'boxes': torch.rand(2, 4)}]
    targets_copy = copy.deepcopy(targets)
    out = transform(image, targets)  # noqa: F841
    self.assertTrue(torch.equal(targets[0]['boxes'], targets_copy[0]['boxes']))
    self.assertTrue(torch.equal(targets[1]['boxes'], targets_copy[1]['boxes']))
def test_transform_copy_targets(self):
    transform = GeneralizedRCNNTransform(300, 500, torch.zeros(3), torch.ones(3))
    image = [torch.rand(3, 200, 300), torch.rand(3, 200, 200)]
    targets = [{"boxes": torch.rand(3, 4)}, {"boxes": torch.rand(2, 4)}]
    targets_copy = copy.deepcopy(targets)
    out = transform(image, targets)  # noqa: F841
    assert_equal(targets[0]["boxes"], targets_copy[0]["boxes"])
    assert_equal(targets[1]["boxes"], targets_copy[1]["boxes"])
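# Hedged sketch of what the two tests above verify: the transform rescales the
# boxes in the targets it returns, but leaves the caller's dicts untouched.
import torch
from torchvision.models.detection.transform import GeneralizedRCNNTransform

transform = GeneralizedRCNNTransform(300, 500, torch.zeros(3), torch.ones(3))
images = [torch.rand(3, 200, 300)]
targets = [{"boxes": torch.tensor([[10.0, 10.0, 50.0, 50.0]])}]
_, out_targets = transform(images, targets)
print(targets[0]["boxes"])      # original coordinates, unchanged
print(out_targets[0]["boxes"])  # coordinates scaled to the resized image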
def __init__(self, root, img_transform, extra_info=True):
    self.root = root
    self.img_transform = img_transform
    self.extra_info = extra_info
    self.anno = pd.read_csv(os.path.join(self.root, 'annotation.csv'))
    self.target_transform = GeneralizedRCNNTransform(
        400, 400, [0., 0., 0.], [1., 1., 1.])
class Detector(object):
    def __init__(self, weights_path):
        self.model = SSD('test')
        self.model.cuda().eval()
        state = torch.load(weights_path,
                           map_location=lambda storage, loc: storage)
        state = {key: value.float() for key, value in state.items()}
        self.model.load_state_dict(state)
        self.transform = GeneralizedRCNNTransform(DETECTOR_MIN_SIZE,
                                                  DETECTOR_MAX_SIZE,
                                                  DETECTOR_MEAN, DETECTOR_STD)
        self.transform.eval()

    def detect(self, images):
        images = torch.stack(
            [torch.from_numpy(image).cuda() for image in images])
        # NHWC -> NCHW
        images = images.transpose(1, 3).transpose(2, 3).float()
        original_image_sizes = [img.shape[-2:] for img in images]
        images, _ = self.transform(images, None)
        with torch.no_grad():
            detections_batch = self.model(images.tensors).cpu().numpy()
        result = []
        for detections, image_size in zip(detections_batch, images.image_sizes):
            scores = detections[1, :, 0]
            keep_idxs = scores > DETECTOR_THRESHOLD
            detections = detections[1, keep_idxs, :]
            # reorder to (x1, y1, x2, y2, score)
            detections = detections[:, [1, 2, 3, 4, 0]]
            # scale normalized coordinates to the transformed image size
            detections[:, 0] *= image_size[1]
            detections[:, 1] *= image_size[0]
            detections[:, 2] *= image_size[1]
            detections[:, 3] *= image_size[0]
            result.append({
                'scores': torch.from_numpy(detections[:, 4]),
                'boxes': torch.from_numpy(detections[:, :4])
            })
        result = self.transform.postprocess(result, images.image_sizes,
                                            original_image_sizes)
        return result
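# Illustrative sketch (not from the project above) of the postprocess step the
# Detector relies on: boxes predicted in resized-image coordinates are mapped
# back to the original resolution. Note postprocess is a no-op in train mode.
import torch
from torchvision.models.detection.transform import GeneralizedRCNNTransform

transform = GeneralizedRCNNTransform(800, 1333, [0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
transform.eval()
original_sizes = [(400, 600)]
image_list, _ = transform([torch.rand(3, 400, 600)], None)
detections = [{'boxes': torch.tensor([[0., 0., 100., 100.]]),
               'scores': torch.tensor([0.9])}]
restored = transform.postprocess(detections, image_list.image_sizes,
                                 original_sizes)
print(restored[0]['boxes'])  # roughly [[0., 0., 50., 50.]] for the 2x resize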
def __init__(self, backbone, num_classes,
             min_size=800, max_size=1333,
             image_mean=None, image_std=None,
             anchor_generator=None, head=None,
             proposal_matcher=None,
             score_thresh=0.05,
             nms_thresh=0.5,
             detections_per_img=300,
             fg_iou_thresh=0.5, bg_iou_thresh=0.4,
             topk_candidates=1000):
    super(RetinaNet, self).__init__()

    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")
    self.backbone = backbone

    assert isinstance(anchor_generator, (AnchorGenerator, type(None)))
    if anchor_generator is None:
        anchor_sizes = tuple((x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3)))
                             for x in [32, 64, 128, 256, 512])
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    self.anchor_generator = anchor_generator

    if head is None:
        head = RetinaNetHead(backbone.out_channels,
                             anchor_generator.num_anchors_per_location()[0],
                             num_classes)
    self.head = head

    if proposal_matcher is None:
        proposal_matcher = det_utils.Matcher(
            fg_iou_thresh,
            bg_iou_thresh,
            allow_low_quality_matches=True,
        )
    self.proposal_matcher = proposal_matcher

    self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    self.transform = GeneralizedRCNNTransform(min_size, max_size,
                                              image_mean, image_std)

    self.score_thresh = score_thresh
    self.nms_thresh = nms_thresh
    self.detections_per_img = detections_per_img
    self.topk_candidates = topk_candidates
    self.has_warned = False
def get_features_for_projection(model, imagePath, device):
    # these transform parameters are from the Mask R-CNN source code
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size=800, max_size=1333,
                                         image_mean=image_mean,
                                         image_std=image_std)
    image = Image.open(imagePath)
    image_tensor = TF.to_tensor(image)
    # keep it in a list (can be multiple)
    # TODO make it multiple
    images = [image_tensor]
    images, _ = transform(images)
    features = model.backbone(images.tensors.to(device))
    features_to_be_projected = features['pool']
    return features_to_be_projected
def get_features_for_projection_multi(model, imagePaths, device):
    # these transform parameters are from the Mask R-CNN source code
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size=800, max_size=1333,
                                         image_mean=image_mean,
                                         image_std=image_std)
    images = [Image.open(imagePath) for imagePath in imagePaths]
    image_tensors = [TF.to_tensor(image) for image in images]
    results = []
    with torch.no_grad():
        for tensor in image_tensors:
            images, _ = transform([tensor])
            features = model.backbone(images.tensors.to(device))
            features_to_be_projected = features['pool']
            results.append(features_to_be_projected[0])
    return results
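# Hypothetical usage of the helper above; the model constructor and image
# paths are assumptions for illustration. Any torchvision detection model
# whose backbone returns an OrderedDict with a 'pool' level should work.
import torch
import torchvision

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
model.eval().to(device)
feats = get_features_for_projection_multi(model, ['img1.jpg', 'img2.jpg'], device)
print([f.shape for f in feats])  # one pooled FPN feature map per image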
def __init__(self, backbone, rpn, roi_heads, mask_net, transform,
             input_img_num=6, depth_estimator_path='_depth_net.pth'):
    super(GeneralizedRCNN, self).__init__()
    self.transform = transform
    # note: the supplied backbone and mask_net arguments are replaced by
    # fixed UNet modules below
    self.backbone = UNet(4, 1)
    self.backbone_ = UNet(4, 64)
    self.input_img_num = input_img_num
    self.rpn = rpn
    self.roi_heads = roi_heads
    self.mask_net = UnetMask(6, 1)
    self.backbone_out_channels = 64
    self.depth_estimator_path = depth_estimator_path
    self.depth_estimator = VggDepthEstimator()
    self.depth_estimator.load_state_dict(
        torch.load(self.depth_estimator_path))
    self.depth_resize = nn.Upsample(size=(400, 400), mode='bilinear',
                                    align_corners=True)
    self.img_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((400, 400)),
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD)
    ])
    self.target_transform = GeneralizedRCNNTransform(
        400, 400, [0., 0., 0.], [1., 1., 1.])
    self.mask_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((800, 800)),
        transforms.ToTensor()
    ])
    self.depth_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((128, 416)),
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD)
    ])
def __init__(self, backbone, rpn, roi_heads, transform, input_img_num=6):
    super(DetectionGeneralizedRCNN, self).__init__()
    self.transform = transform
    self.backbone = backbone
    self.input_img_num = input_img_num
    self.rpn = rpn
    self.roi_heads = roi_heads
    self.backbone_out_channels = backbone.out_channels
    self.img_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((400, 400)),
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD)
    ])
    self.target_transform = GeneralizedRCNNTransform(
        400, 400, [0., 0., 0.], [1., 1., 1.])
def store_features_for_projection_multi(imagePaths, model_maskrcnn, outputs, sceneid):
    # these transform parameters are from the Mask R-CNN source code
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size=800, max_size=1333,
                                         image_mean=image_mean,
                                         image_std=image_std)
    images = [Image.open(imagePath) for imagePath in imagePaths]
    image_tensors = [TF.to_tensor(image) for image in images]
    images, _ = transform(image_tensors)
    with torch.no_grad():
        body = model_maskrcnn.body
        output = body(images.tensors)
    torch.save(output, outputs + sceneid + ".fea")
def __init__(self,
             backbone,
             num_classes=2, num_pids=5532, num_cq_size=5000,
             # transform parameters
             min_size=900, max_size=1500,
             image_mean=None, image_std=None,
             # Anchor settings
             anchor_scales=None, anchor_ratios=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=12000, rpn_pre_nms_top_n_test=6000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=300,
             rpn_nms_thresh=0.7,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             # Box parameters
             rcnn_bbox_bn=True,
             box_roi_pool=None, box_head=None, box_predictor=None,
             box_score_thresh=0.05, box_nms_thresh=0.4,
             box_detections_per_img=300,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.1,
             box_batch_size_per_image=128, box_positive_fraction=0.5,
             bbox_reg_weights=None,
             # ReID parameters
             feat_head=None, reid_head=None, reid_loss=None):
    if rpn_anchor_generator is None:
        anchor_sizes = ((32, 64, 128, 256, 512),)
        aspect_ratios = ((0.5, 1.0, 2.0),)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        rpn_head = RPNHead(backbone.out_channels,
                           rpn_anchor_generator.num_anchors_per_location()[0])
    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)
    rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                rpn_batch_size_per_image,
                                rpn_positive_fraction,
                                rpn_pre_nms_top_n, rpn_post_nms_top_n,
                                rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=['feat2rpn'],
                                          output_size=[14, 14],
                                          sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 2048
        box_head = GAP_BOX_HEAD(resolution, feat_head, representation_size)
    if box_predictor is None:
        representation_size = 2048
        box_predictor = FastRCNNPredictor(representation_size, num_classes,
                                          RCNN_bbox_bn=False)
    if reid_head is None:
        reid_head = REID_HEAD(box_head.out_dims, 256)
    if reid_loss is None:
        reid_loss = OIMLoss(256, num_pids, num_cq_size, 0.5, 30)

    roi_heads = OIM_ROI_HEAD(
        reid_head, reid_loss,
        # box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super(FasterRCNN_OIM, self).__init__(backbone, rpn, roi_heads, transform)
def evaluate_yolo_2017(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter=" ")
    header = 'Test:'
    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)

    transform = GeneralizedRCNNTransform(416, 416, [0, 0, 0], [1, 1, 1])
    transform.eval()

    for image, targets in metric_logger.log_every(data_loader, 100, header):
        image = list(img.to(device) for img in image)
        original_image_sizes = [img.shape[-2:] for img in image]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        torch.cuda.synchronize()
        model_time = time.time()
        transformed_img = transform(image)
        transformed_shape = transformed_img[0].tensors.shape[-2:]
        inf_out, _ = model(transformed_img[0].tensors)

        # Run NMS
        output = non_max_suppression(inf_out, conf_thres=0.001, iou_thres=0.6)

        # Statistics per image
        predictions = []
        for si, pred in enumerate(output):
            prediction = {'boxes': [], 'labels': [], 'scores': []}
            if pred is None:
                continue

            # Append to text file
            # with open('test.txt', 'a') as file:
            #     [file.write('%11.5g' * 7 % tuple(x) + '\n') for x in pred]

            # Clip boxes to image bounds
            clip_coords(pred, transformed_shape)

            # Append to pycocotools JSON dictionary
            # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ...
            image_id = int(targets[si]['image_id'])
            box = pred[:, :4].clone()  # xyxy
            # scale_coords(transformed_shape, box, shapes[si][0], shapes[si][1])  # to original shape
            # box = xyxy2xywh(box)  # xywh
            # box[:, :2] -= box[:, 2:] / 2  # xy center to top-left corner
            for di, d in enumerate(pred):
                box_T = [floatn(x, 3) for x in box[di]]
                label = coco91class[int(d[5])]
                score = floatn(d[4], 5)
                prediction['boxes'].append(box_T)
                prediction['labels'].append(label)
                prediction['scores'].append(score)
            prediction['boxes'] = torch.tensor(prediction['boxes'])
            prediction['labels'] = torch.tensor(prediction['labels'])
            prediction['scores'] = torch.tensor(prediction['scores'])
            predictions.append(prediction)

        outputs = transform.postprocess(predictions,
                                        transformed_img[0].image_sizes,
                                        original_image_sizes)
        # move the postprocessed outputs to the CPU
        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {
            target["image_id"].item(): output
            for target, output in zip(targets, outputs)
        }
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time,
                             evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
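# Side note, sketched under the torchvision behaviour the evaluator above
# relies on: with image_mean=[0, 0, 0] and image_std=[1, 1, 1] the transform's
# normalization is the identity, so it only resizes and pads, keeping the
# 0-1 pixel range that YOLO-style models expect.
import torch
from torchvision.models.detection.transform import GeneralizedRCNNTransform

identity = GeneralizedRCNNTransform(416, 416, [0, 0, 0], [1, 1, 1])
identity.eval()
batch, _ = identity([torch.rand(3, 240, 320)], None)
print(batch.tensors.min().item(), batch.tensors.max().item())  # still within [0, 1]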
def __init__(self,
             backbone,
             num_classes=None,
             # transform parameters
             min_size=800, max_size=800,
             image_mean=None, image_std=None,
             # RPN parameters
             anchor_generator=None,
             # Box parameters
             box_head=None, box_predictor=None,
             box_score_thresh=0.05, box_nms_thresh=0.5,
             box_detections_per_img=100,
             box_fg_iou_thresh=0.15, box_bg_iou_thresh=0.15,
             box_batch_size_per_image=50, box_positive_fraction=0.5,
             bbox_reg_weights=None):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                "num_classes should be None when box_predictor is specified")
    else:
        if box_predictor is None:
            raise ValueError(
                "num_classes should not be None when box_predictor "
                "is not specified")

    out_channels = backbone.out_channels

    if anchor_generator is None:
        # anchor sizes per feature map level
        anchor_sizes = ((16,), (32,), (64,), (128,), (210,), (320,))
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

    num_anchors_per_location = anchor_generator.num_anchors_per_location()[0]
    ssd_predictor = SSDPredictor(out_channels, num_classes,
                                 num_anchors_per_location)
    ssd_head = SSDHead(
        # Box
        ssd_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights)
    bbox_reg_weights = ssd_head.bbox_reg_weight
    detection_filter = DetectionNmsPostprocessor(box_score_thresh,
                                                 box_nms_thresh,
                                                 bbox_reg_weights,
                                                 box_detections_per_img)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super(SSD, self).__init__(transform, backbone, anchor_generator,
                              ssd_head, detection_filter, num_classes)
def __init__(self, cfg):
    super(SeqNet, self).__init__()

    backbone, box_head = build_resnet(name="resnet50", pretrained=True)

    anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                       aspect_ratios=((0.5, 1.0, 2.0),))
    head = RPNHead(
        in_channels=backbone.out_channels,
        num_anchors=anchor_generator.num_anchors_per_location()[0],
    )
    pre_nms_top_n = dict(training=cfg.MODEL.RPN.PRE_NMS_TOPN_TRAIN,
                         testing=cfg.MODEL.RPN.PRE_NMS_TOPN_TEST)
    post_nms_top_n = dict(training=cfg.MODEL.RPN.POST_NMS_TOPN_TRAIN,
                          testing=cfg.MODEL.RPN.POST_NMS_TOPN_TEST)
    rpn = RegionProposalNetwork(
        anchor_generator=anchor_generator,
        head=head,
        fg_iou_thresh=cfg.MODEL.RPN.POS_THRESH_TRAIN,
        bg_iou_thresh=cfg.MODEL.RPN.NEG_THRESH_TRAIN,
        batch_size_per_image=cfg.MODEL.RPN.BATCH_SIZE_TRAIN,
        positive_fraction=cfg.MODEL.RPN.POS_FRAC_TRAIN,
        pre_nms_top_n=pre_nms_top_n,
        post_nms_top_n=post_nms_top_n,
        nms_thresh=cfg.MODEL.RPN.NMS_THRESH,
    )

    faster_rcnn_predictor = FastRCNNPredictor(2048, 2)
    reid_head = deepcopy(box_head)
    box_roi_pool = MultiScaleRoIAlign(featmap_names=["feat_res4"],
                                      output_size=14,
                                      sampling_ratio=2)
    box_predictor = BBoxRegressor(2048, num_classes=2,
                                  bn_neck=cfg.MODEL.ROI_HEAD.BN_NECK)
    roi_heads = SeqRoIHeads(
        # OIM
        num_pids=cfg.MODEL.LOSS.LUT_SIZE,
        num_cq_size=cfg.MODEL.LOSS.CQ_SIZE,
        oim_momentum=cfg.MODEL.LOSS.OIM_MOMENTUM,
        oim_scalar=cfg.MODEL.LOSS.OIM_SCALAR,
        # SeqNet
        faster_rcnn_predictor=faster_rcnn_predictor,
        reid_head=reid_head,
        # parent class
        box_roi_pool=box_roi_pool,
        box_head=box_head,
        box_predictor=box_predictor,
        fg_iou_thresh=cfg.MODEL.ROI_HEAD.POS_THRESH_TRAIN,
        bg_iou_thresh=cfg.MODEL.ROI_HEAD.NEG_THRESH_TRAIN,
        batch_size_per_image=cfg.MODEL.ROI_HEAD.BATCH_SIZE_TRAIN,
        positive_fraction=cfg.MODEL.ROI_HEAD.POS_FRAC_TRAIN,
        bbox_reg_weights=None,
        score_thresh=cfg.MODEL.ROI_HEAD.SCORE_THRESH_TEST,
        nms_thresh=cfg.MODEL.ROI_HEAD.NMS_THRESH_TEST,
        detections_per_img=cfg.MODEL.ROI_HEAD.DETECTIONS_PER_IMAGE_TEST,
    )

    transform = GeneralizedRCNNTransform(
        min_size=cfg.INPUT.MIN_SIZE,
        max_size=cfg.INPUT.MAX_SIZE,
        image_mean=[0.485, 0.456, 0.406],
        image_std=[0.229, 0.224, 0.225],
    )

    self.backbone = backbone
    self.rpn = rpn
    self.roi_heads = roi_heads
    self.transform = transform

    # loss weights
    self.lw_rpn_reg = cfg.SOLVER.LW_RPN_REG
    self.lw_rpn_cls = cfg.SOLVER.LW_RPN_CLS
    self.lw_proposal_reg = cfg.SOLVER.LW_PROPOSAL_REG
    self.lw_proposal_cls = cfg.SOLVER.LW_PROPOSAL_CLS
    self.lw_box_reg = cfg.SOLVER.LW_BOX_REG
    self.lw_box_cls = cfg.SOLVER.LW_BOX_CLS
    self.lw_box_reid = cfg.SOLVER.LW_BOX_REID
def __init__(self,
             num_classes=2,
             # transform parameters
             backbone_name='resnet50',
             min_size=256, max_size=512,
             image_mean=None, image_std=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
             rpn_nms_thresh=0.7,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             rpn_score_thresh=0.0,
             # Box parameters
             box_roi_pool=None, box_head=None, box_predictor=None,
             box_score_thresh=0.05, box_nms_thresh=0.5,
             box_detections_per_img=100,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
             box_batch_size_per_image=512, box_positive_fraction=0.25,
             bbox_reg_weights=None,
             # Ellipse regressor
             ellipse_roi_pool=None, ellipse_head=None,
             ellipse_predictor=None,
             ellipse_loss_metric="gaussian-angle"):
    backbone = resnet_fpn_backbone(backbone_name, pretrained=True,
                                   trainable_layers=5)

    # Input image is grayscale -> in_channels = 1 instead of 3 (COCO)
    backbone.body.conv1 = Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2),
                                 padding=(3, 3), bias=False)

    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")

    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                "num_classes should be None when box_predictor is specified")
    else:
        if box_predictor is None:
            raise ValueError(
                "num_classes should not be None when box_predictor "
                "is not specified")

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(out_channels,
                           rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                rpn_batch_size_per_image,
                                rpn_positive_fraction,
                                rpn_pre_nms_top_n, rpn_post_nms_top_n,
                                rpn_nms_thresh,
                                score_thresh=rpn_score_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                          output_size=7,
                                          sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution ** 2,
                              representation_size)
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(representation_size, num_classes)

    if ellipse_roi_pool is None:
        ellipse_roi_pool = MultiScaleRoIAlign(
            featmap_names=['0', '1', '2', '3'],
            output_size=7,
            sampling_ratio=2)
    if ellipse_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        ellipse_head = TwoMLPHead(out_channels * resolution ** 2,
                                  representation_size)
    if ellipse_predictor is None:
        representation_size = 1024
        ellipse_predictor = EllipseRegressor(representation_size, num_classes)

    roi_heads = EllipseRoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        # Ellipse
        ellipse_roi_pool=ellipse_roi_pool,
        ellipse_head=ellipse_head,
        ellipse_predictor=ellipse_predictor,
        ellipse_loss_metric=ellipse_loss_metric)

    if image_mean is None:
        image_mean = [0.156]
    if image_std is None:
        image_std = [0.272]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super().__init__(backbone, rpn, roi_heads, transform)
class RetinaNet(nn.Module):
    """
    Implements RetinaNet.

    The input to the model is expected to be a list of tensors, each of shape
    [C, H, W], one for each image, and should be in 0-1 range. Different
    images can have different sizes.

    The behavior of the model changes depending on whether it is in training
    or evaluation mode.

    During training, the model expects both the input tensors and a targets
    argument (list of dictionaries), containing:
        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in
          ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
          ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the class label for each ground-truth box

    The model returns a Dict[Tensor] during training, containing the
    classification and regression losses.

    During inference, the model requires only the input tensors, and returns
    the post-processed predictions as a List[Dict[Tensor]], one for each
    input image. The fields of the Dict are as follows:
        - boxes (``FloatTensor[N, 4]``): the predicted boxes in
          ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
          ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the predicted labels for each image
        - scores (Tensor[N]): the scores for each prediction

    Args:
        backbone (nn.Module): the network used to compute the features for
            the model. It should contain an out_channels attribute, which
            indicates the number of output channels that each feature map has
            (and it should be the same for all feature maps). The backbone
            should return a single Tensor or an OrderedDict[Tensor].
        num_classes (int): number of output classes of the model (including
            the background).
        min_size (int): minimum size of the image to be rescaled before
            feeding it to the backbone
        max_size (int): maximum size of the image to be rescaled before
            feeding it to the backbone
        image_mean (Tuple[float, float, float]): mean values used for input
            normalization. They are generally the mean values of the dataset
            on which the backbone has been trained
        image_std (Tuple[float, float, float]): std values used for input
            normalization. They are generally the std values of the dataset
            on which the backbone has been trained
        anchor_generator (AnchorGenerator): module that generates the anchors
            for a set of feature maps.
        head (nn.Module): Module run on top of the feature pyramid. Defaults
            to a module containing a classification and regression module.
        score_thresh (float): Score threshold used for postprocessing the
            detections.
        nms_thresh (float): NMS threshold used for postprocessing the
            detections.
        detections_per_img (int): Number of best detections to keep after NMS.
        fg_iou_thresh (float): minimum IoU between the anchor and the GT box
            so that they can be considered as positive during training.
        bg_iou_thresh (float): maximum IoU between the anchor and the GT box
            so that they can be considered as negative during training.
        topk_candidates (int): Number of best detections to keep before NMS.

    Example:

        >>> import torch
        >>> import torchvision
        >>> from torchvision.models.detection import RetinaNet
        >>> from torchvision.models.detection.anchor_utils import AnchorGenerator
        >>> # load a pre-trained model for classification and return
        >>> # only the features
        >>> backbone = torchvision.models.mobilenet_v2(pretrained=True).features
        >>> # RetinaNet needs to know the number of
        >>> # output channels in a backbone. For mobilenet_v2, it's 1280
        >>> # so we need to add it here
        >>> backbone.out_channels = 1280
        >>>
        >>> # let's make the network generate 5 x 3 anchors per spatial
        >>> # location, with 5 different sizes and 3 different aspect
        >>> # ratios. We have a Tuple[Tuple[int]] because each feature
        >>> # map could potentially have different sizes and
        >>> # aspect ratios
        >>> anchor_generator = AnchorGenerator(
        >>>     sizes=((32, 64, 128, 256, 512),),
        >>>     aspect_ratios=((0.5, 1.0, 2.0),)
        >>> )
        >>>
        >>> # put the pieces together inside a RetinaNet model
        >>> model = RetinaNet(backbone,
        >>>                   num_classes=2,
        >>>                   anchor_generator=anchor_generator)
        >>> model.eval()
        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
        >>> predictions = model(x)
    """
    __annotations__ = {
        'box_coder': det_utils.BoxCoder,
        'proposal_matcher': det_utils.Matcher,
    }

    def __init__(self,
                 backbone,
                 num_classes,
                 # transform parameters
                 min_size=800, max_size=1333,
                 image_mean=None, image_std=None,
                 # Anchor parameters
                 anchor_generator=None, head=None,
                 proposal_matcher=None,
                 score_thresh=0.05,
                 nms_thresh=0.5,
                 detections_per_img=300,
                 fg_iou_thresh=0.5, bg_iou_thresh=0.4,
                 topk_candidates=1000):
        super().__init__()

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")
        self.backbone = backbone

        assert isinstance(anchor_generator, (AnchorGenerator, type(None)))
        if anchor_generator is None:
            anchor_sizes = tuple(
                (x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3)))
                for x in [32, 64, 128, 256, 512])
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        self.anchor_generator = anchor_generator

        if head is None:
            head = RetinaNetHead(
                backbone.out_channels,
                anchor_generator.num_anchors_per_location()[0],
                num_classes)
        self.head = head

        if proposal_matcher is None:
            proposal_matcher = det_utils.Matcher(
                fg_iou_thresh,
                bg_iou_thresh,
                allow_low_quality_matches=True,
            )
        self.proposal_matcher = proposal_matcher

        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        self.transform = GeneralizedRCNNTransform(min_size, max_size,
                                                  image_mean, image_std)

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img
        self.topk_candidates = topk_candidates

        # used only on torchscript mode
        self._has_warned = False

    @torch.jit.unused
    def eager_outputs(self, losses, detections):
        # type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
        if self.training:
            return losses
        return detections

    def compute_loss(self, targets, head_outputs, anchors):
        # type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor]) -> Dict[str, Tensor]
        matched_idxs = []
        for anchors_per_image, targets_per_image in zip(anchors, targets):
            if targets_per_image['boxes'].numel() == 0:
                matched_idxs.append(
                    torch.full((anchors_per_image.size(0),), -1,
                               dtype=torch.int64,
                               device=anchors_per_image.device))
                continue

            match_quality_matrix = box_ops.box_iou(
                targets_per_image['boxes'], anchors_per_image)
            matched_idxs.append(self.proposal_matcher(match_quality_matrix))

        return self.head.compute_loss(targets, head_outputs, anchors,
                                      matched_idxs)

    def postprocess_detections(self, head_outputs, anchors, image_shapes):
        # type: (Dict[str, List[Tensor]], List[List[Tensor]], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
        class_logits = head_outputs['cls_logits']
        box_regression = head_outputs['bbox_regression']

        num_images = len(image_shapes)

        detections: List[Dict[str, Tensor]] = []

        for index in range(num_images):
            box_regression_per_image = [br[index] for br in box_regression]
            logits_per_image = [cl[index] for cl in class_logits]
            anchors_per_image, image_shape = anchors[index], image_shapes[index]

            image_boxes = []
            image_scores = []
            image_labels = []

            for box_regression_per_level, logits_per_level, anchors_per_level in \
                    zip(box_regression_per_image, logits_per_image, anchors_per_image):
                num_classes = logits_per_level.shape[-1]

                # remove low scoring boxes
                scores_per_level = torch.sigmoid(logits_per_level).flatten()
                keep_idxs = scores_per_level > self.score_thresh
                scores_per_level = scores_per_level[keep_idxs]
                topk_idxs = torch.where(keep_idxs)[0]

                # keep only topk scoring predictions
                num_topk = min(self.topk_candidates, topk_idxs.size(0))
                scores_per_level, idxs = scores_per_level.topk(num_topk)
                topk_idxs = topk_idxs[idxs]

                anchor_idxs = torch.div(topk_idxs, num_classes,
                                        rounding_mode='floor')
                labels_per_level = topk_idxs % num_classes

                boxes_per_level = self.box_coder.decode_single(
                    box_regression_per_level[anchor_idxs],
                    anchors_per_level[anchor_idxs])
                boxes_per_level = box_ops.clip_boxes_to_image(
                    boxes_per_level, image_shape)

                image_boxes.append(boxes_per_level)
                image_scores.append(scores_per_level)
                image_labels.append(labels_per_level)

            image_boxes = torch.cat(image_boxes, dim=0)
            image_scores = torch.cat(image_scores, dim=0)
            image_labels = torch.cat(image_labels, dim=0)

            # non-maximum suppression
            keep = box_ops.batched_nms(image_boxes, image_scores,
                                       image_labels, self.nms_thresh)
            keep = keep[:self.detections_per_img]

            detections.append({
                'boxes': image_boxes[keep],
                'scores': image_scores[keep],
                'labels': image_labels[keep],
            })

        return detections

    def forward(self, images, targets=None):
        # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
        """
        Args:
            images (list[Tensor]): images to be processed
            targets (list[Dict[Tensor]]): ground-truth boxes present in the
                image (optional)

        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the
                losses. During testing, it returns a list[BoxList] that
                contains additional fields like `scores`, `labels` and `mask`
                (for Mask R-CNN models).
        """
        if self.training and targets is None:
            raise ValueError("In training mode, targets should be passed")

        if self.training:
            assert targets is not None
            for target in targets:
                boxes = target["boxes"]
                if isinstance(boxes, torch.Tensor):
                    if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
                        raise ValueError(
                            "Expected target boxes to be a tensor "
                            "of shape [N, 4], got {:}.".format(boxes.shape))
                else:
                    raise ValueError(
                        "Expected target boxes to be of type "
                        "Tensor, got {:}.".format(type(boxes)))

        # get the original image sizes
        original_image_sizes: List[Tuple[int, int]] = []
        for img in images:
            val = img.shape[-2:]
            assert len(val) == 2
            original_image_sizes.append((val[0], val[1]))

        # transform the input
        images, targets = self.transform(images, targets)

        # Check for degenerate boxes
        # TODO: Move this to a function
        if targets is not None:
            for target_idx, target in enumerate(targets):
                boxes = target["boxes"]
                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
                if degenerate_boxes.any():
                    # print the first degenerate box
                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                    degen_bb: List[float] = boxes[bb_idx].tolist()
                    raise ValueError(
                        "All bounding boxes should have positive height and width."
                        " Found invalid box {} for target at index {}.".format(
                            degen_bb, target_idx))

        # get the features from the backbone
        features = self.backbone(images.tensors)
        if isinstance(features, torch.Tensor):
            features = OrderedDict([('0', features)])

        # TODO: Do we want a list or a dict?
        features = list(features.values())
        for idx in range(len(features)):
            features[idx] = features[idx].to(torch.float32)

        # compute the retinanet heads outputs using the features
        head_outputs = self.head(features)
        for key in head_outputs:
            head_outputs[key] = head_outputs[key].to(torch.float32)

        # create the set of anchors
        anchors = self.anchor_generator(images, features)

        losses = {}
        detections: List[Dict[str, Tensor]] = []
        if self.training:
            assert targets is not None
            # compute the losses
            losses = self.compute_loss(targets, head_outputs, anchors)
        else:
            # recover level sizes
            num_anchors_per_level = [x.size(2) * x.size(3) for x in features]
            HW = 0
            for v in num_anchors_per_level:
                HW += v
            HWA = head_outputs['cls_logits'].size(1)
            A = HWA // HW
            num_anchors_per_level = [hw * A for hw in num_anchors_per_level]

            # split outputs per level
            split_head_outputs: Dict[str, List[Tensor]] = {}
            for k in head_outputs:
                split_head_outputs[k] = list(
                    head_outputs[k].split(num_anchors_per_level, dim=1))
            split_anchors = [list(a.split(num_anchors_per_level))
                             for a in anchors]

            # compute the detections
            detections = self.postprocess_detections(split_head_outputs,
                                                     split_anchors,
                                                     images.image_sizes)
            detections = self.transform.postprocess(detections,
                                                    images.image_sizes,
                                                    original_image_sizes)

        if torch.jit.is_scripting():
            if not self._has_warned:
                warnings.warn(
                    "RetinaNet always returns a (Losses, Detections) tuple in scripting")
                self._has_warned = True
            return losses, detections
        return self.eager_outputs(losses, detections)
from torchvision.datasets.coco import CocoDetection
from torchvision.models.detection.transform import GeneralizedRCNNTransform
from torchvision.transforms import transforms
from pdetection import transform
from torch.utils.data.dataloader import DataLoader
import torch
from pdetection import utils

path_data = '/home/peng/Documents/srch/Object Detection/dataset/coco/val2017'
path_anno = ('/home/peng/Documents/srch/Object Detection/dataset/coco/'
             'annotations_trainval2017/annotations/instances_val2017.json')

coco_dset = CocoDetection(root=path_data, annFile=path_anno)

trans = transform.ODTransformer(800, 1333, [0.485, 0.456, 0.406],
                                [0.229, 0.224, 0.225])
trans_compare = GeneralizedRCNNTransform(800, 1333, [0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225])
# trans = transform.ODTransformer(800, 1333, [0.4, 0.5, 0.6], [0.5, 1, 2])

cocoloader = DataLoader(coco_dset, batch_size=3, collate_fn=utils.collate_fn)

for data in cocoloader:
    images, targets = data
    inputimgs = []
    totensor = transforms.ToTensor()
    for img in images:
        inputimgs.append(totensor(img))
    src_img_sizes = [img.shape[-2:] for img in inputimgs]
    # print([img.shape[-2:] for img in inputimgs])
    targets1 = utils.totargets(targets)
    targets2 = utils.totargets(targets)
    print(targets2[2]['boxes'])
    inputimgs_bk = [torch.zeros_like(img).copy_(img) for img in inputimgs]
def __init__(self,
             backbone,
             num_classes=None, num_pids=5532, num_cq_size=5000,
             # transform parameters
             min_size=900, max_size=1500,
             image_mean=None, image_std=None,
             # Anchor settings
             anchor_scales=None, anchor_ratios=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=12000, rpn_pre_nms_top_n_test=6000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=300,
             rpn_nms_thresh=0.7,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             # Box parameters
             box_roi_pool=None, feat_head=None, box_predictor=None,
             box_score_thresh=0.0, box_nms_thresh=0.4,
             box_detections_per_img=300,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.1,
             box_batch_size_per_image=128, box_positive_fraction=0.5,
             bbox_reg_weights=None,
             # ReID parameters
             embedding_head=None, reid_loss=None):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            'backbone should contain an attribute out_channels '
            'specifying the number of output channels (assumed to be the '
            'same for all the levels)')
    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                'num_classes should be None when box_predictor is specified')
    else:
        if box_predictor is None:
            raise ValueError(
                'num_classes should not be None when box_predictor '
                'is not specified')

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        if anchor_scales is None:
            anchor_scales = ((32, 64, 128, 256, 512),)
        if anchor_ratios is None:
            anchor_ratios = ((0.5, 1.0, 2.0),)
        rpn_anchor_generator = AnchorGenerator(anchor_scales, anchor_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(out_channels,
                           rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = self._set_rpn(rpn_anchor_generator, rpn_head,
                        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                        rpn_batch_size_per_image, rpn_positive_fraction,
                        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=['feat_res4'],
                                          output_size=14,
                                          sampling_ratio=2)
    if feat_head is None:
        raise ValueError('feat_head should be specified manually.')
        # resolution = box_roi_pool.output_size[0]
        # representation_size = 2048
        # ConvHead should be part of the backbone
        # feat_head = TwoMLPHead(
        #     out_channels * resolution ** 2,
        #     representation_size)
    if box_predictor is None:
        box_predictor = CoordRegressor(2048, num_classes)
    if embedding_head is None:
        embedding_head = ReIDEmbeddingProj(
            featmap_names=['feat_res4', 'feat_res5'],
            in_channels=[1024, 2048],
            dim=256)
    if reid_loss is None:
        reid_loss = HOIMLoss(256, num_pids, num_cq_size, 0.5, 30.0)

    roi_heads = self._set_roi_heads(
        embedding_head, reid_loss,
        box_roi_pool, feat_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super(FasterRCNN_HOIM, self).__init__(backbone, rpn, roi_heads, transform)
def __init__(self,
             backbone,
             n_channel_backbone=5,
             num_classes=None,
             # transform parameters
             min_size=800, max_size=1333,
             # min_size=720, max_size=1280,
             image_mean=None, image_std=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
             rpn_nms_thresh=0.5,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             anchor_sizes=[32, 64, 128, 256, 512],
             # Box parameters
             box_roi_pool=None, box_head=None, box_predictor=None,
             box_score_thresh=0.05, box_nms_thresh=0.4,
             box_detections_per_img=30,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
             box_batch_size_per_image=512, box_positive_fraction=0.25,
             bbox_reg_weights=None,
             weight_loss=False,
             use_soft_nms=False,
             use_context=False,
             use_track_branch=False):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")

    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                "num_classes should be None when box_predictor is specified")
    else:
        if box_predictor is None:
            raise ValueError(
                "num_classes should not be None when box_predictor "
                "is not specified")

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        ratios = ((0.5, 1.0, 2.0),)
        aspect_ratios = ratios * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(out_channels,
                           rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                rpn_batch_size_per_image,
                                rpn_positive_fraction,
                                rpn_pre_nms_top_n, rpn_post_nms_top_n,
                                rpn_nms_thresh, weight_loss)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                          output_size=7,
                                          sampling_ratio=2)
        if n_channel_backbone == 6:
            box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3, 4],
                                              output_size=7,
                                              sampling_ratio=2)

    representation_size1 = 1024
    representation_size2 = 1024
    track_embedding_size = 1024

    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        if use_context:
            box_head = TwoMLPHead(2 * out_channels * resolution ** 2,
                                  representation_size1, representation_size2)
        else:
            box_head = TwoMLPHead(out_channels * resolution ** 2,
                                  representation_size1, representation_size2)

    if use_track_branch:
        if use_context:
            track_embedding = TwoMLPHead(2 * out_channels * resolution ** 2,
                                         representation_size1,
                                         track_embedding_size)
        else:
            track_embedding = TwoMLPHead(out_channels * resolution ** 2,
                                         representation_size1,
                                         track_embedding_size)
    else:
        track_embedding = None

    if box_predictor is None:
        box_predictor = FastRCNNPredictor(representation_size1, num_classes)

    if num_classes > 2:
        use_soft_nms = False

    roi_heads = RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        weight_loss=weight_loss,
        use_soft_nms=use_soft_nms,
        use_context=use_context)

    if use_track_branch:
        track_heads = TrackHeads(box_roi_pool, box_head, box_predictor,
                                 box_fg_iou_thresh, box_bg_iou_thresh,
                                 box_batch_size_per_image,
                                 box_positive_fraction, bbox_reg_weights,
                                 weight_loss=False,
                                 use_context=False,
                                 track_embedding=track_embedding)
    else:
        track_heads = None

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, track_heads,
                                     transform, n_channel_backbone)
                    default=8, help='Index to the dataset for an example')
parser.add_argument('--outdir', type=str, default='examples_detection',
                    help='Folder for output images')

if __name__ == '__main__':
    args = parser.parse_args()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    root = args.root
    annfile = args.annfile

    # Load a maskRCNN finetuned on our birds
    network_transform = GeneralizedRCNNTransform(800, 1333, (0, 0, 0),
                                                 (1, 1, 1))
    backbone = resnet_fpn_backbone(backbone_name='resnet101', pretrained=False)
    model = MaskRCNN(backbone, num_classes=2)
    model.transform = network_transform
    model.eval()
    model.load_state_dict(torch.load('models/detector.pth'))
    model.to(device)

    # Load a data split
    normalize = T.Normalize(mean=[102.9801, 115.9465, 122.7717],
                            std=[1., 1., 1.])
    coco = COCO(annfile)

    # Load an image example
    available_Ids = coco.getImgIds()
    imgfile = coco.loadImgs(available_Ids[args.index])[0]['file_name']
def __init__(self,
             backbone,
             num_classes=None,
             # transform parameters
             min_size=800, max_size=1333,
             image_mean=None, image_std=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
             rpn_nms_thresh=0.7,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             # Box parameters
             box_roi_pool=None, box_head=None, box_predictor=None,
             box_score_thresh=0.05, box_nms_thresh=0.5,
             box_detections_per_img=100,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
             box_batch_size_per_image=512, box_positive_fraction=0.25,
             bbox_reg_weights=None):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")

    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                "num_classes should be None when box_predictor is specified")
    else:
        if box_predictor is None:
            raise ValueError(
                "num_classes should not be None when box_predictor "
                "is not specified")

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(out_channels,
                           rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                rpn_batch_size_per_image,
                                rpn_positive_fraction,
                                rpn_pre_nms_top_n, rpn_post_nms_top_n,
                                rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                          output_size=7,
                                          sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution ** 2,
                              representation_size)
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(representation_size, num_classes)

    roi_heads = RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
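# Hedged usage sketch for the FasterRCNN class above (it mirrors torchvision's
# implementation, so it should behave the same way); the backbone helper call
# follows the convention used elsewhere in this file.
import torch
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

backbone = resnet_fpn_backbone(backbone_name='resnet50', pretrained=False)
model = FasterRCNN(backbone, num_classes=3)
model.eval()
with torch.no_grad():
    predictions = model([torch.rand(3, 480, 640)])
print(predictions[0].keys())  # dict_keys(['boxes', 'labels', 'scores'])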
import torch
import torchvision
from torch.jit.annotations import Tuple, List, Dict, Optional
import numpy as np
import cv2

import dataset

images, targets = dataset.load_data()
to_tensor = torchvision.transforms.ToTensor()
images = [to_tensor(image) for image in images]
targets = [{
    'boxes': item['boxes'],
    'labels': item['labels']
} for item in targets]

min_size = [800, 820, 900]
max_size = 1333

from torchvision.models.detection.transform import GeneralizedRCNNTransform

## transform
image_mean = [0.485, 0.456, 0.406]
image_std = [0.229, 0.224, 0.225]
transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
transform.train()
print(transform.training)

images, targets = transform(images, targets)
print(images.tensors.shape)
print(images.image_sizes)
print(len(targets))
print(targets[0])
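# Hedged note on the list passed as min_size above: in torchvision's
# implementation, train mode samples one of (800, 820, 900) at random per
# forward pass, while eval mode always uses the last entry. A quick check:
transform.eval()
check, _ = transform([torch.rand(3, 600, 800)], None)
print(check.image_sizes)  # deterministic: the short side is scaled to 900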
def __init__(self,
             backbone,
             num_classes=None,
             # transform parameters
             min_size=800, max_size=1333,
             image_mean=None, image_std=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
             rpn_nms_thresh=0.7,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             # Box parameters
             box_roi_pool=None, box_head=None, box_predictor=None,
             box_score_thresh=0.05, box_nms_thresh=0.3,
             box_detections_per_img=128,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
             box_batch_size_per_image=64, box_positive_fraction=0.25,
             bbox_reg_weights=None):
    print("Using modified Faster RCNN....")
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                "num_classes should be None when box_predictor is specified")
    else:
        if box_predictor is None:
            raise ValueError(
                "num_classes should not be None when box_predictor "
                "is not specified")

    out_channels = backbone.out_channels

    if box_head is None:
        # note: box_roi_pool is expected to be supplied by the caller here;
        # this modified variant defines no default pooler
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution ** 2,
                              representation_size)
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(representation_size, num_classes)

    rpn = None
    roi_heads = RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super().__init__(backbone, rpn, roi_heads, transform)
import pickle
import random
import argparse
import numpy as np
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim

from models import get_model
from datasets import get_loader
from helper import compute_ts_road_map, compute_ats_bounding_boxes
from torchvision.models.detection.transform import GeneralizedRCNNTransform

target_transform = GeneralizedRCNNTransform(800, 800, [0., 0., 0.],
                                            [1., 1., 1.])


def get_mask_ts(mask, target):
    mask = nn.Sigmoid()(mask) > 0.5
    temp_tensor = torch.zeros(1, 3, 400, 400)
    temp_target = [{'masks': mask, 'boxes': torch.tensor([[1., 1., 1., 1.]])}]
    _, temp_target = target_transform(temp_tensor, temp_target)
    predicted_road_map = temp_target[0]['masks'][0, :1]
    ts_road_map = compute_ts_road_map(predicted_road_map, target[0]['masks'])
    return ts_road_map


def get_detection_ts(detection, target):
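# Minimal sketch (with a fresh transform and dummy data, both assumptions) of
# the behaviour get_mask_ts relies on: when a target dict contains "masks",
# GeneralizedRCNNTransform resizes them together with the image and boxes.
t = GeneralizedRCNNTransform(800, 800, [0., 0., 0.], [1., 1., 1.])
masks = torch.ones(1, 400, 400, dtype=torch.uint8)
tgt = [{'masks': masks, 'boxes': torch.tensor([[1., 1., 2., 2.]])}]
_, out = t([torch.zeros(3, 400, 400)], tgt)
print(out[0]['masks'].shape)  # masks follow the 400x400 -> 800x800 resize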
class YOLO(nn.Module):
    def __init__(
        self,
        backbone: nn.Module,
        num_classes: int,
        anchor_grids: List[List[int]],
        # transform parameters
        min_size: int = 320,
        max_size: int = 416,
        image_mean: Optional[List[float]] = None,
        image_std: Optional[List[float]] = None,
        # Anchor parameters
        anchor_generator: Optional[nn.Module] = None,
        head: Optional[nn.Module] = None,
        # Training parameter
        compute_loss: Optional[nn.Module] = None,
        fg_iou_thresh: float = 0.5,
        bg_iou_thresh: float = 0.4,
        # Post Process parameter
        postprocess_detections: Optional[nn.Module] = None,
        score_thresh: float = 0.05,
        nms_thresh: float = 0.5,
        detections_per_img: int = 300,
    ):
        super().__init__()
        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")
        self.backbone = backbone

        if anchor_generator is None:
            strides: List[int] = [8, 16, 32]
            anchor_generator = AnchorGenerator(strides, anchor_grids)
        self.anchor_generator = anchor_generator

        if compute_loss is None:
            compute_loss = SetCriterion(
                weights=(1.0, 1.0, 1.0, 1.0),
                fg_iou_thresh=fg_iou_thresh,
                bg_iou_thresh=bg_iou_thresh,
            )
        self.compute_loss = compute_loss

        if head is None:
            head = YoloHead(
                backbone.out_channels,
                anchor_generator.num_anchors,
                num_classes,
            )
        self.head = head

        if image_mean is None:
            image_mean = [0., 0., 0.]
        if image_std is None:
            image_std = [1., 1., 1.]
        self.transform = GeneralizedRCNNTransform(min_size, max_size,
                                                  image_mean, image_std)

        if postprocess_detections is None:
            postprocess_detections = PostProcess(score_thresh, nms_thresh,
                                                 detections_per_img)
        self.postprocess_detections = postprocess_detections

        # used only on torchscript mode
        self._has_warned = False

    @torch.jit.unused
    def eager_outputs(
        self,
        losses: Dict[str, Tensor],
        detections: List[Dict[str, Tensor]],
    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
        if self.training:
            return losses
        return detections

    def forward(
        self,
        images: List[Tensor],
        targets: Optional[List[Dict[str, Tensor]]] = None,
    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
        """
        Arguments:
            images (list[Tensor]): images to be processed
            targets (list[Dict[Tensor]]): ground-truth boxes present in the
                image (optional)

        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the
                losses (TODO: this repo does not currently support training).
                During testing, it returns a list[BoxList] that contains
                additional fields like `scores` and `labels`.
        """
        # get the original image sizes
        original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], [])
        for img in images:
            val = img.shape[-2:]
            assert len(val) == 2
            original_image_sizes.append((val[0], val[1]))

        # transform the input
        images, targets = self.transform(images, targets)

        # get the features from the backbone
        features = self.backbone(images.tensors)

        # compute the yolo heads outputs using the features
        head_outputs = self.head(features)

        # create the set of anchors
        anchors_tuple = self.anchor_generator(features)

        losses = {}
        detections = torch.jit.annotate(List[Dict[str, Tensor]], [])
        if self.training:
            assert targets is not None
            # compute the losses
            losses = self.compute_loss(targets, head_outputs, anchors_tuple[0])
        else:
            # compute the detections
            detections = self.postprocess_detections(head_outputs,
                                                     anchors_tuple,
                                                     images.image_sizes)
            detections = self.transform.postprocess(detections,
                                                    images.image_sizes,
                                                    original_image_sizes)

        if torch.jit.is_scripting():
            if not self._has_warned:
                warnings.warn(
                    "YOLO always returns a (Losses, Detections) tuple in scripting")
                self._has_warned = True
            return losses, detections
        else:
            return self.eager_outputs(losses, detections)