def test_validate_resnet_inputs_detection(self):
    """Check how ``_validate_trainable_layers`` resolves the layer count."""
    # Every call in this test uses the same upper bound and default.
    limits = dict(max_value=5, default_value=3)

    # Pretrained weights and no explicit request: the default is returned.
    resolved = backbone_utils._validate_trainable_layers(
        pretrained=True, trainable_backbone_layers=None, **limits)
    assert resolved == 3

    # Asking for more layers than ``max_value`` must be rejected.
    with pytest.raises(
            ValueError,
            match=r"Trainable backbone layers should be in the range"):
        backbone_utils._validate_trainable_layers(
            pretrained=True, trainable_backbone_layers=6, **limits)

    # Without pretrained weights the request is overridden: a warning is
    # emitted and all layers (``max_value``) are reported as trainable.
    with pytest.warns(UserWarning):
        resolved = backbone_utils._validate_trainable_layers(
            pretrained=False, trainable_backbone_layers=0, **limits)
    assert resolved == 5
def test_validate_resnet_inputs_detection(self):
    """Check how ``_validate_trainable_layers`` resolves the layer count."""
    # Every call in this test uses the same upper bound and default.
    limits = dict(max_value=5, default_value=3)

    # Pretrained weights and no explicit request: the default is returned.
    resolved = backbone_utils._validate_trainable_layers(
        pretrained=True, trainable_backbone_layers=None, **limits)
    self.assertEqual(resolved, 3)

    # Asking for more layers than ``max_value`` must fail with an
    # AssertionError.
    with self.assertRaises(AssertionError):
        backbone_utils._validate_trainable_layers(
            pretrained=True, trainable_backbone_layers=6, **limits)

    # Without pretrained weights the request is overridden: a warning is
    # emitted and all layers (``max_value``) are reported as trainable.
    with self.assertWarns(UserWarning):
        resolved = backbone_utils._validate_trainable_layers(
            pretrained=False, trainable_backbone_layers=0, **limits)
    self.assertEqual(resolved, 5)
def _yolov5_mobilenet_v3_small_fpn(
    weights_name: str,
    pretrained: bool = False,
    progress: bool = True,
    num_classes: int = 80,
    pretrained_backbone: bool = True,
    trainable_backbone_layers: Optional[int] = None,
    **kwargs,
):
    """Assemble a ``YOLO`` model on top of a ``mobilenet_v3_small`` backbone.

    Args:
        weights_name: key looked up in ``model_urls`` when ``pretrained`` is
            set.
        pretrained: if True, a state dict is fetched from
            ``model_urls[weights_name]`` and loaded into the model.
        progress: forwarded to ``load_state_dict_from_url``.
        num_classes: forwarded to both ``YOLOHead`` and ``YOLO``.
        pretrained_backbone: forwarded to ``mobilenet_backbone``; forced to
            ``False`` when ``pretrained`` is set.
        trainable_backbone_layers: checked by ``_validate_trainable_layers``
            (called with 6 and 3 as its trailing arguments) before being
            handed to ``mobilenet_backbone``.
        **kwargs: forwarded to the ``YOLO`` constructor.

    Returns:
        The constructed ``YOLO`` model.

    Raises:
        ValueError: if ``pretrained`` is set and ``model_urls`` has no entry
            for ``weights_name``.
    """
    num_trainable = _validate_trainable_layers(
        pretrained or pretrained_backbone, trainable_backbone_layers, 6, 3)

    # The full checkpoint is loaded further down when ``pretrained`` is set,
    # so separate backbone weights are not requested in that case.
    use_backbone_weights = False if pretrained else pretrained_backbone
    backbone = mobilenet_backbone(
        "mobilenet_v3_small",
        use_backbone_weights,
        trainable_layers=num_trainable,
    )

    # Four strides, each paired with one row of the anchor grid table.
    anchor_generator = AnchorGenerator(
        [8, 16, 32, 64],
        [
            [19, 27, 44, 40, 38, 94],
            [96, 68, 86, 152, 180, 137],
            [140, 301, 303, 264, 238, 542],
            [436, 615, 739, 380, 925, 792],
        ],
    )
    head = YOLOHead(
        backbone.out_channels,
        anchor_generator.num_anchors,
        anchor_generator.strides,
        num_classes,
    )
    model = YOLO(
        backbone,
        num_classes,
        anchor_generator=anchor_generator,
        head=head,
        **kwargs,
    )

    if not pretrained:
        return model

    checkpoint_url = model_urls.get(weights_name, None)
    if checkpoint_url is None:
        raise ValueError(
            f"No checkpoint is available for model {weights_name}")
    model.load_state_dict(
        load_state_dict_from_url(checkpoint_url, progress=progress))
    return model
def backbone1(self, pretrained_backbone, pretrained=False,
              trainable_backbone_layers=None):
    """Create the ``resnet18`` FPN backbone returned by ``resnet_fpn_backbone``.

    Args:
        pretrained_backbone: forwarded to ``resnet_fpn_backbone``; forced to
            ``False`` when ``pretrained`` is set.
        pretrained: when truthy, backbone weights are not requested.
        trainable_backbone_layers: checked by ``_validate_trainable_layers``
            (called with 5 and 3 as its trailing arguments) and passed on as
            ``trainable_layers``.

    Returns:
        The backbone built with ``returned_layers=[2, 3, 4]`` and a
        ``LastLevelP6P7(256, 256)`` extra block.
    """
    num_trainable = _validate_trainable_layers(
        pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)

    # no need to download the backbone if pretrained is set
    use_backbone_weights = False if pretrained else pretrained_backbone

    # skip P2 because it generates too many anchors (according to their paper)
    return resnet_fpn_backbone(
        'resnet18',
        use_backbone_weights,
        returned_layers=[2, 3, 4],
        extra_blocks=LastLevelP6P7(256, 256),
        trainable_layers=num_trainable,
    )
def my_fasterrcnn_resnet50_fpn(pretrained=False,
                               progress=True,
                               num_classes=21,
                               pretrained_backbone=True,
                               trainable_backbone_layers=None,
                               **kwargs):
    """Build a ``FasterRCNN`` whose backbone wraps a fresh ``SimCLRModel``.

    No detector checkpoint is loaded here: ``pretrained`` only takes part in
    the trainable-layer validation and ``progress`` is accepted but unused.

    Args:
        pretrained: combined with ``pretrained_backbone`` for the call to
            ``_validate_trainable_layers``.
        progress: unused in this function.
        num_classes: forwarded to ``FasterRCNN``.
        pretrained_backbone: forwarded to ``my_resnet_fpn_backbone``.
        trainable_backbone_layers: checked by ``_validate_trainable_layers``
            (called with 5 and 3 as its trailing arguments) and passed on as
            ``trainable_layers``.
        **kwargs: forwarded to the ``FasterRCNN`` constructor.

    Returns:
        The constructed ``FasterRCNN`` model.
    """
    num_trainable = _validate_trainable_layers(
        pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)

    fpn_backbone = my_resnet_fpn_backbone(
        SimCLRModel(),
        pretrained_backbone,
        trainable_layers=num_trainable,
    )
    # The channel count is set explicitly on the backbone before it is
    # handed to FasterRCNN.
    fpn_backbone.out_channels = 256

    return FasterRCNN(fpn_backbone, num_classes, **kwargs)
def _fasterrcnn_mobilenet_v3_large_fpn(weights_name,
                                       pretrained=False,
                                       progress=True,
                                       num_classes=91,
                                       pretrained_backbone=True,
                                       trainable_backbone_layers=None,
                                       **kwargs):
    """Assemble a ``FasterRCNN`` on a ``mobilenet_v3_large`` backbone.

    Args:
        weights_name: key looked up in ``model_urls`` when ``pretrained`` is
            set.
        pretrained: if True, a state dict is fetched from
            ``model_urls[weights_name]`` and loaded into the model.
        progress: forwarded to ``load_state_dict_from_url``.
        num_classes: forwarded to ``FasterRCNN``.
        pretrained_backbone: forwarded to ``mobilenet_backbone``; forced to
            ``False`` when ``pretrained`` is set.
        trainable_backbone_layers: checked by ``_validate_trainable_layers``
            (called with 6 and 3 as its trailing arguments) and passed on as
            ``trainable_layers``.
        **kwargs: forwarded to the ``FasterRCNN`` constructor.

    Returns:
        The constructed ``FasterRCNN`` model.

    Raises:
        ValueError: if ``pretrained`` is set and ``model_urls`` has no entry
            for ``weights_name``.
    """
    num_trainable = _validate_trainable_layers(
        pretrained or pretrained_backbone, trainable_backbone_layers, 6, 3)

    # The full checkpoint is loaded further down when ``pretrained`` is set,
    # so separate backbone weights are not requested in that case.
    use_backbone_weights = False if pretrained else pretrained_backbone
    # NOTE(review): the third positional ``True`` is presumably the FPN
    # switch of ``mobilenet_backbone`` -- confirm against its signature.
    backbone = mobilenet_backbone(
        "mobilenet_v3_large",
        use_backbone_weights,
        True,
        trainable_layers=num_trainable,
    )

    # The same sizes and aspect ratios are repeated for each of three levels.
    num_levels = 3
    anchor_sizes = ((32, 64, 128, 256, 512),) * num_levels
    aspect_ratios = ((0.5, 1.0, 2.0),) * num_levels
    rpn_anchors = AnchorGenerator(anchor_sizes, aspect_ratios)

    model = FasterRCNN(
        backbone,
        num_classes,
        rpn_anchor_generator=rpn_anchors,
        **kwargs,
    )

    if not pretrained:
        return model

    checkpoint_url = model_urls.get(weights_name, None)
    if checkpoint_url is None:
        raise ValueError(
            "No checkpoint is available for model {}".format(weights_name))
    model.load_state_dict(
        load_state_dict_from_url(checkpoint_url, progress=progress))
    return model
def retinanet_resnet50_fpn(pretrained=False,
                           progress=True,
                           num_classes=91,
                           pretrained_backbone=True,
                           trainable_backbone_layers=None,
                           **kwargs):
    """
    Build a RetinaNet detector on a ResNet-50-FPN backbone.

    Reference: `"Focal Loss for Dense Object Detection"
    <https://arxiv.org/abs/1708.02002>`_.

    Inputs are a list of tensors, one per image, each shaped ``[C, H, W]``
    with values in the ``0-1`` range. Images in the list may differ in size.

    The model behaves differently in training and evaluation mode.

    In training mode it needs the input tensors plus ``targets``, a list of
    dictionaries holding:

        - boxes (``FloatTensor[N, 4]``): ground-truth boxes in
          ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
          ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the class label of each ground-truth
          box

    and it returns a ``Dict[Tensor]`` with the classification and regression
    losses.

    In inference mode only the input tensors are needed. The result is the
    post-processed predictions as a ``List[Dict[Tensor]]``, one entry per
    input image. With ``N`` the number of detections, each ``Dict`` has:

        - boxes (``FloatTensor[N, 4]``): predicted boxes in
          ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
          ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the predicted label of each detection
        - scores (``Tensor[N]``): the score of each detection

    See :ref:`instance_seg_output` for more details on the output.

    Example::

        >>> model = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True)
        >>> model.eval()
        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
        >>> predictions = model(x)

    Args:
        pretrained (bool): If True, returns a model pre-trained on COCO
            train2017
        progress (bool): If True, displays a progress bar of the download to
            stderr
        num_classes (int): number of output classes of the model (including
            the background)
        pretrained_backbone (bool): If True, returns a model with backbone
            pre-trained on Imagenet. Ignored when ``pretrained`` is True.
        trainable_backbone_layers (int): number of trainable (not frozen)
            resnet layers starting from final block. Valid values are
            between 0 and 5, with 5 meaning all backbone layers are
            trainable. The value is checked by
            ``_validate_trainable_layers`` before use.
    """
    num_trainable = _validate_trainable_layers(
        pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)

    # no need to download the backbone if pretrained is set
    use_backbone_weights = False if pretrained else pretrained_backbone

    # skip P2 because it generates too many anchors (according to their paper)
    fpn_backbone = resnet_fpn_backbone(
        'resnet50',
        use_backbone_weights,
        returned_layers=[2, 3, 4],
        extra_blocks=LastLevelP6P7(256, 256),
        trainable_layers=num_trainable,
    )
    detector = RetinaNet(fpn_backbone, num_classes, **kwargs)

    if not pretrained:
        return detector

    checkpoint = load_state_dict_from_url(
        model_urls['retinanet_resnet50_fpn_coco'], progress=progress)
    detector.load_state_dict(checkpoint)
    # Applied only after the COCO checkpoint has been loaded.
    overwrite_eps(detector, 0.0)
    return detector
def get_model(config):
    """Build the network selected by ``config.model_name``.

    Supported names and what is constructed for each:

    - ``"resnet"``: ``models.resnet18`` with ``fc`` replaced.
    - ``"alexnet"``: ``models.alexnet`` with the last ``classifier`` entry
      replaced.
    - ``"vgg"``: ``models.vgg16_bn`` with the last ``classifier`` entry
      replaced.
    - ``"densenet"``: ``models.densenet121`` with ``classifier`` replaced.
    - ``"mobilenet"``: ``models.mobilenet_v2`` with the last ``classifier``
      entry replaced.
    - ``"KeypointRCNN"``: ``KeypointRCNN`` on ``mobilenet_v2`` features.
    - ``"keypointrcnn_resnet50"``:
      ``models.detection.keypointrcnn_resnet50_fpn`` with its
      ``kps_score_lowres`` layer replaced by a 24-channel
      ``ConvTranspose2d``.
    - ``"keypointrcnn_resnet101"``: ``KeypointRCNN`` on
      ``resnet_fpn_backbone('resnet101', ...)``.

    For the five classification names the model is created with
    ``pretrained=config.use_pretrained``, passed to
    ``set_parameter_requires_grad(model, config.freeze)``, and then given a
    new ``nn.Linear`` output layer with ``config.n_classes`` outputs.

    Args:
        config: object exposing ``model_name`` and, depending on the branch,
            ``use_pretrained``, ``freeze`` and ``n_classes``.

    Returns:
        The constructed model.

    Raises:
        NotImplementedError: if ``config.model_name`` is none of the names
            listed above.
    """
    name = config.model_name

    if name == "resnet":
        # ResNet-18.
        model = models.resnet18(pretrained=config.use_pretrained)
        set_parameter_requires_grad(model, config.freeze)
        n_features = model.fc.in_features
        model.fc = nn.Linear(n_features, config.n_classes)

    elif name == "alexnet":
        # AlexNet.
        model = models.alexnet(pretrained=config.use_pretrained)
        set_parameter_requires_grad(model, config.freeze)
        n_features = model.classifier[-1].in_features
        model.classifier[-1] = nn.Linear(n_features, config.n_classes)

    elif name == "vgg":
        # VGG-16 with batch norm.
        model = models.vgg16_bn(pretrained=config.use_pretrained)
        set_parameter_requires_grad(model, config.freeze)
        n_features = model.classifier[-1].in_features
        model.classifier[-1] = nn.Linear(n_features, config.n_classes)

    elif name == "densenet":
        # DenseNet-121.
        model = models.densenet121(pretrained=config.use_pretrained)
        set_parameter_requires_grad(model, config.freeze)
        n_features = model.classifier.in_features
        model.classifier = nn.Linear(n_features, config.n_classes)

    elif name == 'mobilenet':
        # MobileNetV2.
        model = models.mobilenet_v2(pretrained=config.use_pretrained)
        set_parameter_requires_grad(model, config.freeze)
        n_features = model.classifier[-1].in_features
        model.classifier[-1] = nn.Linear(n_features, config.n_classes)

    elif name == "KeypointRCNN":
        # NOTE(review): this branch always requests pretrained MobileNetV2
        # weights and does not read config.use_pretrained, config.freeze or
        # config.n_classes -- confirm that this is intended.
        backbone = models.mobilenet_v2(pretrained=True).features
        backbone.out_channels = 1280
        roi_pooler = MultiScaleRoIAlign(
            featmap_names=['0'], output_size=7, sampling_ratio=2
        )
        anchor_generator = AnchorGenerator(
            sizes=((32, 64, 128, 256, 512),),
            aspect_ratios=((0.5, 1.0, 2.0),),
        )
        keypoint_roi_pooler = MultiScaleRoIAlign(
            featmap_names=['0'], output_size=14, sampling_ratio=2
        )
        model = KeypointRCNN(
            backbone,
            num_classes=2,
            num_keypoints=24,
            box_roi_pool=roi_pooler,
            keypoint_roi_pool=keypoint_roi_pooler,
            rpn_anchor_generator=anchor_generator,
        )

    elif name == "keypointrcnn_resnet50":
        model = models.detection.keypointrcnn_resnet50_fpn(
            pretrained=config.use_pretrained, progress=False)
        # Replace the low-resolution keypoint score layer with one that has
        # 24 output channels, the same keypoint count used by the other
        # KeypointRCNN branches.
        model.roi_heads.keypoint_predictor.kps_score_lowres = nn.ConvTranspose2d(
            512, 24, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))

    elif name == "keypointrcnn_resnet101":
        # Fixed settings for this variant: backbone weights are requested,
        # no full-detector checkpoint is loaded, and the trainable layer
        # count is left to _validate_trainable_layers.
        pretrained_backbone = True
        pretrained = False
        trainable_backbone_layers = _validate_trainable_layers(
            pretrained or pretrained_backbone, None, 5, 3)
        backbone = resnet_fpn_backbone(
            'resnet101',
            pretrained_backbone,
            trainable_layers=trainable_backbone_layers,
        )
        model = KeypointRCNN(backbone, num_classes=2, num_keypoints=24)

    else:
        raise NotImplementedError('You need to specify model name.')

    return model
def fasterrcnn_resnet50_fpn(pretrained=False,
                            progress=True,
                            num_classes=91,
                            pretrained_backbone=True,
                            trainable_backbone_layers=None,
                            **kwargs):
    """
    Build a Faster R-CNN detector on a ResNet-50-FPN backbone.

    Reference: `"Faster R-CNN: Towards Real-Time Object Detection with Region
    Proposal Networks" <https://arxiv.org/abs/1506.01497>`_.

    Inputs are a list of tensors, one per image, each shaped ``[C, H, W]``
    with values in the ``0-1`` range. Images in the list may differ in size.

    The model behaves differently in training and evaluation mode.

    In training mode it needs the input tensors plus ``targets``, a list of
    dictionaries holding:

        - boxes (``FloatTensor[N, 4]``): ground-truth boxes in
          ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
          ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the class label of each ground-truth
          box

    and it returns a ``Dict[Tensor]`` with the classification and regression
    losses for both the RPN and the R-CNN.

    In inference mode only the input tensors are needed. The result is the
    post-processed predictions as a ``List[Dict[Tensor]]``, one entry per
    input image. With ``N`` the number of detections, each ``Dict`` has:

        - boxes (``FloatTensor[N, 4]``): predicted boxes in
          ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
          ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the predicted label of each detection
        - scores (``Tensor[N]``): the score of each detection

    See :ref:`instance_seg_output` for more details on the output.

    Faster R-CNN can be exported to ONNX for a fixed batch size with input
    images of fixed size.

    Example::

        >>> model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        >>> # For training
        >>> images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)
        >>> labels = torch.randint(1, 91, (4, 11))
        >>> images = list(image for image in images)
        >>> targets = []
        >>> for i in range(len(images)):
        >>>     d = {}
        >>>     d['boxes'] = boxes[i]
        >>>     d['labels'] = labels[i]
        >>>     targets.append(d)
        >>> output = model(images, targets)
        >>> # For inference
        >>> model.eval()
        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
        >>> predictions = model(x)
        >>>
        >>> # optionally, if you want to export the model to ONNX:
        >>> torch.onnx.export(model, x, "faster_rcnn.onnx", opset_version = 11)

    Args:
        pretrained (bool): If True, returns a model pre-trained on COCO
            train2017
        progress (bool): If True, displays a progress bar of the download to
            stderr
        num_classes (int): number of output classes of the model (including
            the background)
        pretrained_backbone (bool): If True, returns a model with backbone
            pre-trained on Imagenet. Ignored when ``pretrained`` is True.
        trainable_backbone_layers (int): number of trainable (not frozen)
            resnet layers starting from final block. Valid values are
            between 0 and 5, with 5 meaning all backbone layers are
            trainable. The value is checked by
            ``_validate_trainable_layers`` before use.
    """
    num_trainable = _validate_trainable_layers(
        pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)

    # no need to download the backbone if pretrained is set
    use_backbone_weights = False if pretrained else pretrained_backbone

    fpn_backbone = resnet_fpn_backbone(
        'resnet50', use_backbone_weights, trainable_layers=num_trainable)
    detector = FasterRCNN(fpn_backbone, num_classes, **kwargs)

    if not pretrained:
        return detector

    checkpoint = load_state_dict_from_url(
        model_urls['fasterrcnn_resnet50_fpn_coco'], progress=progress)
    detector.load_state_dict(checkpoint)
    # Applied only after the COCO checkpoint has been loaded.
    overwrite_eps(detector, 0.0)
    return detector
def maskrcnn_resnet50_fpn(pretrained=False,
                          progress=True,
                          num_classes=91,
                          pretrained_backbone=True,
                          trainable_backbone_layers=None,
                          **kwargs):
    """
    Build a Mask R-CNN model on a ResNet-50-FPN backbone.

    Reference: `"Mask R-CNN" <https://arxiv.org/abs/1703.06870>`_.

    Inputs are a list of tensors, one per image, each shaped ``[C, H, W]``
    with values in the ``0-1`` range. Images in the list may differ in size.

    The model behaves differently in training and evaluation mode.

    In training mode it needs the input tensors plus ``targets``, a list of
    dictionaries holding:

        - boxes (``FloatTensor[N, 4]``): ground-truth boxes in
          ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
          ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the class label of each ground-truth
          box
        - masks (``UInt8Tensor[N, H, W]``): the binary segmentation mask of
          each instance

    and it returns a ``Dict[Tensor]`` with the classification and regression
    losses for both the RPN and the R-CNN, and the mask loss.

    In inference mode only the input tensors are needed. The result is the
    post-processed predictions as a ``List[Dict[Tensor]]``, one entry per
    input image. With ``N`` the number of detected instances, each ``Dict``
    has:

        - boxes (``FloatTensor[N, 4]``): predicted boxes in
          ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and
          ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the predicted label of each instance
        - scores (``Tensor[N]``): the score of each instance
        - masks (``UInt8Tensor[N, 1, H, W]``): the predicted mask of each
          instance, in ``0-1`` range. To obtain the final segmentation
          masks, the soft masks can be thresholded, generally with a value
          of 0.5 (``mask >= 0.5``)

    See :ref:`instance_seg_output` for more details on the output and on how
    to plot the masks.

    Mask R-CNN can be exported to ONNX for a fixed batch size with input
    images of fixed size.

    Example::

        >>> model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
        >>> model.eval()
        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
        >>> predictions = model(x)
        >>>
        >>> # optionally, if you want to export the model to ONNX:
        >>> torch.onnx.export(model, x, "mask_rcnn.onnx", opset_version = 11)

    Args:
        pretrained (bool): If True, returns a model pre-trained on COCO
            train2017
        progress (bool): If True, displays a progress bar of the download to
            stderr
        num_classes (int): number of output classes of the model (including
            the background)
        pretrained_backbone (bool): If True, returns a model with backbone
            pre-trained on Imagenet. Ignored when ``pretrained`` is True.
        trainable_backbone_layers (int): number of trainable (not frozen)
            resnet layers starting from final block. Valid values are
            between 0 and 5, with 5 meaning all backbone layers are
            trainable. The value is checked by
            ``_validate_trainable_layers`` before use.
    """
    num_trainable = _validate_trainable_layers(
        pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)

    # no need to download the backbone if pretrained is set
    use_backbone_weights = False if pretrained else pretrained_backbone

    fpn_backbone = resnet_fpn_backbone(
        'resnet50', use_backbone_weights, trainable_layers=num_trainable)
    segmenter = MaskRCNN(fpn_backbone, num_classes, **kwargs)

    if not pretrained:
        return segmenter

    checkpoint = load_state_dict_from_url(
        model_urls['maskrcnn_resnet50_fpn_coco'], progress=progress)
    segmenter.load_state_dict(checkpoint)
    # Applied only after the COCO checkpoint has been loaded.
    overwrite_eps(segmenter, 0.0)
    return segmenter