def test_compat_runner_args():
    cfg = ConfigDict(dict(total_epochs=12))
    with pytest.warns(None) as record:
        cfg = compat_runner_args(cfg)
    assert len(record) == 1
    assert 'runner' in record.list[0].message.args[0]
    assert 'runner' in cfg
    assert cfg.runner.type == 'EpochBasedRunner'
    assert cfg.runner.max_epochs == cfg.total_epochs
def __init__(self,
             backbone,
             refiner=None,
             train_cfg=None,
             test_cfg=None,
             norm_cfg=None,
             pretrained=None):
    super().__init__()
    self.train_cfg = train_cfg if train_cfg is not None else ConfigDict()
    self.test_cfg = test_cfg if test_cfg is not None else ConfigDict()
    self.norm_cfg = norm_cfg if norm_cfg is not None else ConfigDict()

    self.backbone = build_backbone(backbone)
    # build the refiner only if it is specified
    if refiner is None:
        self.train_cfg['train_refiner'] = False
        self.test_cfg['refine'] = False
    else:
        self.refiner = build_component(refiner)

    # if train_cfg is not None, validate that the config is proper
    if train_cfg is not None:
        assert hasattr(self.train_cfg, 'train_refiner')
        assert hasattr(self.test_cfg, 'refine')
        if self.test_cfg.refine and not self.train_cfg.train_refiner:
            print_log(
                'You are not training the refiner, but it is used for '
                'model forwarding.', 'root', logging.WARNING)
        if not self.train_cfg.train_backbone:
            self.freeze_backbone()

    # validate that the test config is proper
    if not hasattr(self.test_cfg, 'metrics'):
        raise KeyError('Missing key "metrics" in test_cfg')
    if mmcv.is_list_of(self.test_cfg.metrics, str):
        for metric in self.test_cfg.metrics:
            if metric not in self.allowed_metrics:
                raise KeyError(f'metric {metric} is not supported')
    elif self.test_cfg.metrics is not None:
        raise TypeError('metrics must be None or a list of str')

    self.init_weights(pretrained)
def test_inference_detector():
    from mmcv import ConfigDict
    from mmdet.apis import inference_detector
    from mmdet.models import build_detector

    # small RetinaNet
    num_class = 3
    model_dict = dict(
        type='RetinaNet',
        backbone=dict(
            type='ResNet', depth=18, num_stages=4, out_indices=(3, ),
            norm_cfg=dict(type='BN', requires_grad=False),
            norm_eval=True, style='pytorch'),
        neck=None,
        bbox_head=dict(
            type='RetinaHead',
            num_classes=num_class,
            in_channels=512,
            stacked_convs=1,
            feat_channels=256,
            anchor_generator=dict(
                type='AnchorGenerator', octave_base_scale=4,
                scales_per_octave=3, ratios=[0.5], strides=[32]),
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[.0, .0, .0, .0],
                target_stds=[1.0, 1.0, 1.0, 1.0]),
        ),
        test_cfg=dict(
            nms_pre=1000, min_bbox_size=0, score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))

    rng = np.random.RandomState(0)
    img1 = rng.rand(100, 100, 3)
    img2 = rng.rand(100, 100, 3)

    model = build_detector(ConfigDict(model_dict))
    config = _get_config_module('retinanet/retinanet_r50_fpn_1x_coco.py')
    model.cfg = config

    # test a single image
    result = inference_detector(model, img1)
    assert len(result) == num_class

    # test multiple images
    result = inference_detector(model, [img1, img2])
    assert len(result) == 2 and len(result[0]) == num_class
def test_centernet_head_get_bboxes():
    """Tests center head generating and decoding the heatmap."""
    s = 256
    img_metas = [{
        'img_shape': (s, s, 3),
        'scale_factor': np.array([1., 1., 1., 1.]),
        'pad_shape': (s, s, 3),
        'batch_input_shape': (s, s),
        'border': (0, 0, 0, 0),
        'flip': False
    }]
    test_cfg = ConfigDict(
        dict(topk=100, local_maximum_kernel=3, max_per_img=100))
    gt_bboxes = [
        torch.Tensor([[10, 20, 200, 240], [40, 50, 100, 200],
                      [10, 20, 100, 240]])
    ]
    gt_labels = [torch.LongTensor([1, 1, 2])]

    self = CenterNetHead(
        num_classes=4, in_channel=1, feat_channel=4, test_cfg=test_cfg)
    self.feat_shape = (1, 1, s // 4, s // 4)
    targets, _ = self.get_targets(gt_bboxes, gt_labels, self.feat_shape,
                                  img_metas[0]['pad_shape'])
    center_target = targets['center_heatmap_target']
    wh_target = targets['wh_target']
    offset_target = targets['offset_target']

    # make sure the targets are assigned correctly
    for i in range(len(gt_bboxes[0])):
        bbox, label = gt_bboxes[0][i] / 4, gt_labels[0][i]
        ctx, cty = sum(bbox[0::2]) / 2, sum(bbox[1::2]) / 2
        int_ctx, int_cty = int(sum(bbox[0::2]) / 2), int(sum(bbox[1::2]) / 2)
        w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]
        x_off = ctx - int(ctx)
        y_off = cty - int(cty)
        assert center_target[0, label, int_cty, int_ctx] == 1
        assert wh_target[0, 0, int_cty, int_ctx] == w
        assert wh_target[0, 1, int_cty, int_ctx] == h
        assert offset_target[0, 0, int_cty, int_ctx] == x_off
        assert offset_target[0, 1, int_cty, int_ctx] == y_off

    # make sure get_bboxes is right
    detections = self.get_bboxes([center_target], [wh_target],
                                 [offset_target],
                                 img_metas,
                                 rescale=True,
                                 with_nms=False)
    out_bboxes = detections[0][0][:3]
    out_clses = detections[0][1][:3]
    for bbox, cls in zip(out_bboxes, out_clses):
        flag = False
        for gt_bbox, gt_cls in zip(gt_bboxes[0], gt_labels[0]):
            if (bbox[:4] == gt_bbox[:4]).all():
                flag = True
        assert flag, 'get_bboxes is wrong'
def test_compat_imgs_per_gpu():
    cfg = ConfigDict(
        dict(
            data=dict(
                imgs_per_gpu=1,
                samples_per_gpu=2,
                val=dict(),
                test=dict(),
                train=dict())))
    cfg = compat_imgs_per_gpu(cfg)
    assert cfg.data.samples_per_gpu == cfg.data.imgs_per_gpu
def test_transformer_encoder_pixel_decoder():
    base_channels = 64
    pixel_decoder_cfg = ConfigDict(
        dict(
            type='TransformerEncoderPixelDecoder',
            in_channels=[base_channels * 2**i for i in range(4)],
            feat_channels=base_channels,
            out_channels=base_channels,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiheadAttention',
                        embed_dims=base_channels, num_heads=8,
                        attn_drop=0.1, proj_drop=0.1,
                        dropout_layer=None, batch_first=False),
                    ffn_cfgs=dict(
                        embed_dims=base_channels,
                        feedforward_channels=base_channels * 8,
                        num_fcs=2,
                        act_cfg=dict(type='ReLU', inplace=True),
                        ffn_drop=0.1, dropout_layer=None,
                        add_identity=True),
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'),
                    norm_cfg=dict(type='LN'),
                    init_cfg=None,
                    batch_first=False),
                init_cfg=None),
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=base_channels // 2,
                normalize=True)))
    self = build_plugin_layer(pixel_decoder_cfg)[1]
    img_metas = [{
        'batch_input_shape': (128, 160),
        'img_shape': (120, 160, 3),
    }, {
        'batch_input_shape': (128, 160),
        'img_shape': (125, 160, 3),
    }]
    feats = [
        torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
        for i in range(4)
    ]
    mask_feature, memory = self(feats, img_metas)

    assert memory.shape[-2:] == feats[-1].shape[-2:]
    assert mask_feature.shape == feats[0].shape
def compat_runner_args(cfg):
    if 'runner' not in cfg:
        cfg.runner = ConfigDict({
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        })
        warnings.warn(
            'config is now expected to have a `runner` section, '
            'please set `runner` in your config.', UserWarning)
    else:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs
    return cfg
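# A minimal usage sketch (hedged; assumes mmcv's ConfigDict and the function
# above — the helper name `_example_compat_runner_args` is hypothetical): an
# old-style config carrying only `total_epochs` is migrated to an equivalent
# `runner` section, while a new-style config passes through unchanged.
def _example_compat_runner_args():
    old_cfg = ConfigDict(dict(total_epochs=12))
    new_cfg = compat_runner_args(old_cfg)  # emits a UserWarning
    assert new_cfg.runner.type == 'EpochBasedRunner'
    assert new_cfg.runner.max_epochs == 12

    # already migrated: no warning, the consistency assert must hold
    cfg = ConfigDict(
        dict(total_epochs=12,
             runner=dict(type='EpochBasedRunner', max_epochs=12)))
    compat_runner_args(cfg)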
def test_msdeformattn_pixel_decoder():
    base_channels = 64
    pixel_decoder_cfg = ConfigDict(
        dict(
            type='MSDeformAttnPixelDecoder',
            in_channels=[base_channels * 2**i for i in range(4)],
            strides=[4, 8, 16, 32],
            feat_channels=base_channels,
            out_channels=base_channels,
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=base_channels, num_heads=8,
                        num_levels=3, num_points=4, im2col_step=64,
                        dropout=0.0, batch_first=False,
                        norm_cfg=None, init_cfg=None),
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=base_channels,
                        feedforward_channels=base_channels * 4,
                        num_fcs=2, ffn_drop=0.0,
                        act_cfg=dict(type='ReLU', inplace=True)),
                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
                init_cfg=None),
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=base_channels // 2,
                normalize=True),
            init_cfg=None))
    self = build_plugin_layer(pixel_decoder_cfg)[1]
    feats = [
        torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
        for i in range(4)
    ]
    mask_feature, multi_scale_features = self(feats)

    assert mask_feature.shape == feats[0].shape
    assert len(multi_scale_features) == 3
    multi_scale_features = multi_scale_features[::-1]
    for i in range(3):
        assert multi_scale_features[i].shape[-2:] == feats[i + 1].shape[-2:]
def test_image_classifier_return_tuple():
    model_cfg = ConfigDict(
        type='ImageClassifier',
        backbone=dict(
            type='ResNet_CIFAR', depth=50, num_stages=4,
            out_indices=(3, ), style='pytorch', return_tuple=False),
        head=dict(
            type='LinearClsHead', num_classes=10, in_channels=2048,
            loss=dict(type='CrossEntropyLoss')))
    imgs = torch.randn(16, 3, 32, 32)

    model_cfg_ = deepcopy(model_cfg)
    with pytest.warns(DeprecationWarning):
        model = CLASSIFIERS.build(model_cfg_)

    # test backbone return tensor
    feat = model.extract_feat(imgs)
    assert isinstance(feat, torch.Tensor)

    # test backbone return tuple
    model_cfg_ = deepcopy(model_cfg)
    model_cfg_.backbone.return_tuple = True
    model = CLASSIFIERS.build(model_cfg_)
    feat = model.extract_feat(imgs)
    assert isinstance(feat, tuple)

    # test warning if backbone return tensor
    class ToyBackbone(BaseModule):

        def __init__(self):
            super().__init__()
            self.conv = torch.nn.Conv2d(3, 16, 3)

        def forward(self, x):
            return self.conv(x)

    model_cfg_ = deepcopy(model_cfg)
    model_cfg_.backbone.return_tuple = True
    model = CLASSIFIERS.build(model_cfg_)
    model.backbone = ToyBackbone()
    with pytest.warns(DeprecationWarning):
        model.extract_feat(imgs)
def test_maskformer_fusion_head():
    img_metas = [
        {
            'batch_input_shape': (128, 160),
            'img_shape': (126, 160, 3),
            'ori_shape': (63, 80, 3),
            'pad_shape': (128, 160, 3)
        },
    ]
    num_things_classes = 80
    num_stuff_classes = 53
    num_classes = num_things_classes + num_stuff_classes
    config = ConfigDict(
        type='MaskFormerFusionHead',
        num_things_classes=num_things_classes,
        num_stuff_classes=num_stuff_classes,
        loss_panoptic=None,
        test_cfg=dict(
            panoptic_on=True,
            semantic_on=False,
            instance_on=True,
            max_per_image=100,
            object_mask_thr=0.8,
            iou_thr=0.8,
            filter_low_score=False),
        init_cfg=None)
    self = MaskFormerFusionHead(**config)

    # test forward_train
    assert self.forward_train() == dict()

    mask_cls_results = torch.rand((1, 100, num_classes + 1))
    mask_pred_results = torch.rand((1, 100, 128, 160))

    # test panoptic_postprocess and instance_postprocess
    results = self.simple_test(mask_cls_results, mask_pred_results, img_metas)
    assert 'ins_results' in results[0] and 'pan_results' in results[0]

    # test semantic_postprocess
    config.test_cfg.semantic_on = True
    with pytest.raises(AssertionError):
        self.simple_test(mask_cls_results, mask_pred_results, img_metas)
    with pytest.raises(NotImplementedError):
        self.semantic_postprocess(mask_cls_results, mask_pred_results)
def test_pixel_decoder():
    base_channels = 64
    pixel_decoder_cfg = ConfigDict(
        dict(
            type='PixelDecoder',
            in_channels=[base_channels * 2**i for i in range(4)],
            feat_channels=base_channels,
            out_channels=base_channels,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU')))
    self = build_plugin_layer(pixel_decoder_cfg)[1]
    img_metas = [{}, {}]
    feats = [
        torch.rand((2, base_channels * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
        for i in range(4)
    ]
    mask_feature, memory = self(feats, img_metas)

    assert (memory == feats[-1]).all()
    assert mask_feature.shape == feats[0].shape
def test_maskformer_head_loss():
    """Tests head loss when truth is empty and non-empty."""
    base_channels = 64
    # batch_input_shape = (128, 160)
    img_metas = [{
        'batch_input_shape': (128, 160),
        'pad_shape': (128, 160, 3),
        'img_shape': (126, 160, 3),
        'ori_shape': (63, 80, 3)
    }, {
        'batch_input_shape': (128, 160),
        'pad_shape': (128, 160, 3),
        'img_shape': (120, 160, 3),
        'ori_shape': (60, 80, 3)
    }]
    feats = [
        torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i)))
        for i in range(4)
    ]
    num_things_classes = 80
    num_stuff_classes = 53
    num_classes = num_things_classes + num_stuff_classes
    config = ConfigDict(
        dict(
            type='MaskFormerHead',
            in_channels=[base_channels * 2**i for i in range(4)],
            feat_channels=base_channels,
            out_channels=base_channels,
            num_things_classes=num_things_classes,
            num_stuff_classes=num_stuff_classes,
            num_queries=100,
            pixel_decoder=dict(
                type='TransformerEncoderPixelDecoder',
                norm_cfg=dict(type='GN', num_groups=32),
                act_cfg=dict(type='ReLU'),
                encoder=dict(
                    type='DetrTransformerEncoder',
                    num_layers=6,
                    transformerlayers=dict(
                        type='BaseTransformerLayer',
                        attn_cfgs=dict(
                            type='MultiheadAttention',
                            embed_dims=base_channels, num_heads=8,
                            attn_drop=0.1, proj_drop=0.1,
                            dropout_layer=None, batch_first=False),
                        ffn_cfgs=dict(
                            embed_dims=base_channels,
                            feedforward_channels=base_channels * 8,
                            num_fcs=2,
                            act_cfg=dict(type='ReLU', inplace=True),
                            ffn_drop=0.1, dropout_layer=None,
                            add_identity=True),
                        operation_order=('self_attn', 'norm', 'ffn',
                                         'norm'),
                        norm_cfg=dict(type='LN'),
                        init_cfg=None,
                        batch_first=False),
                    init_cfg=None),
                positional_encoding=dict(
                    type='SinePositionalEncoding',
                    num_feats=base_channels // 2,
                    normalize=True)),
            enforce_decoder_input_project=False,
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=base_channels // 2,
                normalize=True),
            transformer_decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(
                        type='MultiheadAttention',
                        embed_dims=base_channels, num_heads=8,
                        attn_drop=0.1, proj_drop=0.1,
                        dropout_layer=None, batch_first=False),
                    ffn_cfgs=dict(
                        embed_dims=base_channels,
                        feedforward_channels=base_channels * 8,
                        num_fcs=2,
                        act_cfg=dict(type='ReLU', inplace=True),
                        ffn_drop=0.1, dropout_layer=None,
                        add_identity=True),
                    # the following parameter was not used,
                    # just make current api happy
                    feedforward_channels=base_channels * 8,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')),
                init_cfg=None),
            loss_cls=dict(
                type='CrossEntropyLoss',
                use_sigmoid=False,
                loss_weight=1.0,
                reduction='mean',
                class_weight=[1.0] * num_classes + [0.1]),
            loss_mask=dict(
                type='FocalLoss',
                use_sigmoid=True,
                gamma=2.0,
                alpha=0.25,
                reduction='mean',
                loss_weight=20.0),
            loss_dice=dict(
                type='DiceLoss',
                use_sigmoid=True,
                activate=True,
                reduction='mean',
                naive_dice=True,
                eps=1.0,
                loss_weight=1.0),
            train_cfg=dict(
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='ClassificationCost', weight=1.0),
                    mask_cost=dict(
                        type='FocalLossCost', weight=20.0,
                        binary_input=True),
                    dice_cost=dict(
                        type='DiceCost', weight=1.0, pred_act=True,
                        eps=1.0)),
                sampler=dict(type='MaskPseudoSampler')),
            test_cfg=dict(object_mask_thr=0.8, iou_thr=0.8)))
    self = MaskFormerHead(**config)
    self.init_weights()
    all_cls_scores, all_mask_preds = self.forward(feats, img_metas)

    # Test that empty ground truth encourages the network to predict
    # background.
    gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])]
    gt_masks_list = [
        torch.zeros((0, 128, 160)).long(),
        torch.zeros((0, 128, 160)).long()
    ]
    empty_gt_losses = self.loss(all_cls_scores, all_mask_preds,
                                gt_labels_list, gt_masks_list, img_metas)
    # When there is no truth, the cls loss should be nonzero but there
    # should be no mask or dice loss.
    for key, loss in empty_gt_losses.items():
        if 'cls' in key:
            assert loss.item() > 0, 'cls loss should be non-zero'
        elif 'mask' in key:
            assert loss.item() == 0, (
                'there should be no mask loss when there are no true masks')
        elif 'dice' in key:
            assert loss.item() == 0, (
                'there should be no dice loss when there are no true masks')

    # When truth is non-empty, the cls, mask and dice losses should all be
    # nonzero for random inputs.
    gt_labels_list = [
        torch.tensor([10, 100]).long(),
        torch.tensor([100, 10]).long()
    ]
    mask1 = torch.zeros((2, 128, 160)).long()
    mask1[0, :50] = 1
    mask1[1, 50:] = 1
    mask2 = torch.zeros((2, 128, 160)).long()
    mask2[0, :, :50] = 1
    mask2[1, :, 50:] = 1
    gt_masks_list = [mask1, mask2]
    two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list,
                              gt_masks_list, img_metas)
    for loss in two_gt_losses.values():
        assert loss.item() > 0, 'all losses should be non-zero'

    # test forward_train
    gt_bboxes = None
    gt_labels = [
        torch.tensor([10]).long(),
        torch.tensor([10]).long(),
    ]
    thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32)
    thing_mask1[0, :50] = 1
    thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32)
    thing_mask2[0, :, 50:] = 1
    gt_masks = [
        BitmapMasks(thing_mask1, 128, 160),
        BitmapMasks(thing_mask2, 128, 160),
    ]
    stuff_mask1 = torch.zeros((1, 128, 160)).long()
    stuff_mask1[0, :50] = 10
    stuff_mask1[0, 50:] = 100
    stuff_mask2 = torch.zeros((1, 128, 160)).long()
    stuff_mask2[0, :, 50:] = 10
    stuff_mask2[0, :, :50] = 100
    gt_semantic_seg = [stuff_mask1, stuff_mask2]

    self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks,
                       gt_semantic_seg)

    # test inference mode
    self.simple_test(feats, img_metas)
def test_encoder_decoder():
    # test 1 decode head, w.o. aux head
    cfg = ConfigDict(
        type='EncoderDecoder',
        backbone=dict(type='ExampleBackbone'),
        decode_head=dict(type='ExampleDecodeHead'),
        train_cfg=None,
        test_cfg=dict(mode='whole'))
    segmentor = build_segmentor(cfg)
    _segmentor_forward_train_test(segmentor)

    # test slide mode
    cfg.test_cfg = ConfigDict(mode='slide', crop_size=(3, 3), stride=(2, 2))
    segmentor = build_segmentor(cfg)
    _segmentor_forward_train_test(segmentor)

    # test 1 decode head, 1 aux head
    cfg = ConfigDict(
        type='EncoderDecoder',
        backbone=dict(type='ExampleBackbone'),
        decode_head=dict(type='ExampleDecodeHead'),
        auxiliary_head=dict(type='ExampleDecodeHead'))
    cfg.test_cfg = ConfigDict(mode='whole')
    segmentor = build_segmentor(cfg)
    _segmentor_forward_train_test(segmentor)

    # test 1 decode head, 2 aux heads
    cfg = ConfigDict(
        type='EncoderDecoder',
        backbone=dict(type='ExampleBackbone'),
        decode_head=dict(type='ExampleDecodeHead'),
        auxiliary_head=[
            dict(type='ExampleDecodeHead'),
            dict(type='ExampleDecodeHead')
        ])
    cfg.test_cfg = ConfigDict(mode='whole')
    segmentor = build_segmentor(cfg)
    _segmentor_forward_train_test(segmentor)
def __init__(self,
             attn_cfgs=None,
             ffn_cfgs=dict(
                 type='FFN',
                 embed_dims=256,
                 feedforward_channels=1024,
                 num_fcs=2,
                 ffn_drop=0.,
                 act_cfg=dict(type='ReLU', inplace=True),
             ),
             operation_order=None,
             norm_cfg=dict(type='LN'),
             init_cfg=None,
             batch_first=False,
             **kwargs):
    deprecated_args = dict(
        feedforward_channels='feedforward_channels',
        ffn_dropout='ffn_drop',
        ffn_num_fcs='num_fcs')
    for ori_name, new_name in deprecated_args.items():
        if ori_name in kwargs:
            warnings.warn(
                f'The argument `{ori_name}` in BaseTransformerLayer '
                f'has been deprecated; now you should set `{new_name}` '
                f'and other FFN related arguments '
                f'in a dict named `ffn_cfgs`. ')
            ffn_cfgs[new_name] = kwargs[ori_name]

    super(BaseTransformerLayer, self).__init__(init_cfg)

    self.batch_first = batch_first

    assert set(operation_order) & set(
        ['self_attn', 'norm', 'ffn', 'cross_attn']) == \
        set(operation_order), f'The operation_order of ' \
        f'{self.__class__.__name__} should only contain operations ' \
        f"among {['self_attn', 'norm', 'ffn', 'cross_attn']}"

    num_attn = operation_order.count('self_attn') + operation_order.count(
        'cross_attn')
    if isinstance(attn_cfgs, dict):
        attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
    else:
        assert num_attn == len(attn_cfgs), \
            f'The length of attn_cfgs {len(attn_cfgs)} is not consistent ' \
            f'with the number of attention modules in ' \
            f'operation_order {operation_order}.'

    self.num_attn = num_attn
    self.operation_order = operation_order
    self.norm_cfg = norm_cfg
    self.pre_norm = operation_order[0] == 'norm'
    self.attentions = ModuleList()

    index = 0
    for operation_name in operation_order:
        if operation_name in ['self_attn', 'cross_attn']:
            if 'batch_first' in attn_cfgs[index]:
                assert self.batch_first == attn_cfgs[index]['batch_first']
            else:
                attn_cfgs[index]['batch_first'] = self.batch_first
            attention = build_attention(attn_cfgs[index])
            # Some custom attentions used as `self_attn`
            # or `cross_attn` can have different behavior.
            attention.operation_name = operation_name
            self.attentions.append(attention)
            index += 1

    self.embed_dims = self.attentions[0].embed_dims

    self.ffns = ModuleList()
    num_ffns = operation_order.count('ffn')
    if isinstance(ffn_cfgs, dict):
        ffn_cfgs = ConfigDict(ffn_cfgs)
    if isinstance(ffn_cfgs, dict):
        ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
    assert len(ffn_cfgs) == num_ffns
    for ffn_index in range(num_ffns):
        if 'embed_dims' not in ffn_cfgs[ffn_index]:
            # inherit embed_dims from the attention modules when unset
            ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
        else:
            assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
        self.ffns.append(
            build_feedforward_network(ffn_cfgs[ffn_index],
                                      dict(type='FFN')))

    self.norms = ModuleList()
    num_norms = operation_order.count('norm')
    for _ in range(num_norms):
        self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
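# Construction sketch (hedged, hypothetical values; assumes the mmcv
# registries used above): a single `attn_cfgs` dict is deep-copied once per
# attention entry in `operation_order`, and each FFN inherits `embed_dims`
# from the attention when it is not set explicitly.
_layer = BaseTransformerLayer(
    attn_cfgs=dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
    ffn_cfgs=dict(type='FFN', feedforward_channels=1024),
    operation_order=('self_attn', 'norm', 'ffn', 'norm'))
assert _layer.num_attn == 1 and len(_layer.norms) == 2
assert _layer.ffns[0].embed_dims == 256  # inherited from the attention cfg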
        'assigner': {
            'type': 'MaxIoUAssigner',
            'pos_iou_thr': 0.7,
            'neg_iou_thr': 0.3,
            'min_pos_iou': 0.3,
            'match_low_quality': True,
            'ignore_iof_thr': -1
        },
        'sampler': {
            'type': 'RandomSampler',
            'num': 256,
            'pos_fraction': 0.5,
            'neg_pos_ub': -1,
            'add_gt_as_proposals': False
        },
        'allowed_border': -1,
        'pos_weight': -1,
        'debug': False
    },
    'test_cfg': {
        'nms_across_levels': False,
        'nms_pre': 1000,
        'nms_post': 1000,
        'max_num': 1000,
        'nms_thr': 0.7,
        'min_bbox_size': 0
    }
}
head_cfgs = ConfigDict(head_cfgs)
rpn_head = RPNHead(**head_cfgs)
def setup_class(cls):
    cls.data_prefix = osp.join(osp.dirname(osp.dirname(__file__)), 'data')
    cls.frame_ann_file = osp.join(cls.data_prefix, 'frame_test_list.txt')
    cls.frame_ann_file_with_offset = osp.join(
        cls.data_prefix, 'frame_test_list_with_offset.txt')
    cls.frame_ann_file_multi_label = osp.join(
        cls.data_prefix, 'frame_test_list_multi_label.txt')
    cls.video_ann_file = osp.join(cls.data_prefix, 'video_test_list.txt')
    cls.action_ann_file = osp.join(cls.data_prefix, 'action_test_anno.json')
    cls.proposal_ann_file = osp.join(cls.data_prefix,
                                     'proposal_test_list.txt')
    cls.proposal_norm_ann_file = osp.join(cls.data_prefix,
                                          'proposal_normalized_list.txt')

    cls.frame_pipeline = [
        dict(type='SampleFrames', clip_len=32, frame_interval=2,
             num_clips=1),
        dict(type='RawFrameDecode', io_backend='disk')
    ]
    cls.video_pipeline = [
        dict(type='OpenCVInit'),
        dict(type='SampleFrames', clip_len=32, frame_interval=2,
             num_clips=1),
        dict(type='OpenCVDecode')
    ]
    cls.action_pipeline = []
    cls.proposal_pipeline = [
        dict(type='SampleProposalFrames', clip_len=1, body_segments=5,
             aug_segments=(2, 2), aug_ratio=0.5),
        dict(type='FrameSelector', io_backend='disk')
    ]
    cls.proposal_test_pipeline = [
        dict(type='SampleProposalFrames', clip_len=1, body_segments=5,
             aug_segments=(2, 2), aug_ratio=0.5, mode='test'),
        dict(type='FrameSelector', io_backend='disk')
    ]

    cls.proposal_train_cfg = ConfigDict(
        dict(
            ssn=dict(
                assigner=dict(
                    positive_iou_threshold=0.7,
                    background_iou_threshold=0.01,
                    incomplete_iou_threshold=0.5,
                    background_coverage_threshold=0.02,
                    incomplete_overlap_threshold=0.01),
                sampler=dict(
                    num_per_video=8,
                    positive_ratio=1,
                    background_ratio=1,
                    incomplete_ratio=6,
                    add_gt_as_proposals=True),
                loss_weight=dict(
                    comp_loss_weight=0.1, reg_loss_weight=0.1),
                debug=False)))
    cls.proposal_test_cfg = ConfigDict(
        dict(
            ssn=dict(
                sampler=dict(test_interval=6, batch_size=16),
                evaluater=dict(
                    top_k=2000, nms=0.2,
                    softmax_before_filter=True, cls_top_k=2))))
    cls.proposal_test_cfg_topall = ConfigDict(
        dict(
            ssn=dict(
                sampler=dict(test_interval=6, batch_size=16),
                evaluater=dict(
                    top_k=-1, nms=0.2,
                    softmax_before_filter=True, cls_top_k=2))))
def merge_aug_proposals(aug_proposals, img_metas, cfg):
    """Merge augmented proposals (multiscale, flip, etc.).

    Args:
        aug_proposals (list[Tensor]): proposals from different testing
            schemes, shape (n, 5). Note that they are not rescaled to the
            original image size.
        img_metas (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For
            details on the values of these keys see
            `mmdet/datasets/pipelines/formatting.py:Collect`.
        cfg (dict): rpn test config.

    Returns:
        Tensor: shape (n, 5), proposals corresponding to original image
            scale.
    """
    cfg = copy.deepcopy(cfg)

    # deprecated arguments warning
    if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
        warnings.warn(
            'In rpn_proposal or test_cfg, '
            'nms_thr has been moved to a dict named nms as '
            'iou_threshold, max_num has been renamed as max_per_img; '
            'the original argument names and the old way to specify '
            'the iou_threshold of NMS will be deprecated.')
    if 'nms' not in cfg:
        cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
    if 'max_num' in cfg:
        if 'max_per_img' in cfg:
            assert cfg.max_num == cfg.max_per_img, f'You set max_num and ' \
                f'max_per_img at the same time, but got {cfg.max_num} ' \
                f'and {cfg.max_per_img} respectively. ' \
                f'Please delete max_num, which will be deprecated.'
        else:
            cfg.max_per_img = cfg.max_num
    if 'nms_thr' in cfg:
        assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \
            f'iou_threshold in nms and ' \
            f'nms_thr at the same time, but got ' \
            f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \
            f' respectively. Please delete nms_thr, ' \
            f'which will be deprecated.'

    recovered_proposals = []
    for proposals, img_info in zip(aug_proposals, img_metas):
        img_shape = img_info['img_shape']
        scale_factor = img_info['scale_factor']
        flip = img_info['flip']
        flip_direction = img_info['flip_direction']
        _proposals = proposals.clone()
        _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
                                              scale_factor, flip,
                                              flip_direction)
        recovered_proposals.append(_proposals)
    aug_proposals = torch.cat(recovered_proposals, dim=0)
    merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(),
                              aug_proposals[:, -1].contiguous(),
                              cfg.nms.iou_threshold)
    scores = merged_proposals[:, 4]
    _, order = scores.sort(0, descending=True)
    num = min(cfg.max_per_img, merged_proposals.shape[0])
    order = order[:num]
    merged_proposals = merged_proposals[order, :]
    return merged_proposals
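# Back-compat sketch for the legacy keys handled above (hedged, hypothetical
# values): a cfg still using the old `nms_thr`/`max_num` names is normalized
# to the new `nms` dict and `max_per_img` before being used. The same
# deprecation block recurs in the `_get_bboxes*` methods below.
_legacy_cfg = ConfigDict(dict(nms_thr=0.7, max_num=1000, min_bbox_size=0))
# After the deprecation block runs on a deep copy of this cfg:
#   cfg.nms == dict(type='nms', iou_threshold=0.7)
#   cfg.max_per_img == 1000
# and a UserWarning about the renamed arguments is emitted.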
def setup_class(cls):
    # prefix path
    cls.data_prefix = osp.normpath(
        osp.join(osp.dirname(__file__), '../../data'))
    cls.ann_file_prefix = osp.join(cls.data_prefix, 'annotations')

    # annotations path
    cls.action_ann_file = osp.join(cls.ann_file_prefix,
                                   'action_test_anno.json')
    cls.audio_feature_ann_file = osp.join(cls.ann_file_prefix,
                                          'audio_feature_test_list.txt')
    cls.audio_ann_file = osp.join(cls.ann_file_prefix, 'audio_test_list.txt')
    cls.frame_ann_file_multi_label = osp.join(
        cls.ann_file_prefix, 'rawframe_test_list_multi_label.txt')
    cls.frame_ann_file_with_offset = osp.join(
        cls.ann_file_prefix, 'rawframe_test_list_with_offset.txt')
    cls.frame_ann_file = osp.join(cls.ann_file_prefix,
                                  'rawframe_test_list.txt')
    cls.hvu_frame_ann_file = osp.join(cls.ann_file_prefix,
                                      'hvu_frame_test_anno.json')
    cls.hvu_video_ann_file = osp.join(cls.ann_file_prefix,
                                      'hvu_video_test_anno.json')
    cls.hvu_video_eval_ann_file = osp.join(
        cls.ann_file_prefix, 'hvu_video_eval_test_anno.json')
    cls.proposal_ann_file = osp.join(cls.ann_file_prefix,
                                     'proposal_test_list.txt')
    cls.proposal_norm_ann_file = osp.join(cls.ann_file_prefix,
                                          'proposal_normalized_list.txt')
    cls.rawvideo_test_anno_json = osp.join(cls.ann_file_prefix,
                                           'rawvideo_test_anno.json')
    cls.rawvideo_test_anno_txt = osp.join(cls.ann_file_prefix,
                                          'rawvideo_test_anno.txt')
    cls.video_ann_file = osp.join(cls.ann_file_prefix, 'video_test_list.txt')

    # pipeline configuration
    cls.action_pipeline = []
    cls.audio_feature_pipeline = [
        dict(type='LoadAudioFeature'),
        dict(type='SampleFrames', clip_len=32, frame_interval=2,
             num_clips=1),
        dict(type='AudioFeatureSelector')
    ]
    cls.audio_pipeline = [
        dict(type='AudioDecodeInit'),
        dict(type='SampleFrames', clip_len=32, frame_interval=2,
             num_clips=1),
        dict(type='AudioDecode')
    ]
    cls.frame_pipeline = [
        dict(type='SampleFrames', clip_len=32, frame_interval=2,
             num_clips=1),
        dict(type='RawFrameDecode', io_backend='disk')
    ]
    cls.proposal_pipeline = [
        dict(type='SampleProposalFrames', clip_len=1, body_segments=5,
             aug_segments=(2, 2), aug_ratio=0.5),
        dict(type='RawFrameDecode', io_backend='disk')
    ]
    cls.proposal_test_pipeline = [
        dict(type='SampleProposalFrames', clip_len=1, body_segments=5,
             aug_segments=(2, 2), aug_ratio=0.5, mode='test'),
        dict(type='RawFrameDecode', io_backend='disk')
    ]

    cls.proposal_train_cfg = ConfigDict(
        dict(
            ssn=dict(
                assigner=dict(
                    positive_iou_threshold=0.7,
                    background_iou_threshold=0.01,
                    incomplete_iou_threshold=0.5,
                    background_coverage_threshold=0.02,
                    incomplete_overlap_threshold=0.01),
                sampler=dict(
                    num_per_video=8,
                    positive_ratio=1,
                    background_ratio=1,
                    incomplete_ratio=6,
                    add_gt_as_proposals=True),
                loss_weight=dict(
                    comp_loss_weight=0.1, reg_loss_weight=0.1),
                debug=False)))
    cls.proposal_test_cfg = ConfigDict(
        dict(
            ssn=dict(
                sampler=dict(test_interval=6, batch_size=16),
                evaluater=dict(
                    top_k=2000, nms=0.2,
                    softmax_before_filter=True, cls_top_k=2))))
    cls.proposal_test_cfg_topall = ConfigDict(
        dict(
            ssn=dict(
                sampler=dict(test_interval=6, batch_size=16),
                evaluater=dict(
                    top_k=-1, nms=0.2,
                    softmax_before_filter=True, cls_top_k=2))))

    cls.rawvideo_pipeline = []
    cls.video_pipeline = [
        dict(type='OpenCVInit'),
        dict(type='SampleFrames', clip_len=32, frame_interval=2,
             num_clips=1),
        dict(type='OpenCVDecode')
    ]

    cls.hvu_categories = [
        'action', 'attribute', 'concept', 'event', 'object', 'scene'
    ]
    cls.hvu_category_nums = [739, 117, 291, 69, 1679, 248]
    cls.hvu_categories_for_eval = ['action', 'scene', 'object']
    cls.hvu_category_nums_for_eval = [3, 3, 3]

    cls.filename_tmpl = 'img_{:05d}.jpg'
def _init_model(num_stuff_classes):
    base_channels = 64
    num_things_classes = 80
    num_classes = num_things_classes + num_stuff_classes
    config = ConfigDict(
        dict(
            type='Mask2FormerHead',
            in_channels=[base_channels * 2**i for i in range(4)],
            feat_channels=base_channels,
            out_channels=base_channels,
            num_things_classes=num_things_classes,
            num_stuff_classes=num_stuff_classes,
            num_queries=100,
            num_transformer_feat_level=3,
            pixel_decoder=dict(
                type='MSDeformAttnPixelDecoder',
                num_outs=3,
                norm_cfg=dict(type='GN', num_groups=32),
                act_cfg=dict(type='ReLU'),
                encoder=dict(
                    type='DetrTransformerEncoder',
                    num_layers=6,
                    transformerlayers=dict(
                        type='BaseTransformerLayer',
                        attn_cfgs=dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=base_channels, num_heads=8,
                            num_levels=3, num_points=4, im2col_step=64,
                            dropout=0.0, batch_first=False,
                            norm_cfg=None, init_cfg=None),
                        ffn_cfgs=dict(
                            type='FFN',
                            embed_dims=base_channels,
                            feedforward_channels=base_channels * 4,
                            num_fcs=2, ffn_drop=0.0,
                            act_cfg=dict(type='ReLU', inplace=True)),
                        feedforward_channels=base_channels * 4,
                        ffn_dropout=0.0,
                        operation_order=('self_attn', 'norm', 'ffn',
                                         'norm')),
                    init_cfg=None),
                positional_encoding=dict(
                    type='SinePositionalEncoding',
                    num_feats=base_channels // 2,
                    normalize=True),
                init_cfg=None),
            enforce_decoder_input_project=False,
            positional_encoding=dict(
                type='SinePositionalEncoding',
                num_feats=base_channels // 2,
                normalize=True),
            transformer_decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=9,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(
                        type='MultiheadAttention',
                        embed_dims=base_channels, num_heads=8,
                        attn_drop=0.0, proj_drop=0.0,
                        dropout_layer=None, batch_first=False),
                    ffn_cfgs=dict(
                        embed_dims=base_channels,
                        feedforward_channels=base_channels * 8,
                        num_fcs=2,
                        act_cfg=dict(type='ReLU', inplace=True),
                        ffn_drop=0.0, dropout_layer=None,
                        add_identity=True),
                    # the following parameter was not used,
                    # just make current api happy
                    feedforward_channels=base_channels * 8,
                    operation_order=('cross_attn', 'norm', 'self_attn',
                                     'norm', 'ffn', 'norm')),
                init_cfg=None),
            loss_cls=dict(
                type='CrossEntropyLoss',
                use_sigmoid=False,
                loss_weight=2.0,
                reduction='mean',
                class_weight=[1.0] * num_classes + [0.1]),
            loss_mask=dict(
                type='CrossEntropyLoss',
                use_sigmoid=True,
                reduction='mean',
                loss_weight=5.0),
            loss_dice=dict(
                type='DiceLoss',
                use_sigmoid=True,
                activate=True,
                reduction='mean',
                naive_dice=True,
                eps=1.0,
                loss_weight=5.0),
            train_cfg=dict(
                num_points=256,
                oversample_ratio=3.0,
                importance_sample_ratio=0.75,
                assigner=dict(
                    type='MaskHungarianAssigner',
                    cls_cost=dict(type='ClassificationCost', weight=2.0),
                    mask_cost=dict(
                        type='CrossEntropyLossCost', weight=5.0,
                        use_sigmoid=True),
                    dice_cost=dict(
                        type='DiceCost', weight=5.0, pred_act=True,
                        eps=1.0)),
                sampler=dict(type='MaskPseudoSampler')),
            test_cfg=dict(
                panoptic_on=True,
                semantic_on=False,
                instance_on=True,
                max_dets_per_image=100,
                object_mask_thr=0.8,
                iou_thr=0.8)))

    self = Mask2FormerHead(**config)
    self.init_weights()
    return self
def _get_bboxes_single(self,
                       cls_scores,
                       bbox_preds,
                       mlvl_anchors,
                       mlvl_masks,
                       img_shape,
                       scale_factor,
                       cfg,
                       rescale=False):
    cfg = self.test_cfg if cfg is None else cfg
    cfg = copy.deepcopy(cfg)

    # deprecated arguments warning
    if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
        warnings.warn(
            'In rpn_proposal or test_cfg, '
            'nms_thr has been moved to a dict named nms as '
            'iou_threshold, max_num has been renamed as max_per_img; '
            'the original argument names and the old way to specify '
            'the iou_threshold of NMS will be deprecated.')
    if 'nms' not in cfg:
        cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
    if 'max_num' in cfg:
        if 'max_per_img' in cfg:
            assert cfg.max_num == cfg.max_per_img, f'You ' \
                f'set max_num and max_per_img at the same time, ' \
                f'but got {cfg.max_num} ' \
                f'and {cfg.max_per_img} respectively. ' \
                'Please delete max_num, which will be deprecated.'
        else:
            cfg.max_per_img = cfg.max_num
    if 'nms_thr' in cfg:
        assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \
            f'iou_threshold in nms and ' \
            f'nms_thr at the same time, but got ' \
            f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \
            f' respectively. Please delete nms_thr, ' \
            f'which will be deprecated.'

    assert cfg.nms.get('type', 'nms') == 'nms', \
        'GARPNHead only supports naive nms.'

    mlvl_proposals = []
    for idx in range(len(cls_scores)):
        rpn_cls_score = cls_scores[idx]
        rpn_bbox_pred = bbox_preds[idx]
        anchors = mlvl_anchors[idx]
        mask = mlvl_masks[idx]
        assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
        # if no location is kept, end.
        if mask.sum() == 0:
            continue
        rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
        if self.use_sigmoid_cls:
            rpn_cls_score = rpn_cls_score.reshape(-1)
            scores = rpn_cls_score.sigmoid()
        else:
            rpn_cls_score = rpn_cls_score.reshape(-1, 2)
            # remind that we set FG labels to [0, num_class-1]
            # since mmdet v2.0
            # BG cat_id: num_class
            scores = rpn_cls_score.softmax(dim=1)[:, :-1]
        # filter scores, bbox_pred w.r.t. mask.
        # anchors are filtered in get_anchors() beforehand.
        scores = scores[mask]
        rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1,
                                                               4)[mask, :]
        if scores.dim() == 0:
            rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0)
            anchors = anchors.unsqueeze(0)
            scores = scores.unsqueeze(0)
        # filter anchors, bbox_pred, scores w.r.t. scores
        if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
            _, topk_inds = scores.topk(cfg.nms_pre)
            rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
            anchors = anchors[topk_inds, :]
            scores = scores[topk_inds]
        # get proposals w.r.t. anchors and rpn_bbox_pred
        proposals = self.bbox_coder.decode(
            anchors, rpn_bbox_pred, max_shape=img_shape)
        # filter out too small bboxes
        if cfg.min_bbox_size >= 0:
            w = proposals[:, 2] - proposals[:, 0]
            h = proposals[:, 3] - proposals[:, 1]
            valid_inds = torch.nonzero(
                (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size),
                as_tuple=False).squeeze()
            proposals = proposals[valid_inds, :]
            scores = scores[valid_inds]
        # NMS within the current level
        proposals, _ = nms(proposals, scores, cfg.nms.iou_threshold)
        proposals = proposals[:cfg.nms_post, :]
        mlvl_proposals.append(proposals)
    proposals = torch.cat(mlvl_proposals, 0)

    if cfg.get('nms_across_levels', False):
        # NMS across multiple levels
        proposals, _ = nms(proposals[:, :4], proposals[:, -1],
                           cfg.nms.iou_threshold)
        proposals = proposals[:cfg.max_per_img, :]
    else:
        scores = proposals[:, 4]
        num = min(cfg.max_per_img, proposals.shape[0])
        _, topk_inds = scores.topk(num)
        proposals = proposals[topk_inds, :]
    return proposals
def __init__(self,
             num_frames,
             img_size,
             patch_size,
             pretrained=None,
             embed_dims=768,
             num_heads=12,
             num_transformer_layers=12,
             in_channels=3,
             dropout_ratio=0.,
             transformer_layers=None,
             attention_type='divided_space_time',
             norm_cfg=dict(type='LN', eps=1e-6),
             **kwargs):
    super().__init__(**kwargs)
    assert attention_type in self.supported_attention_types, (
        f'Unsupported Attention Type {attention_type}!')
    assert transformer_layers is None or isinstance(
        transformer_layers, (dict, list))

    self.num_frames = num_frames
    self.pretrained = pretrained
    self.embed_dims = embed_dims
    self.num_transformer_layers = num_transformer_layers
    self.attention_type = attention_type

    self.patch_embed = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_channels=in_channels,
        embed_dims=embed_dims)
    num_patches = self.patch_embed.num_patches

    self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims))
    self.pos_embed = nn.Parameter(
        torch.zeros(1, num_patches + 1, embed_dims))
    self.drop_after_pos = nn.Dropout(p=dropout_ratio)
    if self.attention_type != 'space_only':
        self.time_embed = nn.Parameter(
            torch.zeros(1, num_frames, embed_dims))
        self.drop_after_time = nn.Dropout(p=dropout_ratio)

    self.norm = build_norm_layer(norm_cfg, embed_dims)[1]

    if transformer_layers is None:
        # stochastic depth decay rule
        dpr = np.linspace(0, 0.1, num_transformer_layers)

        if self.attention_type == 'divided_space_time':
            _transformerlayers_cfg = [
                dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=[
                        dict(
                            type='DividedTemporalAttentionWithNorm',
                            embed_dims=embed_dims,
                            num_heads=num_heads,
                            num_frames=num_frames,
                            dropout_layer=dict(
                                type='DropPath', drop_prob=dpr[i]),
                            norm_cfg=dict(type='LN', eps=1e-6)),
                        dict(
                            type='DividedSpatialAttentionWithNorm',
                            embed_dims=embed_dims,
                            num_heads=num_heads,
                            num_frames=num_frames,
                            dropout_layer=dict(
                                type='DropPath', drop_prob=dpr[i]),
                            norm_cfg=dict(type='LN', eps=1e-6))
                    ],
                    ffn_cfgs=dict(
                        type='FFNWithNorm',
                        embed_dims=embed_dims,
                        feedforward_channels=embed_dims * 4,
                        num_fcs=2,
                        act_cfg=dict(type='GELU'),
                        dropout_layer=dict(
                            type='DropPath', drop_prob=dpr[i]),
                        norm_cfg=dict(type='LN', eps=1e-6)),
                    operation_order=('self_attn', 'self_attn', 'ffn'))
                for i in range(num_transformer_layers)
            ]
        else:
            # Space Only & Joint Space-Time
            _transformerlayers_cfg = [
                dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=embed_dims,
                            num_heads=num_heads,
                            batch_first=True,
                            dropout_layer=dict(
                                type='DropPath', drop_prob=dpr[i]))
                    ],
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=embed_dims,
                        feedforward_channels=embed_dims * 4,
                        num_fcs=2,
                        act_cfg=dict(type='GELU'),
                        dropout_layer=dict(
                            type='DropPath', drop_prob=dpr[i])),
                    operation_order=('norm', 'self_attn', 'norm', 'ffn'),
                    norm_cfg=dict(type='LN', eps=1e-6),
                    batch_first=True)
                for i in range(num_transformer_layers)
            ]

        transformer_layers = ConfigDict(
            dict(
                type='TransformerLayerSequence',
                transformerlayers=_transformerlayers_cfg,
                num_layers=num_transformer_layers))

    self.transformer_layers = build_transformer_layer_sequence(
        transformer_layers)
def _get_bboxes_single(self,
                       cls_scores,
                       bbox_preds,
                       mlvl_anchors,
                       img_shape,
                       scale_factor,
                       cfg,
                       rescale=False):
    """Transform outputs for a single batch item into bbox predictions.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level,
            each with shape (num_anchors * num_classes, H, W).
        bbox_preds (list[Tensor]): Box energies / deltas for each scale
            level with shape (num_anchors * 4, H, W).
        mlvl_anchors (list[Tensor]): Box reference for each scale level
            with shape (num_total_anchors, 4).
        img_shape (tuple[int]): Shape of the input image,
            (height, width, 3).
        scale_factor (ndarray): Scale factor of the image, arranged as
            (w_scale, h_scale, w_scale, h_scale).
        cfg (mmcv.Config): Test / postprocessing configuration. If None,
            test_cfg would be used.
        rescale (bool): If True, return boxes in original image space.

    Returns:
        Tensor: Labeled boxes with shape (n, 5), where the first 4
            columns are bounding box positions (tl_x, tl_y, br_x, br_y)
            and the 5-th column is a score between 0 and 1.
    """
    cfg = self.test_cfg if cfg is None else cfg
    cfg = copy.deepcopy(cfg)
    # bboxes from different levels should be independent during NMS;
    # level_ids are used as labels for batched NMS to separate them
    level_ids = []
    mlvl_scores = []
    mlvl_bbox_preds = []
    mlvl_valid_anchors = []
    for idx in range(len(cls_scores)):
        rpn_cls_score = cls_scores[idx]
        rpn_bbox_pred = bbox_preds[idx]
        assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
        rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
        if self.use_sigmoid_cls:
            rpn_cls_score = rpn_cls_score.reshape(-1)
            scores = rpn_cls_score.sigmoid()
        else:
            rpn_cls_score = rpn_cls_score.reshape(-1, 2)
            # We set FG labels to [0, num_class-1] and BG label to
            # num_class in RPN head since mmdet v2.5, which is unified to
            # be consistent with other heads since mmdet v2.0. In mmdet
            # v2.0 to v2.4 we keep BG label as 0 and FG label as 1 in
            # the rpn head.
            scores = rpn_cls_score.softmax(dim=1)[:, 0]
        rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
        anchors = mlvl_anchors[idx]
        if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
            # sort is faster than topk
            # _, topk_inds = scores.topk(cfg.nms_pre)
            if torch.onnx.is_in_onnx_export():
                # sort op will be converted to TopK in onnx
                # and k<=3480 in TensorRT
                _, topk_inds = scores.topk(cfg.nms_pre)
                scores = scores[topk_inds]
            else:
                ranked_scores, rank_inds = scores.sort(descending=True)
                topk_inds = rank_inds[:cfg.nms_pre]
                scores = ranked_scores[:cfg.nms_pre]
            rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
            anchors = anchors[topk_inds, :]
        mlvl_scores.append(scores)
        mlvl_bbox_preds.append(rpn_bbox_pred)
        mlvl_valid_anchors.append(anchors)
        level_ids.append(
            scores.new_full((scores.size(0), ), idx, dtype=torch.long))

    scores = torch.cat(mlvl_scores)
    anchors = torch.cat(mlvl_valid_anchors)
    rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
    proposals = self.bbox_coder.decode(
        anchors, rpn_bbox_pred, max_shape=img_shape)
    ids = torch.cat(level_ids)

    # Skip the nonzero op while exporting to ONNX
    if cfg.min_bbox_size > 0 and (not torch.onnx.is_in_onnx_export()):
        w = proposals[:, 2] - proposals[:, 0]
        h = proposals[:, 3] - proposals[:, 1]
        valid_inds = torch.nonzero(
            (w >= cfg.min_bbox_size) & (h >= cfg.min_bbox_size),
            as_tuple=False).squeeze()
        if valid_inds.sum().item() != len(proposals):
            proposals = proposals[valid_inds, :]
            scores = scores[valid_inds]
            ids = ids[valid_inds]

    # deprecated arguments warning
    if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
        warnings.warn(
            'In rpn_proposal or test_cfg, '
            'nms_thr has been moved to a dict named nms as '
            'iou_threshold, max_num has been renamed as max_per_img; '
            'the original argument names and the old way to specify '
            'the iou_threshold of NMS will be deprecated.')
    if 'nms' not in cfg:
        cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
    if 'max_num' in cfg:
        if 'max_per_img' in cfg:
            assert cfg.max_num == cfg.max_per_img, f'You ' \
                f'set max_num and ' \
                f'max_per_img at the same time, but got {cfg.max_num} ' \
                f'and {cfg.max_per_img} respectively. ' \
                'Please delete max_num, which will be deprecated.'
        else:
            cfg.max_per_img = cfg.max_num
    if 'nms_thr' in cfg:
        assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set' \
            f' iou_threshold in nms and ' \
            f'nms_thr at the same time, but got' \
            f' {cfg.nms.iou_threshold} and {cfg.nms_thr}' \
            f' respectively. Please delete nms_thr, ' \
            f'which will be deprecated.'

    dets, keep = batched_nms(proposals, scores, ids, cfg.nms)
    return dets[:cfg.max_per_img]
def _get_bboxes(self,
                cls_scores,
                bbox_preds,
                mlvl_anchors,
                img_shapes,
                scale_factors,
                cfg,
                rescale=False):
    """Transform outputs for a single batch item into bbox predictions.

    Args:
        cls_scores (list[Tensor]): Box scores for each scale level,
            each with shape (N, num_anchors * num_classes, H, W).
        bbox_preds (list[Tensor]): Box energies / deltas for each scale
            level with shape (N, num_anchors * 4, H, W).
        mlvl_anchors (list[Tensor]): Box reference for each scale level
            with shape (num_total_anchors, 4).
        img_shapes (list[tuple[int]]): Shape of the input image,
            (height, width, 3).
        scale_factors (list[ndarray]): Scale factor of the image,
            arranged as (w_scale, h_scale, w_scale, h_scale).
        cfg (mmcv.Config): Test / postprocessing configuration. If None,
            test_cfg would be used.
        rescale (bool): If True, return boxes in original image space.

    Returns:
        list[tuple[Tensor, Tensor]]: Each item in result_list is a
            2-tuple. The first item is an (n, 5) tensor, where the first
            4 columns are bounding box positions (tl_x, tl_y, br_x, br_y)
            and the 5-th column is a score between 0 and 1. The second
            item is an (n,) tensor where each item is the predicted class
            label of the corresponding box.
    """
    cfg = self.test_cfg if cfg is None else cfg
    cfg = copy.deepcopy(cfg)
    # bboxes from different levels should be independent during NMS;
    # level_ids are used as labels for batched NMS to separate them
    level_ids = []
    mlvl_scores = []
    mlvl_bbox_preds = []
    mlvl_valid_anchors = []
    batch_size = cls_scores[0].shape[0]
    nms_pre_tensor = torch.tensor(
        cfg.nms_pre, device=cls_scores[0].device, dtype=torch.long)
    for idx in range(len(cls_scores)):
        rpn_cls_score = cls_scores[idx]
        rpn_bbox_pred = bbox_preds[idx]
        assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
        rpn_cls_score = rpn_cls_score.permute(0, 2, 3, 1)
        if self.use_sigmoid_cls:
            rpn_cls_score = rpn_cls_score.reshape(batch_size, -1)
            scores = rpn_cls_score.sigmoid()
        else:
            rpn_cls_score = rpn_cls_score.reshape(batch_size, -1, 2)
            # We set FG labels to [0, num_class-1] and BG label to
            # num_class in RPN head since mmdet v2.5, which is unified to
            # be consistent with other heads since mmdet v2.0. In mmdet
            # v2.0 to v2.4 we keep BG label as 0 and FG label as 1 in
            # the rpn head.
            scores = rpn_cls_score.softmax(-1)[..., 0]
        rpn_bbox_pred = rpn_bbox_pred.permute(0, 2, 3, 1).reshape(
            batch_size, -1, 4)
        anchors = mlvl_anchors[idx]
        anchors = anchors.expand_as(rpn_bbox_pred)
        # Get top-k predictions
        from mmdet.core.export import get_k_for_topk
        nms_pre = get_k_for_topk(nms_pre_tensor, rpn_bbox_pred.shape[1])
        if nms_pre > 0:
            _, topk_inds = scores.topk(nms_pre)
            batch_inds = torch.arange(batch_size).view(
                -1, 1).expand_as(topk_inds)
            # Avoid the onnx2tensorrt issue in
            # https://github.com/NVIDIA/TensorRT/issues/1134  # noqa: E501
            if torch.onnx.is_in_onnx_export():
                # Mind k<=3480 in TensorRT for TopK
                transformed_inds = scores.shape[1] * batch_inds + topk_inds
                scores = scores.reshape(-1, 1)[transformed_inds].reshape(
                    batch_size, -1)
                rpn_bbox_pred = rpn_bbox_pred.reshape(
                    -1, 4)[transformed_inds, :].reshape(batch_size, -1, 4)
                anchors = anchors.reshape(
                    -1, 4)[transformed_inds, :].reshape(batch_size, -1, 4)
            else:
                # sort is faster than topk
                ranked_scores, rank_inds = scores.sort(descending=True)
                topk_inds = rank_inds[:, :cfg.nms_pre]
                scores = ranked_scores[:, :cfg.nms_pre]
                batch_inds = torch.arange(batch_size).view(
                    -1, 1).expand_as(topk_inds)
                rpn_bbox_pred = rpn_bbox_pred[batch_inds, topk_inds, :]
                anchors = anchors[batch_inds, topk_inds, :]
        mlvl_scores.append(scores)
        mlvl_bbox_preds.append(rpn_bbox_pred)
        mlvl_valid_anchors.append(anchors)
        level_ids.append(
            scores.new_full((
                batch_size,
                scores.size(1),
            ), idx, dtype=torch.long))

    batch_mlvl_scores = torch.cat(mlvl_scores, dim=1)
    batch_mlvl_anchors = torch.cat(mlvl_valid_anchors, dim=1)
    batch_mlvl_rpn_bbox_pred = torch.cat(mlvl_bbox_preds, dim=1)
    batch_mlvl_proposals = self.bbox_coder.decode(
        batch_mlvl_anchors, batch_mlvl_rpn_bbox_pred, max_shape=img_shapes)
    batch_mlvl_ids = torch.cat(level_ids, dim=1)

    # deprecated arguments warning
    if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
        warnings.warn(
            'In rpn_proposal or test_cfg, '
            'nms_thr has been moved to a dict named nms as '
            'iou_threshold, max_num has been renamed as max_per_img; '
            'the original argument names and the old way to specify '
            'the iou_threshold of NMS will be deprecated.')
    if 'nms' not in cfg:
        cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
    if 'max_num' in cfg:
        if 'max_per_img' in cfg:
            assert cfg.max_num == cfg.max_per_img, f'You ' \
                f'set max_num and ' \
                f'max_per_img at the same time, but got {cfg.max_num} ' \
                f'and {cfg.max_per_img} respectively. ' \
                'Please delete max_num, which will be deprecated.'
        else:
            cfg.max_per_img = cfg.max_num
    if 'nms_thr' in cfg:
        assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set' \
            f' iou_threshold in nms and ' \
            f'nms_thr at the same time, but got' \
            f' {cfg.nms.iou_threshold} and {cfg.nms_thr}' \
            f' respectively. Please delete nms_thr, ' \
            f'which will be deprecated.'
    # Replace batched_nms with ONNX::NonMaxSuppression in deployment
    if torch.onnx.is_in_onnx_export():
        from mmdet.core.export import add_dummy_nms_for_onnx
        batch_mlvl_scores = batch_mlvl_scores.unsqueeze(2)
        score_threshold = cfg.nms.get('score_thr', 0.0)
        nms_pre = cfg.get('deploy_nms_pre', cfg.max_per_img)
        dets, _ = add_dummy_nms_for_onnx(batch_mlvl_proposals,
                                         batch_mlvl_scores, cfg.max_per_img,
                                         cfg.nms.iou_threshold,
                                         score_threshold, nms_pre,
                                         cfg.max_per_img)
        return dets

    result_list = []
    for (mlvl_proposals, mlvl_scores,
         mlvl_ids) in zip(batch_mlvl_proposals, batch_mlvl_scores,
                          batch_mlvl_ids):
        # Skip the nonzero op while exporting to ONNX
        if cfg.min_bbox_size > 0 and (not torch.onnx.is_in_onnx_export()):
            w = mlvl_proposals[:, 2] - mlvl_proposals[:, 0]
            h = mlvl_proposals[:, 3] - mlvl_proposals[:, 1]
            valid_ind = torch.nonzero(
                (w >= cfg.min_bbox_size) & (h >= cfg.min_bbox_size),
                as_tuple=False).squeeze()
            if valid_ind.sum().item() != len(mlvl_proposals):
                mlvl_proposals = mlvl_proposals[valid_ind, :]
                mlvl_scores = mlvl_scores[valid_ind]
                mlvl_ids = mlvl_ids[valid_ind]

        dets, keep = batched_nms(mlvl_proposals, mlvl_scores, mlvl_ids,
                                 cfg.nms)
        result_list.append(dets[:cfg.max_per_img])
    return result_list
def compat_loader_args(cfg):
    """Handle the deprecated top-level loader args in cfg.data."""
    cfg = copy.deepcopy(cfg)
    if 'train_dataloader' not in cfg.data:
        cfg.data['train_dataloader'] = ConfigDict()
    if 'val_dataloader' not in cfg.data:
        cfg.data['val_dataloader'] = ConfigDict()
    if 'test_dataloader' not in cfg.data:
        cfg.data['test_dataloader'] = ConfigDict()

    # special process for train_dataloader
    if 'samples_per_gpu' in cfg.data:
        samples_per_gpu = cfg.data.pop('samples_per_gpu')
        assert 'samples_per_gpu' not in \
            cfg.data.train_dataloader, ('`samples_per_gpu` is set in the '
                                        '`data` field and in '
                                        '`data.train_dataloader` at the '
                                        'same time. Please only set it in '
                                        '`data.train_dataloader`.')
        cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu

    if 'persistent_workers' in cfg.data:
        persistent_workers = cfg.data.pop('persistent_workers')
        assert 'persistent_workers' not in \
            cfg.data.train_dataloader, ('`persistent_workers` is set in '
                                        'the `data` field and in '
                                        '`data.train_dataloader` at the '
                                        'same time. Please only set it in '
                                        '`data.train_dataloader`.')
        cfg.data.train_dataloader['persistent_workers'] = persistent_workers

    if 'workers_per_gpu' in cfg.data:
        workers_per_gpu = cfg.data.pop('workers_per_gpu')
        cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu
        cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu
        cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu

    # special process for val_dataloader
    if 'samples_per_gpu' in cfg.data.val:
        # keep the default value of `samples_per_gpu` as 1
        assert 'samples_per_gpu' not in \
            cfg.data.val_dataloader, ('`samples_per_gpu` is set in the '
                                      '`data.val` field and in '
                                      '`data.val_dataloader` at the same '
                                      'time. Please only set it in '
                                      '`data.val_dataloader`.')
        cfg.data.val_dataloader['samples_per_gpu'] = \
            cfg.data.val.pop('samples_per_gpu')

    # special process for test_dataloader,
    # in case the test dataset is concatenated
    if isinstance(cfg.data.test, dict):
        if 'samples_per_gpu' in cfg.data.test:
            assert 'samples_per_gpu' not in \
                cfg.data.test_dataloader, ('`samples_per_gpu` is set in '
                                           'the `data.test` field and in '
                                           '`data.test_dataloader` at the '
                                           'same time. Please only set it '
                                           'in `data.test_dataloader`.')
            cfg.data.test_dataloader['samples_per_gpu'] = \
                cfg.data.test.pop('samples_per_gpu')
    elif isinstance(cfg.data.test, list):
        for ds_cfg in cfg.data.test:
            if 'samples_per_gpu' in ds_cfg:
                assert 'samples_per_gpu' not in \
                    cfg.data.test_dataloader, (
                        '`samples_per_gpu` is set in the `data.test` '
                        'field and in `data.test_dataloader` at the same '
                        'time. Please only set it in '
                        '`data.test_dataloader`.')
        samples_per_gpu = max(
            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
        cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu

    return cfg
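# A before/after sketch of the migration (hedged; assumes mmcv's ConfigDict
# and the function above): top-level loader args move into the per-split
# `*_dataloader` dicts, and a per-dataset `samples_per_gpu` wins for its
# own split.
_cfg = ConfigDict(
    dict(
        data=dict(
            samples_per_gpu=2, workers_per_gpu=2,
            train=dict(), val=dict(samples_per_gpu=1), test=dict())))
_cfg = compat_loader_args(_cfg)
assert _cfg.data.train_dataloader.samples_per_gpu == 2
assert _cfg.data.val_dataloader.samples_per_gpu == 1
assert _cfg.data.test_dataloader.workers_per_gpu == 2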
        return losses
    else:
        return ds
    return ds


if __name__ == "__main__":
    import time
    from mmcv import ConfigDict

    cfg = dict(
        pretrained_model='/home/caojia/densenet161.pth',
        model=dict(
            depth_num_layers=161,
            input_shape=[608, 960],
            max_depth=80,
            fxy=[631.0]),
        data=dict(imgs_per_gpu=2))
    cfg = ConfigDict(cfg)

    net = LPGNet(cfg).cuda().eval()
    x = torch.randn((2, 3, 608, 960)).cuda()
    # camera focal fxy; its length should equal the input batch size
    focal = [712.] * x.size()[0]
    inputs = dict(leftImage=x, left_gt=x[:, 0, :, :])

    torch.cuda.synchronize()
    s_t = time.time()
    y = net(inputs)
    torch.cuda.synchronize()
    print('inference time is ', time.time() - s_t)
def test_ssn_loss():
    ssn_loss = SSNLoss()

    # test activity_loss
    activity_score = torch.rand((8, 21))
    labels = torch.LongTensor([8] * 8).squeeze()
    activity_indexer = torch.tensor([0, 7])
    output_activity_loss = ssn_loss.activity_loss(activity_score, labels,
                                                  activity_indexer)
    assert torch.equal(
        output_activity_loss,
        F.cross_entropy(activity_score[activity_indexer, :],
                        labels[activity_indexer]))

    # test completeness_loss
    completeness_score = torch.rand((8, 20), requires_grad=True)
    labels = torch.LongTensor([8] * 8).squeeze()
    completeness_indexer = torch.tensor([0, 1, 2, 3, 4, 5, 6])
    positive_per_video = 1
    incomplete_per_video = 6
    output_completeness_loss = ssn_loss.completeness_loss(
        completeness_score, labels, completeness_indexer,
        positive_per_video, incomplete_per_video)

    pred = completeness_score[completeness_indexer, :]
    gt = labels[completeness_indexer]
    pred_dim = pred.size(1)
    pred = pred.view(-1, positive_per_video + incomplete_per_video, pred_dim)
    gt = gt.view(-1, positive_per_video + incomplete_per_video)
    # yapf:disable
    positive_pred = pred[:, :positive_per_video, :].contiguous().view(-1, pred_dim)  # noqa:E501
    incomplete_pred = pred[:, positive_per_video:, :].contiguous().view(-1, pred_dim)  # noqa:E501
    # yapf:enable
    ohem_ratio = 0.17
    positive_loss = OHEMHingeLoss.apply(
        positive_pred, gt[:, :positive_per_video].contiguous().view(-1), 1,
        1.0, positive_per_video)
    incomplete_loss = OHEMHingeLoss.apply(
        incomplete_pred, gt[:, positive_per_video:].contiguous().view(-1),
        -1, ohem_ratio, incomplete_per_video)
    num_positives = positive_pred.size(0)
    num_incompletes = int(incomplete_pred.size(0) * ohem_ratio)
    assert_loss = ((positive_loss + incomplete_loss) /
                   float(num_positives + num_incompletes))
    assert torch.equal(output_completeness_loss, assert_loss)

    # test reg_loss
    bbox_pred = torch.rand((8, 20, 2))
    labels = torch.LongTensor([8] * 8).squeeze()
    bbox_targets = torch.rand((8, 2))
    regression_indexer = torch.tensor([0])
    output_reg_loss = ssn_loss.classwise_regression_loss(
        bbox_pred, labels, bbox_targets, regression_indexer)

    pred = bbox_pred[regression_indexer, :, :]
    gt = labels[regression_indexer]
    reg_target = bbox_targets[regression_indexer, :]
    class_idx = gt.data - 1
    classwise_pred = pred[:, class_idx, :]
    classwise_reg_pred = torch.cat(
        (torch.diag(classwise_pred[:, :, 0]).view(-1, 1),
         torch.diag(classwise_pred[:, :, 1]).view(-1, 1)),
        dim=1)
    assert torch.equal(
        output_reg_loss,
        F.smooth_l1_loss(classwise_reg_pred.view(-1), reg_target.view(-1)) *
        2)

    # test ssn_loss
    proposal_type = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 2]])
    train_cfg = ConfigDict(
        dict(
            ssn=dict(
                sampler=dict(
                    num_per_video=8,
                    positive_ratio=1,
                    background_ratio=1,
                    incomplete_ratio=6,
                    add_gt_as_proposals=True),
                loss_weight=dict(
                    comp_loss_weight=0.1, reg_loss_weight=0.1))))
    output_loss = ssn_loss(activity_score, completeness_score, bbox_pred,
                           proposal_type, labels, bbox_targets, train_cfg)
    assert torch.equal(output_loss['loss_activity'], output_activity_loss)
    assert torch.equal(output_loss['loss_completeness'],
                       output_completeness_loss * 0.1)
    assert torch.equal(output_loss['loss_reg'], output_reg_loss * 0.1)
"abs_rel": abs_rel / len(img_files), "sq_rel": sq_rel / len(img_files), "rmse": rmse / len(img_files), "rmse_log": rmse_log / len(img_files), "delta1": delta1 / len(img_files), "delta2": delta2 / len(img_files), "delta3": delta3 / len(img_files) } if __name__ == "__main__": from mmcv import ConfigDict import matplotlib.pyplot as plt infer_cfg = dict(model_path='./tmp/epoch_16.pth', pretrained_model='/home/caojia/densenet161.pth', data=dict(output_size=(352, 1216), imgs_per_gpu=1), model=dict(type='LPGNet', depth_num_layers=161, input_shape=[352, 1216], max_depth=80, fxy=[721.0])) infer_cfg = ConfigDict(infer_cfg) evaluator = Evaluator(infer_cfg) img_folder = '/home/caojia/kitti_eigen_test/image_02/' gt_folder = '/home/caojia/kitti_eigen_test/groundtruth/' print(evaluator.eval(img_folder, gt_folder, False))
def test_compat_loader_args():
    cfg = ConfigDict(dict(data=dict(val=dict(), test=dict(), train=dict())))
    cfg = compat_loader_args(cfg)
    # auto fill loader args
    assert 'val_dataloader' in cfg.data
    assert 'train_dataloader' in cfg.data
    assert 'test_dataloader' in cfg.data

    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=dict(samples_per_gpu=2),
                train=dict())))
    cfg = compat_loader_args(cfg)

    assert cfg.data.train_dataloader.workers_per_gpu == 1
    assert cfg.data.train_dataloader.samples_per_gpu == 1
    assert cfg.data.train_dataloader.persistent_workers
    assert cfg.data.val_dataloader.workers_per_gpu == 1
    assert cfg.data.val_dataloader.samples_per_gpu == 3
    assert cfg.data.test_dataloader.workers_per_gpu == 1
    assert cfg.data.test_dataloader.samples_per_gpu == 2

    # test when `test` is a list
    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=[dict(samples_per_gpu=2),
                      dict(samples_per_gpu=3)],
                train=dict())))
    cfg = compat_loader_args(cfg)
    assert cfg.data.test_dataloader.samples_per_gpu == 3

    # assert that the args can not be set at the same time
    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=dict(samples_per_gpu=2),
                train=dict(),
                train_dataloader=dict(samples_per_gpu=2))))
    # samples_per_gpu can not be set in `train_dataloader`
    # and the data field at the same time
    with pytest.raises(AssertionError):
        compat_loader_args(cfg)

    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=dict(samples_per_gpu=2),
                train=dict(),
                val_dataloader=dict(samples_per_gpu=2))))
    # samples_per_gpu can not be set in `val_dataloader`
    # and the data field at the same time
    with pytest.raises(AssertionError):
        compat_loader_args(cfg)

    cfg = ConfigDict(
        dict(
            data=dict(
                samples_per_gpu=1,
                persistent_workers=True,
                workers_per_gpu=1,
                val=dict(samples_per_gpu=3),
                test=dict(samples_per_gpu=2),
                test_dataloader=dict(samples_per_gpu=2))))
    # samples_per_gpu can not be set in `test_dataloader`
    # and the data field at the same time
    with pytest.raises(AssertionError):
        compat_loader_args(cfg)
# ========================================= #

# ============== BUILD MODEL ================ #
class_map = icedata.coco.class_map()
model_name = "mobilenetv3_large_100_aa"

base_config_path = mmdet_configs_path / "retinanet"
config_path = base_config_path / "retinanet_r50_fpn_1x_coco.py"
cfg = Config.fromfile(config_path)

## mmdet >= 2.12 requires `ConfigDict`, not just `dict`
cfg.model.backbone = ConfigDict(
    dict(
        type=f"TIMM_{model_name}",
        pretrained=True,
        out_indices=(1, 2, 3, 4),
    ))
cfg.model.neck.in_channels = [24, 40, 112, 960]
cfg.model.bbox_head.num_classes = len(class_map) - 1

model = build_detector(cfg.model)
# print(model)
# ============================================ #


# ============== PL LIGHTNING ADAPTER ================ #
class MobileNetV3Adapter(models.mmdet.retinanet.lightning.ModelAdapter):

    def __init__(
def test_detr_head_loss():
    """Tests transformer head loss when truth is empty and non-empty."""
    s = 256
    img_metas = [{
        'img_shape': (s, s, 3),
        'scale_factor': 1,
        'pad_shape': (s, s, 3),
        'batch_input_shape': (s, s)
    }]
    config = ConfigDict(
        dict(
            type='DETRHead',
            num_classes=80,
            in_channels=200,
            transformer=dict(
                type='Transformer',
                encoder=dict(
                    type='DetrTransformerEncoder',
                    num_layers=6,
                    transformerlayers=dict(
                        type='BaseTransformerLayer',
                        attn_cfgs=[
                            dict(
                                type='MultiheadAttention',
                                embed_dims=256,
                                num_heads=8,
                                dropout=0.1)
                        ],
                        feedforward_channels=2048,
                        ffn_dropout=0.1,
                        operation_order=('self_attn', 'norm', 'ffn',
                                         'norm'))),
                decoder=dict(
                    type='DetrTransformerDecoder',
                    return_intermediate=True,
                    num_layers=6,
                    transformerlayers=dict(
                        type='DetrTransformerDecoderLayer',
                        attn_cfgs=dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1),
                        feedforward_channels=2048,
                        ffn_dropout=0.1,
                        operation_order=('self_attn', 'norm', 'cross_attn',
                                         'norm', 'ffn', 'norm')),
                )),
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=128,
                normalize=True),
            loss_cls=dict(
                type='CrossEntropyLoss',
                bg_cls_weight=0.1,
                use_sigmoid=False,
                loss_weight=1.0,
                class_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=5.0),
            loss_iou=dict(type='GIoULoss', loss_weight=2.0)))
    self = DETRHead(**config)
    self.init_weights()
    feat = [torch.rand(1, 200, 10, 10)]
    cls_scores, bbox_preds = self.forward(feat, img_metas)

    # Test that empty ground truth encourages the network to predict
    # background.
    gt_bboxes = [torch.empty((0, 4))]
    gt_labels = [torch.LongTensor([])]
    gt_bboxes_ignore = None
    empty_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
                                img_metas, gt_bboxes_ignore)
    # When there is no truth, the cls loss should be nonzero but there
    # should be no box loss.
    for key, loss in empty_gt_losses.items():
        if 'cls' in key:
            assert loss.item() > 0, 'cls loss should be non-zero'
        elif 'bbox' in key:
            assert loss.item() == 0, (
                'there should be no box loss when there are no true boxes')
        elif 'iou' in key:
            assert loss.item() == 0, (
                'there should be no iou loss when there are no true boxes')

    # When truth is non-empty, both cls and box loss should be nonzero for
    # random inputs.
    gt_bboxes = [
        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]),
    ]
    gt_labels = [torch.LongTensor([2])]
    one_gt_losses = self.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
                              img_metas, gt_bboxes_ignore)
    for loss in one_gt_losses.values():
        assert loss.item() > 0, (
            'cls loss, or box loss, or iou loss should be non-zero')

    # test forward_train
    self.forward_train(feat, img_metas, gt_bboxes, gt_labels)

    # test inference mode
    self.get_bboxes(cls_scores, bbox_preds, img_metas, rescale=True)