Example #1
    def __init__(
        self,
        box_roi_pool,
        box_head,
        box_predictor,
        # Faster R-CNN training
        fg_iou_thresh,
        bg_iou_thresh,
        batch_size_per_image,
        positive_fraction,
        bbox_reg_weights,
        # Faster R-CNN inference
        score_thresh,
        nms_thresh,
        detections_per_img,
        out_channels,
        # Mask
        mask_roi_pool=None,
        mask_head=None,
        mask_predictor=None,
        keypoint_roi_pool=None,
        keypoint_head=None,
        keypoint_predictor=None,
        pose_mean=None,
        pose_stddev=None,
        threed_68_points=None,
        threed_5_points=None,
        bbox_x_factor=1.1,
        bbox_y_factor=1.1,
        expand_forehead=0.3,
    ):
        super(RoIHeads, self).__init__()

        self.box_similarity = box_ops.box_iou
        # assign ground-truth boxes for each proposal
        self.proposal_matcher = det_utils.Matcher(
            fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
            batch_size_per_image, positive_fraction)

        if bbox_reg_weights is None:
            bbox_reg_weights = (10.0, 10.0, 5.0, 5.0)
        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

        self.box_roi_pool = box_roi_pool
        self.box_head = box_head
        self.box_predictor = box_predictor

        num_classes = 2
        self.class_roi_pool = MultiScaleRoIAlign(
            featmap_names=["0", "1", "2", "3"],
            output_size=7,
            sampling_ratio=2)
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        self.class_head = TwoMLPHead(out_channels * resolution**2,
                                     representation_size)
        self.class_predictor = FastRCNNClassPredictor(representation_size,
                                                      num_classes)
        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img
        self.mask_roi_pool = mask_roi_pool
        self.mask_head = mask_head
        self.mask_predictor = mask_predictor

        self.keypoint_roi_pool = keypoint_roi_pool
        self.keypoint_head = keypoint_head
        self.keypoint_predictor = keypoint_predictor

        self.pose_mean = pose_mean
        self.pose_stddev = pose_stddev
        self.threed_68_points = threed_68_points
        self.threed_5_points = threed_5_points

        self.bbox_x_factor = bbox_x_factor
        self.bbox_y_factor = bbox_y_factor
        self.expand_forehead = expand_forehead
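For reference, a minimal sketch of how a MultiScaleRoIAlign pooler like the ones above is invoked at runtime; the feature-map keys, tensor shapes, boxes and image size below are illustrative values, not taken from the example.

from collections import OrderedDict

import torch
from torchvision.ops import MultiScaleRoIAlign

pooler = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"],
                            output_size=7,
                            sampling_ratio=2)

# four FPN levels with halving spatial resolution, 256 channels each
features = OrderedDict([
    ("0", torch.rand(1, 256, 200, 200)),
    ("1", torch.rand(1, 256, 100, 100)),
    ("2", torch.rand(1, 256, 50, 50)),
    ("3", torch.rand(1, 256, 25, 25)),
])
# one image, two proposal boxes in (x1, y1, x2, y2) format
boxes = [torch.tensor([[10.0, 10.0, 120.0, 160.0],
                       [40.0, 60.0, 300.0, 400.0]])]
image_sizes = [(800, 800)]  # original (height, width) of each image

pooled = pooler(features, boxes, image_sizes)
print(pooled.shape)  # torch.Size([2, 256, 7, 7])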
Example #2
    def __init__(
        self,
        correlation_args,
        batch_norm,
        conv_channels,
        n_box_channels,
        roi_output_size,
        avg_box_features,
        hidden_size,
        input_length,
        n_layers,
        dropout,
        correlation_only,
        use_env_features,
        fixed_env,
        correlation_last_only,
        sum_lstm_layers,
        max_box_features=False,
        use_pre_conv=False,
    ):
        super().__init__()

        self.correlation_args = correlation_args
        self.batch_norm = batch_norm
        self.conv_channels = conv_channels
        self.n_box_channels = n_box_channels
        self.roi_output_size = roi_output_size
        self.avg_box_features = avg_box_features
        self.hidden_size = hidden_size
        self.input_length = input_length
        self.n_layers = n_layers
        self.output_size = 6
        self.dropout = dropout
        self.correlation_only = correlation_only
        self.use_env_features = use_env_features
        self.fixed_env = fixed_env
        self.correlation_last_only = correlation_last_only
        self.sum_lstm_layers = sum_lstm_layers
        self.max_box_features = max_box_features

        locations_per_box = (1 if (self.avg_box_features or self.max_box_features)
                             else roi_output_size ** 2)
        multiplier = 2 if self.use_env_features else 1
        self.input_size = 6 + (n_box_channels * locations_per_box * multiplier)

        self.roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                           output_size=roi_output_size,
                                           sampling_ratio=2)

        # layers inspired by https://github.com/ClementPinard/FlowNetPytorch/blob/master/models/FlowNetC.py
        self.conv_redir = conv(self.batch_norm,
                               256,
                               32,
                               kernel_size=1,
                               stride=1)
        in_planes = self.correlation_args['patch_size'] ** 2 + (
            0 if self.correlation_only else 32)
        self.conv3_1 = conv(self.batch_norm, in_planes, self.conv_channels)
        self.conv4 = conv(self.batch_norm, self.conv_channels,
                          self.conv_channels)
        self.conv4_1 = conv(self.batch_norm, self.conv_channels,
                            self.n_box_channels)

        # recurrent layers
        self.encoder = nn.LSTM(self.input_size,
                               self.hidden_size,
                               batch_first=True,
                               num_layers=n_layers,
                               dropout=dropout)
        self.attn = nn.Linear(self.hidden_size + self.input_size,
                              self.input_length)
        self.attn_combine = nn.Linear(self.hidden_size + self.output_size,
                                      self.hidden_size)
        self.decoder = nn.LSTM(self.input_size,
                               self.hidden_size,
                               batch_first=True,
                               num_layers=n_layers,
                               dropout=dropout)
        self.linear = nn.Linear(
            self.hidden_size if self.sum_lstm_layers
            else self.hidden_size * self.n_layers,
            self.output_size)
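A quick worked check of the input-size arithmetic above, using assumed values for the constructor arguments (n_box_channels=64, roi_output_size=7, no pooled or environment features).

n_box_channels = 64
roi_output_size = 7
avg_box_features = max_box_features = False
use_env_features = False

locations_per_box = 1 if (avg_box_features or max_box_features) else roi_output_size ** 2
multiplier = 2 if use_env_features else 1
input_size = 6 + (n_box_channels * locations_per_box * multiplier)
print(input_size)  # 6 + 64 * 49 * 1 = 3142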
Example #3
    def __init__(self, num_classes,
                 # re-ID
                 num_train_pids, cls_type="", cat_c4=False,
                 # Transform parameters
                 min_size=800, max_size=1333,
                 image_mean=None, image_std=None,
                 # RPN parameters
                 rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
                 rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
                 rpn_nms_thresh=0.7,
                 rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
                 rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
                 # Box parameters
                 box_score_thresh=0.05,
                 box_nms_thresh=0.5, box_detections_per_img=100,
                 # box training
                 box_fg_iou_thresh=0.5,
                 box_bg_iou_thresh=0.5,
                 box_batch_size_per_image=512, box_positive_fraction=0.25,
                 bbox_reg_weights=None,
                 # Misc
                 eval_gt=False, display=False, cws=False,
                 ):

        super(OIM, self).__init__()

        # ------- Backbone -------
        stem, top = _split_backbone('resnet50', load_bgr=True)
        top.representation_size = 2048
        self.backbone = stem

        # ------- RPN -------
        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)
        rpn_kwargs = [
            rpn_fg_iou_thresh, rpn_bg_iou_thresh,
            rpn_batch_size_per_image, rpn_positive_fraction,
            rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh
        ]

        rpn_anchor_generator = AnchorGenerator(
            sizes=((8, 16, 32),),
            aspect_ratios=((1, 2),))

        self.rpn = RegionProposalNetwork(
            rpn_anchor_generator,
            RPNHead(in_channels=1024,
                    num_anchors=rpn_anchor_generator.num_anchors_per_location()[0]),
            *rpn_kwargs
        )

        # ------- Box -------
        self.roi_align = MultiScaleRoIAlign(
            featmap_names=["C4"],
            output_size=(14, 7),
            sampling_ratio=0
        )
        representation_size = top.representation_size
        box_predictor = FastRCNNPredictor(representation_size, num_classes)

        box_kwargs = [
            # Faster R-CNN training
            box_fg_iou_thresh, box_bg_iou_thresh,
            box_batch_size_per_image, box_positive_fraction,
            bbox_reg_weights,
            # Faster R-CNN inference
            box_score_thresh, box_nms_thresh, box_detections_per_img
        ]
        embedding_head = ExtractReIDFeat(
            featmap_names=['C4', 'C5'] if cat_c4 else ['C5'],
            in_channels=[1024, 2048] if cat_c4 else [2048],
            dim=256
        )
        reid_loss = CriterionReID(
            cls_type,
            256,
            num_train_pids
        )
        feat_head = RCNNConvHead(top)

        self.roi_heads = RoIHeads(
            embedding_head, reid_loss,
            self.roi_align, feat_head, box_predictor,
            *box_kwargs
        )
        self.roi_heads.cws = cws

        self.req_pid = -1 if cls_type == "oim" else 0
        self.reid_time = 0

        # ------- Misc -------
        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]  # NOTE: RGB order is given here
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]

        self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
        self.eval_gt = eval_gt
        self.display = display
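As a sanity check on the RPNHead wiring above: num_anchors_per_location() is simply the per-level product of anchor sizes and aspect ratios. A small sketch (note the AnchorGenerator import path varies between torchvision releases):

from torchvision.models.detection.anchor_utils import AnchorGenerator

gen = AnchorGenerator(sizes=((8, 16, 32),), aspect_ratios=((1.0, 2.0),))
print(gen.num_anchors_per_location())  # [6] -- 3 sizes x 2 ratios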
Example #4
    def __init__(
            self,
            backbone,
            num_classes=None,
            num_pids=5532,
            num_cq_size=5000,
            # transform parameters
            min_size=900,
            max_size=1500,
            image_mean=None,
            image_std=None,
            # Anchor settings:
            anchor_scales=None,
            anchor_ratios=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=12000,
            rpn_pre_nms_top_n_test=6000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=300,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box parameters
            rcnn_bbox_bn=True,
            box_roi_pool=None,
            feat_head=None,
            box_predictor=None,
            box_score_thresh=0.0,
            box_nms_thresh=0.4,
            box_detections_per_img=300,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.1,
            box_batch_size_per_image=128,
            box_positive_fraction=0.5,
            bbox_reg_weights=None,
            # ReID parameters
            embedding_head=None,
            reid_loss=None):

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                'backbone should contain an attribute out_channels '
                'specifying the number of output channels (assumed to be the '
                'same for all the levels)')

        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError(
                    'num_classes should be None when box_predictor is specified'
                )
        else:
            if box_predictor is None:
                raise ValueError(
                    'num_classes should not be None when box_predictor '
                    'is not specified')

        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            if anchor_scales is None:
                anchor_scales = ((32, 64, 128, 256, 512), )
            if anchor_ratios is None:
                anchor_ratios = ((0.5, 1.0, 2.0), )
            rpn_anchor_generator = AnchorGenerator(anchor_scales,
                                                   anchor_ratios)

        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        rpn = self._set_rpn(rpn_anchor_generator, rpn_head, rpn_fg_iou_thresh,
                            rpn_bg_iou_thresh, rpn_batch_size_per_image,
                            rpn_positive_fraction, rpn_pre_nms_top_n,
                            rpn_post_nms_top_n, rpn_nms_thresh)

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(featmap_names=['feat_res4'],
                                              output_size=14,
                                              sampling_ratio=2)

        if feat_head is None:
            raise ValueError('feat_head should be specified manually.')

        if box_predictor is None:
            box_predictor = CoordRegressor(2048, num_classes, rcnn_bbox_bn)

        if embedding_head is None:
            embedding_head = NormAwareEmbeddingProj(
                featmap_names=['feat_res4', 'feat_res5'],
                in_channels=[1024, 2048],
                dim=256)

        if reid_loss is None:
            reid_loss = OIMLoss(256, num_pids, num_cq_size, 0.5, 30.0)

        roi_heads = self._set_roi_heads(
            embedding_head, reid_loss, box_roi_pool, feat_head, box_predictor,
            box_fg_iou_thresh, box_bg_iou_thresh, box_batch_size_per_image,
            box_positive_fraction, bbox_reg_weights, box_score_thresh,
            box_nms_thresh, box_detections_per_img)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)

        super(FasterRCNN_NormAware, self).__init__(backbone, rpn, roi_heads,
                                                   transform)
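The num_classes/box_predictor contract enforced at the top of this example amounts to "pass exactly one of the two"; restated as a standalone helper for clarity (the function name here is ours, not the library's):

def check_box_predictor_args(num_classes, box_predictor):
    # mirrors the validation above: exactly one of the two must be given
    if num_classes is not None and box_predictor is not None:
        raise ValueError('num_classes should be None when box_predictor is specified')
    if num_classes is None and box_predictor is None:
        raise ValueError('num_classes should not be None when box_predictor is not specified')

check_box_predictor_args(num_classes=2, box_predictor=None)  # passes silently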
Example #5
    def __init__(self, correlation_args, batch_norm, conv_channels,
                 n_box_channels, roi_output_size, avg_box_features,
                 hidden_size, input_length, n_layers, dropout,
                 correlation_only, use_env_features, fixed_env,
                 correlation_last_only, sum_lstm_layers, refine_correlation,
                 max_box_features, use_roi_align, use_pre_conv):
        super().__init__(correlation_args, batch_norm, conv_channels,
                         n_box_channels, roi_output_size, avg_box_features,
                         hidden_size, input_length, n_layers, dropout,
                         correlation_only, use_env_features, fixed_env,
                         correlation_last_only, sum_lstm_layers,
                         max_box_features)

        assert correlation_args['stride'] == 1

        self.refine_correlation = refine_correlation
        self.use_roi_align = use_roi_align
        self.use_pre_conv = use_pre_conv
        self.conv_reduce = conv(self.batch_norm,
                                correlation_args['patch_size']**2,
                                32,
                                kernel_size=1,
                                stride=1)

        self.roi_output_size_ext = roi_output_size + (
            (correlation_args['patch_size'] - 1) *
            correlation_args['dilation_patch'])
        self.roi_output_size_env = roi_output_size * 3
        self.roi_output_size_env_ext = self.roi_output_size_env + (
            (correlation_args['patch_size'] - 1) * correlation_args['dilation_patch'])

        self.roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                           output_size=self.roi_output_size,
                                           sampling_ratio=2)
        self.roi_pool_ext = MultiScaleRoIAlign(
            featmap_names=[0, 1, 2, 3],
            output_size=self.roi_output_size_ext,
            sampling_ratio=2)
        self.roi_pool_env_ext = MultiScaleRoIAlign(
            featmap_names=[0, 1, 2, 3],
            output_size=self.roi_output_size_env_ext,
            sampling_ratio=2)

        if self.fixed_env:
            locations_per_box = (1 if (self.avg_box_features or self.max_box_features)
                                 else (roi_output_size * 3) ** 2)
        else:
            locations_per_box = (1 if (self.avg_box_features or self.max_box_features)
                                 else roi_output_size ** 2)
        multiplier = 2 if self.use_env_features else 1
        self.input_size = 6 + (n_box_channels * locations_per_box * multiplier)

        if self.use_pre_conv:
            self.pre_conv = conv(self.batch_norm, 256, 128)

        # layers inspired by https://github.com/ClementPinard/FlowNetPytorch/blob/master/models/FlowNetC.py
        self.conv_redir = conv(self.batch_norm,
                               256,
                               32,
                               kernel_size=1,
                               stride=1)
        in_planes = self.correlation_args['patch_size'] ** 2 + (
            0 if self.correlation_only else 32)
        self.conv3_1 = conv(self.batch_norm, in_planes, self.conv_channels)
        self.conv4 = conv(self.batch_norm, self.conv_channels,
                          self.conv_channels)
        self.conv4_1 = conv(self.batch_norm, self.conv_channels,
                            self.n_box_channels)

        # recurrent layers
        self.encoder = nn.LSTM(self.input_size,
                               self.hidden_size,
                               batch_first=True,
                               num_layers=n_layers,
                               dropout=dropout)
        self.attn = nn.Linear(self.hidden_size + self.input_size,
                              self.input_length)
        self.attn_combine = nn.Linear(self.hidden_size + self.output_size,
                                      self.hidden_size)
        self.decoder = nn.LSTM(self.input_size,
                               self.hidden_size,
                               batch_first=True,
                               num_layers=n_layers,
                               dropout=dropout)
        self.linear = nn.Linear(
            self.hidden_size if self.sum_lstm_layers
            else self.hidden_size * self.n_layers,
            self.output_size)
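Worked values for the extended RoI sizes computed above, under assumed correlation settings (patch_size=9, dilation_patch=2) and roi_output_size=7:

roi_output_size = 7
patch_size, dilation_patch = 9, 2  # assumed correlation_args values

roi_output_size_ext = roi_output_size + (patch_size - 1) * dilation_patch
roi_output_size_env = roi_output_size * 3
roi_output_size_env_ext = roi_output_size_env + (patch_size - 1) * dilation_patch
print(roi_output_size_ext, roi_output_size_env, roi_output_size_env_ext)  # 23 21 37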
Example #6
    def __init__(self, backbone, num_classes=None,
                 # transform parameters
                 min_size=800, max_size=1333,      # min/max image sizes enforced when resizing during preprocessing
                 image_mean=None, image_std=None,  # mean and std used to normalize during preprocessing
                 # RPN parameters
                 rpn_anchor_generator=None, rpn_head=None,
                 rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,    # number of proposals kept before NMS in the RPN (ranked by score)
                 rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,  # number of proposals kept after NMS in the RPN
                 rpn_nms_thresh=0.7,  # IoU threshold used for NMS inside the RPN
                 rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,  # IoU thresholds for sampling positive/negative anchors for the RPN loss
                 rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,  # anchors sampled per image for the RPN loss, and the positive fraction
                 # Box parameters
                 box_roi_pool=None, box_head=None, box_predictor=None,
                 # drop low-scoring boxes; NMS threshold in Fast R-CNN; keep the top 100 detections by score
                 box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
                 box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,   # IoU thresholds for sampling positive/negative proposals for the Fast R-CNN loss
                 box_batch_size_per_image=512, box_positive_fraction=0.25,  # proposals sampled per image for the Fast R-CNN loss, and the positive fraction
                 bbox_reg_weights=None):
        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)"
            )

        assert isinstance(rpn_anchor_generator, (AnchorsGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError("num_classes should be None when box_predictor "
                                 "is specified")
        else:
            if box_predictor is None:
                raise ValueError("num_classes should not be None when box_predictor "
                                 "is not specified")

        # number of channels of the backbone feature maps
        out_channels = backbone.out_channels

        # if no anchor generator is supplied, build the default one for resnet50_fpn
        if rpn_anchor_generator is None:
            anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            rpn_anchor_generator = AnchorsGenerator(
                anchor_sizes, aspect_ratios
            )

        # build the RPN head that makes predictions at each sliding-window location
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels, rpn_anchor_generator.num_anchors_per_location()[0]
            )

        # defaults: rpn_pre_nms_top_n_train = 2000, rpn_pre_nms_top_n_test = 1000,
        # rpn_post_nms_top_n_train = 2000, rpn_post_nms_top_n_test = 1000
        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test)

        # assemble the full RPN
        rpn = RegionProposalNetwork(
            rpn_anchor_generator, rpn_head,
            rpn_fg_iou_thresh, rpn_bg_iou_thresh,
            rpn_batch_size_per_image, rpn_positive_fraction,
            rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

        # Multi-scale RoIAlign pooling
        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],  # feature maps to pool from
                output_size=[7, 7],
                sampling_ratio=2)

        # the two fully connected layers after RoI pooling in Fast R-CNN
        if box_head is None:
            resolution = box_roi_pool.output_size[0]  # 7 by default
            representation_size = 1024
            box_head = TwoMLPHead(
                out_channels * resolution ** 2,
                representation_size
            )

        # prediction layers on top of the box_head output
        if box_predictor is None:
            representation_size = 1024
            box_predictor = FastRCNNPredictor(
                representation_size,
                num_classes)

        # combine RoI pooling, box_head and box_predictor
        roi_heads = RoIHeads(
            # box
            box_roi_pool, box_head, box_predictor,
            box_fg_iou_thresh, box_bg_iou_thresh,
            box_batch_size_per_image, box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh, box_nms_thresh, box_detections_per_img)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]

        # normalizes, resizes and batches the input images
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)

        super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
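A minimal usage sketch for a detector built this way, assuming torchvision's stock resnet50-FPN backbone helper (its exact signature varies slightly across torchvision releases):

import torch
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

backbone = resnet_fpn_backbone('resnet50', pretrained=False)  # exposes .out_channels == 256
model = FasterRCNN(backbone, num_classes=91)
model.eval()
with torch.no_grad():
    detections = model([torch.rand(3, 600, 800)])
print(detections[0].keys())  # dict_keys(['boxes', 'labels', 'scores'])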
Example #7
def training(args):

    # Random seed
    random.seed(42)

    # Device setting
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data pre-setting
    dat = pd.read_csv(os.path.join(args.data_path, 'train_df.csv'))
    index_list = list(range(len(dat)))
    random.shuffle(index_list)
    valid_count = int(len(index_list) * args.split)
    train_df = dat.iloc[index_list[:-valid_count]]
    valid_df = dat.iloc[index_list[-valid_count:]]

    # Transform setting
    transforms_dict = {
        'train':
        A.Compose([
            A.ShiftScaleRotate(
                shift_limit=0.2, scale_limit=0.2, rotate_limit=30, p=0.3),
            A.HorizontalFlip(p=0.3),
            A.RGBShift(
                r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.3),
            A.RandomBrightnessContrast(p=0.3),
            ToTensorV2()
        ],
                  bbox_params=A.BboxParams(format='pascal_voc',
                                           label_fields=['labels']),
                  keypoint_params=A.KeypointParams(format='xy',
                                                   remove_invisible=False,
                                                   angle_in_degrees=True)),
        'valid':
        A.Compose([ToTensorV2()],
                  bbox_params=A.BboxParams(format='pascal_voc',
                                           label_fields=['labels']),
                  keypoint_params=A.KeypointParams(format='xy',
                                                   remove_invisible=False,
                                                   angle_in_degrees=True))
    }

    # PyTorch dataloader setting
    dataset_dict = {
        'train':
        KeypointDataset(os.path.join(args.data_path, 'train_imgs/'), train_df,
                        transforms_dict['train']),
        'valid':
        KeypointDataset(os.path.join(args.data_path, 'train_imgs/'), valid_df,
                        transforms_dict['valid']),
    }
    dataloader_dict = {
        'train':
        DataLoader(dataset_dict['train'],
                   batch_size=args.batch_size,
                   shuffle=True,
                   num_workers=args.num_workers,
                   collate_fn=collate_fn),
        'valid':
        DataLoader(dataset_dict['valid'],
                   batch_size=args.batch_size,
                   shuffle=True,
                   num_workers=args.num_workers,
                   collate_fn=collate_fn),
    }

    # Model setting
    backbone = resnet_fpn_backbone('resnet101', pretrained=True)
    roi_pooler = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                    output_size=7,
                                    sampling_ratio=2)

    keypoint_roi_pooler = MultiScaleRoIAlign(
        featmap_names=['0', '1', '2', '3'], output_size=14, sampling_ratio=2)
    model = KeypointRCNN(backbone,
                         num_classes=2,
                         num_keypoints=24,
                         box_roi_pool=roi_pooler,
                         keypoint_roi_pool=keypoint_roi_pooler)
    model = model.to(device)

    # Optimizer setting
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.w_decay)
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode='min',
                                  factor=0.1,
                                  patience=len(dataloader_dict['train']) / 1.5)

    # Resume
    start_epoch = 0
    if args.resume:
        print('resume!')
        checkpoint = torch.load(args.file_name, map_location='cpu')
        start_epoch = checkpoint['epoch'] + 1
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        model = model.to(device)

    # Train start

    best_val_rmse = None

    for epoch in range(start_epoch, args.num_epochs):
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            if phase == 'valid':
                print('Validation start...')
                model.eval()
                val_rmse = 0
            for i, (images, targets) in enumerate(tqdm(dataloader_dict[phase])):
                # Optimizer setting
                optimizer.zero_grad()

                # Input, output setting
                images = list(image.to(device) for image in images)
                targets = [{k: v.to(device)
                            for k, v in t.items()} for t in targets]

                with torch.set_grad_enabled(phase == 'train'):
                    losses = model(images, targets)
                    if phase == 'train':
                        loss = sum(loss for loss in losses.values())
                        loss.backward()
                        clip_grad_norm_(model.parameters(), args.grad_clip)
                        optimizer.step()

                        if (i + 1) % 100 == 0:
                            print(
                                f'| epoch: {epoch} | lr: {optimizer.param_groups[0]["lr"]} | loss: {loss.item():.4f}',
                                end=' | ')
                            for k, v in losses.items():
                                print(f'{k[5:]}: {v.item():.4f}', end=' | ')
                            print()
                    if phase == 'valid':
                        # in eval mode the model returns per-image predictions,
                        # not a loss dict; use a separate index to avoid
                        # shadowing the outer batch counter
                        for j, pred in enumerate(losses):
                            pred_ = pred['keypoints'][0][:, :2].detach().cpu().numpy().reshape(-1)
                            target_ = targets[j]['keypoints'][0][:, :2].cpu().numpy().reshape(-1)
                            val_rmse += np.sqrt(((pred_ - target_)**2).mean())

            if phase == 'valid':
                val_rmse /= len(dataloader_dict[phase])
                print(f'Validation RMSE: {val_rmse}')
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    print('Checkpoint saving...')
                    torch.save(
                        {
                            'epoch': epoch,
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                        }, args.file_name)
                    best_val_rmse = val_rmse
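The DataLoaders above rely on a custom collate_fn because detection batches hold variable-sized images and targets; the original helper is not shown here, but a common minimal version (an assumption) is:

def collate_fn(batch):
    # keep images and targets as tuples rather than stacking them into tensors
    return tuple(zip(*batch))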
Example #8
    def __init__(self,
                 out_channels,
                 num_classes,
                 input_mode,
                 acf_head,
                 fg_iou_thresh=0.5,
                 bg_iou_thresh=0.5,
                 batch_size_per_image=512,
                 positive_fraction=0.25,
                 bbox_reg_weights=None,
                 box_score_thresh=0.05,
                 box_nms_thresh=0.5,
                 box_detections_per_img=100):
        super(RoIHeadsExtend, self).__init__()

        self.in_channels = out_channels
        self.input_mode = input_mode
        self.score_thresh = box_score_thresh
        self.nms_thresh = box_nms_thresh
        self.detections_per_img = box_detections_per_img
        self.fg_iou_thresh = fg_iou_thresh
        self.bg_iou_thresh = bg_iou_thresh
        self.batch_size_per_image = batch_size_per_image
        self.positive_fraction = positive_fraction
        self.num_classes = num_classes

        # Detection
        self.box_similarity = box_ops.box_iou
        # assign ground-truth boxes for each proposal
        self.proposal_matcher = det_utils.Matcher(
            fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
            batch_size_per_image, positive_fraction)

        if bbox_reg_weights is None:
            bbox_reg_weights = (10., 10., 5., 5.)
        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

        self.box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                               output_size=7,
                                               sampling_ratio=2)

        representation_size = 1024
        resolution = self.box_roi_pool.output_size[0]
        self.box_head = TwoMLPHead(out_channels * resolution**2,
                                   representation_size)

        self.box_predictor = FastRCNNPredictor(representation_size,
                                               num_classes)

        # Segmentation
        self.shared_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                                  output_size=14,
                                                  sampling_ratio=2)
        resolution = self.shared_roi_pool.output_size[0]

        mask_layers = (256, 256, 256, 256, 256, 256, 256, 256)
        mask_dilation = 1
        self.mask_head = MaskRCNNHeads(out_channels, mask_layers,
                                       mask_dilation)

        mask_predictor_in_channels = 256  # == mask_layers[-1]
        mask_dim_reduced = 256
        self.mask_predictor = MaskRCNNPredictor(mask_predictor_in_channels,
                                                mask_dim_reduced, num_classes)

        self.with_paf_branch = True
        if self.with_paf_branch:
            self.paf_head = MaskRCNNHeads(out_channels, mask_layers,
                                          mask_dilation)
            self.paf_predictor = MaskRCNNPredictor(mask_predictor_in_channels,
                                                   mask_dim_reduced,
                                                   2 * (num_classes - 1))

        if self.input_mode == config.INPUT_RGBD:
            self.attention_block = ContextBlock(256, 2)
            self.global_feature_dim = 256
            self.with_3d_keypoints = True
            self.with_axis_keypoints = False
            self.regress_axis = False
            self.estimate_norm_vector = False
            if acf_head == 'endpoints':
                self.with_axis_keypoints = True
            elif acf_head == 'scatters':
                self.regress_axis = True
            elif acf_head == 'norm_vector':
                self.estimate_norm_vector = True
            else:
                print("Don't assign a vaild acf head")
                exit()
            keypoint_layers = (256, ) * 4
            self.keypoint_dim_reduced = keypoint_layers[-1]
            if self.with_3d_keypoints:
                self.vote_keypoint_head = Vote_Kpoints_head(
                    self.global_feature_dim, keypoint_layers, "conv2d")
                self.vote_keypoint_predictor = Vote_Kpoints_Predictor(
                    self.keypoint_dim_reduced, 3 * (num_classes - 1))
            if self.with_axis_keypoints:
                self.orientation_keypoint_head = Vote_Kpoints_head(
                    self.global_feature_dim, keypoint_layers, "conv2d")

                self.orientation_keypoint_predictor = Vote_Kpoints_Predictor(
                    self.keypoint_dim_reduced, 6 * (num_classes - 1))

            if self.regress_axis:
                self.axis_head = Vote_Kpoints_head(self.global_feature_dim,
                                                   keypoint_layers, "conv2d")
                self.axis_predictor = Vote_Kpoints_Predictor(
                    self.keypoint_dim_reduced, 4 * (num_classes - 1))

            if self.estimate_norm_vector:
                self.norm_vector_head = Vote_Kpoints_head(
                    self.global_feature_dim, keypoint_layers, "conv2d")
                self.norm_vector_predictor = Vote_Kpoints_Predictor(
                    self.keypoint_dim_reduced, 3 * (num_classes - 1))
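The predictor channel counts in this example follow a per-foreground-class layout; spelled out with an assumed num_classes of 11 (10 foreground classes plus background):

num_classes = 11  # assumed value
paf_channels = 2 * (num_classes - 1)       # one (x, y) vector field per foreground class
vote_kpt_channels = 3 * (num_classes - 1)  # one 3D offset per foreground class
axis_kpt_channels = 6 * (num_classes - 1)  # two 3D endpoints per foreground class
print(paf_channels, vote_kpt_channels, axis_kpt_channels)  # 20 30 60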
Example #9
    def __init__(
            self,
            backbone,
            num_ID,
            num_classes=2,
            len_embeddings=128,
            # transform parameters
            min_size=720,
            max_size=960,
            image_mean=None,
            image_std=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=2000,
            rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.5,
            rpn_bg_iou_thresh=0.4,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box parameters
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=512,
            box_positive_fraction=0.25,
            bbox_reg_weights=None):

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")

        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            anchor_sizes = ((16, 22), (32, 45), (64, 90), (128, 181),
                            (256, 362))
            aspect_ratios = ((1 / 3, ), ) * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                    rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                    rpn_batch_size_per_image,
                                    rpn_positive_fraction, rpn_pre_nms_top_n,
                                    rpn_post_nms_top_n, rpn_nms_thresh)

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                              output_size=7,
                                              sampling_ratio=2)

        if box_head is None:
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            box_head = TwoMLPHead(out_channels * resolution**2,
                                  representation_size)

        emb_scale = math.sqrt(2) * math.log(num_ID - 1) if num_ID > 1 else 1

        if box_predictor is None:
            representation_size = 1024
            box_predictor = JDEPredictor(representation_size, num_classes,
                                         len_embeddings, emb_scale)

        roi_heads = JDE_RoIHeads(
            # Box
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img,
            len_embeddings,
            num_ID)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)

        super(Jde_RCNN, self).__init__(backbone, rpn, roi_heads, transform)
        self.eval_embed = False
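A worked value for the embedding scale used above, with an illustrative identity count:

import math

num_ID = 1000  # illustrative number of identities
emb_scale = math.sqrt(2) * math.log(num_ID - 1) if num_ID > 1 else 1
print(round(emb_scale, 2))  # 9.77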
Example #10
    def __init__(self, config):
        super(Mask_Rcnn, self).__init__()
        self.config = config
        self.Mean = torch.tensor(config.Mean, dtype=torch.float32)
        self.anchors = []
        self.num_anchor = []
        for i in range(5):
            self.num_anchor.append(
                len(config.anchor_scales[i]) * len(config.anchor_ratios[i]))
            stride = 4 * 2**i
            print(stride, self.config.anchor_scales[i],
                  self.config.anchor_ratios[i])
            anchors = get_anchors(np.ceil(self.config.img_max / stride + 1),
                                  self.config.anchor_scales[i],
                                  self.config.anchor_ratios[i],
                                  stride=stride)
            print(anchors.shape)
            self.anchors.append(anchors)
        self.ATC = AnchorTargetCreator(
            n_sample=config.rpn_n_sample,
            pos_iou_thresh=config.rpn_pos_iou_thresh,
            neg_iou_thresh=config.rpn_neg_iou_thresh,
            pos_ratio=config.rpn_pos_ratio)
        self.PC = ProposalCreator(nms_thresh=config.roi_nms_thresh,
                                  n_train_pre_nms=config.roi_train_pre_nms,
                                  n_train_post_nms=config.roi_train_post_nms,
                                  n_test_pre_nms=config.roi_test_pre_nms,
                                  n_test_post_nms=config.roi_test_post_nms,
                                  min_size=config.roi_min_size)
        self.PTC_1 = ProposalTargetCreator_box(
            n_sample=config.fast_n_sample,
            pos_ratio=config.fast_pos_ratio,
            pos_iou_thresh=config.fast_pos_iou_thresh,
            neg_iou_thresh_hi=config.fast_neg_iou_thresh_hi,
            neg_iou_thresh_lo=config.fast_neg_iou_thresh_lo)
        self.PTC_2 = ProposalTargetCreator_box(
            n_sample=config.fast_n_sample,
            pos_ratio=config.fast_pos_ratio,
            pos_iou_thresh=0.6,
            neg_iou_thresh_hi=0.6,
            neg_iou_thresh_lo=config.fast_neg_iou_thresh_lo)
        self.PTC = ProposalTargetCreator(
            n_sample=config.fast_n_sample,
            pos_ratio=config.fast_pos_ratio,
            pos_iou_thresh=0.7,
            neg_iou_thresh_hi=0.7,
            neg_iou_thresh_lo=config.fast_neg_iou_thresh_lo)

        self.features = resnet50()
        self.fpn = FPN_net([256, 512, 1024, 2048],
                           256,
                           extra_blocks=LastLevelMaxPool())
        self.rpn = RPN_net(256, self.num_anchor[0])
        self.roialign_7 = MultiScaleRoIAlign(
            ['feat0', 'feat1', 'feat2', 'feat3'], 7, 2)
        self.roialign_14 = MultiScaleRoIAlign(
            ['feat0', 'feat1', 'feat2', 'feat3'], 14, 2)
        # self.roialign_28 = RoIAlign((28, 28), 1.0, 2)
        self.fast = Fast_net(config.num_cls, 256 * 7 * 7, 1024)
        self.fast_2 = Fast_net(config.num_cls, 256 * 7 * 7, 1024)
        self.fast_3 = Fast_net(config.num_cls, 256 * 7 * 7, 1024)
        self.mask_net = Mask_net(256, config.num_cls)
        self.a = 0
        self.b = 0
        self.c = 0
        self.d = 0
        self.fast_num = 0
        self.fast_num_P = 0

        self.loc_std1 = [1. / 10, 1. / 10, 1. / 5, 1. / 5]
        self.loc_std2 = [1. / 20, 1. / 20, 1. / 10, 1. / 10]
        self.loc_std3 = [1. / 30, 1. / 30, 1. / 15, 1. / 15]
        self.loss_weights = [1.0, 0.5, 0.25]
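The loop above walks five FPN levels with a stride that doubles at each level; the resulting strides and, for an assumed img_max of 1024, the per-level anchor grid sizes:

import numpy as np

img_max = 1024  # assumed maximum image side
for i in range(5):
    stride = 4 * 2 ** i
    grid = int(np.ceil(img_max / stride + 1))
    print(stride, grid)  # strides 4, 8, 16, 32, 64 -> grids 257, 129, 65, 33, 17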
Example #11
    def __init__(
            self,
            object_to_action,
            human_idx,
            # Backbone parameters
            backbone_name="resnet50",
            pretrained=True,
            # Pooler parameters
            output_size=7,
            sampling_ratio=2,
            # Box pair head parameters
            node_encoding_size=1024,
            representation_size=1024,
            num_classes=117,
            fg_iou_thresh=0.5,
            num_iterations=1,
            # Transformation parameters
            min_size=800,
            max_size=1333,
            image_mean=None,
            image_std=None,
            postprocess=True,
            # Preprocessing parameters
            box_nms_thresh=0.5,
            max_human=15,
            max_object=15):

        backbone = models.fasterrcnn_resnet_fpn(backbone_name,
                                                pretrained=pretrained).backbone

        box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                          output_size=output_size,
                                          sampling_ratio=sampling_ratio)

        box_pair_head = GraphHead(
            out_channels=backbone.out_channels,
            roi_pool_size=output_size,
            node_encoding_size=node_encoding_size,
            representation_size=representation_size,
            num_cls=num_classes,
            human_idx=human_idx,
            object_class_to_target_class=object_to_action,
            fg_iou_thresh=fg_iou_thresh,
            num_iter=num_iterations)

        box_pair_predictor = nn.Linear(representation_size * 2, num_classes)

        interaction_head = InteractionHead(
            box_roi_pool=box_roi_pool,
            box_pair_head=box_pair_head,
            box_pair_predictor=box_pair_predictor,
            num_classes=num_classes,
            human_idx=human_idx,
            box_nms_thresh=box_nms_thresh,
            max_human=max_human,
            max_object=max_object)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = HOINetworkTransform(min_size, max_size, image_mean,
                                        image_std)

        super().__init__(backbone, interaction_head, transform, postprocess)
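Note that this example passes integer featmap_names, which matches older torchvision releases; newer releases expect string keys. A version-safe equivalent of the pooler above (output_size shown with its default of 7):

from torchvision.ops import MultiScaleRoIAlign

box_roi_pool = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                  output_size=7,
                                  sampling_ratio=2)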
Example #12
    def __init__(
            self,
            backbone,
            num_classes=2,
            # Faster and Mask R-CNN
            min_size=512,
            max_size=512,
            image_mean=None,
            image_std=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=400,
            rpn_pre_nms_top_n_test=400,
            rpn_post_nms_top_n_train=200,
            rpn_post_nms_top_n_test=200,
            rpn_nms_thresh=0.75,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.75,
            # Box parameters
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.75,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=256,
            box_positive_fraction=0.75,
            bbox_reg_weights=None,
            # Mask parameters
            mask_roi_pool=None,
            mask_head=None,
            mask_predictor=None,
            # Alex - SSM
            box_score_thresh_classifier=-0.01,
            box_nms_thresh_classifier=0.25,
            box_detections_per_img_s2new=8,
            # Alex - Mask+Box Features extractor,
            box_pool_s2=None,
            box_head_s2=None,
            box_predictor_s2=None,
            mask_pool_s2=None,
            mask_head_s2=None,
            mask_predictor_s2=None,
            # Alex - Affinity model
            x_stages=3,
            num_classes_img=3,
            sieve_layer=None,
            s2classifier=None,
            num_affinities=256,
            affinity=None,
            s2new_classifier=None,
            **kwargs):

        out_channels = backbone.out_channels
        # Mask features branch

        # Classification branch
        if box_pool_s2 is None:
            box_pool_s2 = MultiScaleRoIAlign(
                # single feature map
                featmap_names=['0'],
                output_size=7,
                sampling_ratio=2)

        if box_head_s2 is None:
            resolution = box_pool_s2.output_size[0]
            representation_size = 128
            box_head_s2 = TwoMLPHead(out_channels * resolution**2,
                                     representation_size)

        if box_predictor_s2 is None:
            representation_size = 128
            box_predictor_s2 = FastRCNNPredictor(representation_size,
                                                 num_classes)

        if mask_pool_s2 is None:
            mask_pool_s2 = MultiScaleRoIAlign(
                # Alex: the key of the feature map
                featmap_names=['0'],
                output_size=14,
                sampling_ratio=2)

        mask_dilation = 1  # defined unconditionally; also used by the mask predictor below
        if mask_head_s2 is None:
            mask_layers = (out_channels, )
            mask_head_s2 = MaskRCNNHeads(out_channels, mask_layers,
                                         mask_dilation)

        # add mask predictor: upsample+bn+relu
        if mask_predictor_s2 is None:
            in_channels = mask_head_s2[-2].out_channels
            out_channels = in_channels
            mask_predictor_s2 = MaskRCNNPredictorTruncated(
                in_channels, out_channels, mask_dilation)

        # Affinity layer
        num_feature_maps = mask_predictor_s2.conv_reduce.out_channels
        num_reduce_feature_maps = int(num_feature_maps / 2)
        if sieve_layer is None:
            sieve_layer = MaskFeaturesSieve(
                num_feature_maps=num_feature_maps,
                num_reduce_feature_maps=num_reduce_feature_maps,
                h=28,
                w=28,
                apply_linearity=False,
                final=False)
        # build the affinity layer unconditionally so it is defined even when
        # a sieve_layer is passed in
        affinity_layer = AffinityLayer(
            sieve_layer,
            affinity_matrix_size=box_detections_per_img_s2new,
            x_stages=x_stages,
            num_features=num_feature_maps,
            num_affinities=num_affinities)

        # Image classification branch
        if s2classifier is None:
            s2classifier = ImageClassificationLayerFromMaskFeatures(
                affinity_feature_size=num_feature_maps,
                num_classes_img=num_classes_img)
        # instantiate Mask R-CNN:
        # the affinity and image classification modules will be passed to the Generalized RCNN
        kwargs.update(affinity=affinity_layer, s2new_classifier=s2classifier)
        super(AffinityModel, self).__init__(
            backbone,
            num_classes,
            # transform parameters
            min_size,
            max_size,
            image_mean,
            image_std,
            # RPN parameters
            rpn_anchor_generator,
            rpn_head,
            rpn_pre_nms_top_n_train,
            rpn_pre_nms_top_n_test,
            rpn_post_nms_top_n_train,
            rpn_post_nms_top_n_test,
            rpn_nms_thresh,
            rpn_fg_iou_thresh,
            rpn_bg_iou_thresh,
            rpn_batch_size_per_image,
            rpn_positive_fraction,
            # Box parameters
            box_roi_pool,
            box_head,
            box_predictor,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            # Mask parameters
            mask_roi_pool=None,
            mask_head=None,
            mask_predictor=None,
            **kwargs)
        # Alex - SSM
        #
        self.roi_heads.score_thresh_classifier = box_score_thresh_classifier
        self.roi_heads.nms_thresh_classifier = box_nms_thresh_classifier
        self.roi_heads.detections_per_img_s2new = box_detections_per_img_s2new
        #
        #
        self.roi_heads.box_pool_s2 = box_pool_s2
        self.roi_heads.box_head_s2 = box_head_s2
        self.roi_heads.box_predictor_s2 = box_predictor_s2
        #
        # Alex - Mask Features extractor,
        self.roi_heads.mask_pool_s2 = mask_pool_s2
        self.roi_heads.mask_head_s2 = mask_head_s2
        self.roi_heads.mask_predictor_s2 = mask_predictor_s2
Example #13
    def __init__(
        self,
        config,
        # backbone,
        # neck,
        # head,
        min_size=800,
        max_size=1333,
        preserve_aspect_ratio=True,
        rpn_anchor_generator=None,
        rpn_head=None,
        rpn_pre_nms_top_n_train=2000,
        rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000,
        rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7,
        rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256,
        rpn_positive_fraction=0.5,
        rpn_score_thresh=0.0,
        # Box parameters
        box_roi_pool=None,
        box_head=None,
        box_predictor=None,
        box_score_thresh=0.05,
        box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5,
        box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512,
        box_positive_fraction=0.25,
        bbox_reg_weights=None,
        anchor_sizes=(32, 64, 128, 256, 512),
        aspect_ratios=(0.5, 1.0, 2.0)
    ) -> None:
        super().__init__(config)
        self.num_classes = 91  # 91 category ids, as in COCO-trained torchvision detectors
        self.preserve_aspect_ratio = preserve_aspect_ratio

        self.transform = RcnnTransform(min_size, max_size, None, None)
        self.backbone = resnet50()
        self.neck = FasterRcnnNeck(config, self.backbone.channels)

        anchor_sizes = tuple((anchor,) for anchor in anchor_sizes)
        aspect_ratios = (aspect_ratios,) * len(anchor_sizes)
        print(anchor_sizes)
        print(aspect_ratios)

        rpn_anchor_generator = AnchorGenerator(
            anchor_sizes, aspect_ratios
        )

        out_channels = self.neck.channels[-1]
        rpn_head = RpnHead(
                out_channels, rpn_anchor_generator.num_anchors_per_location()[0]
        )

        rpn_pre_nms_top_n = {
            'training': rpn_pre_nms_top_n_train, 'testing': rpn_pre_nms_top_n_test
        }
        rpn_post_nms_top_n = {
            'training': rpn_post_nms_top_n_train, 'testing': rpn_post_nms_top_n_test
        }

        self.rpn = RegionProposalNetwork(
            rpn_anchor_generator,
            rpn_head,
            rpn_fg_iou_thresh,
            rpn_bg_iou_thresh,
            rpn_batch_size_per_image,
            rpn_positive_fraction,
            rpn_pre_nms_top_n,
            rpn_post_nms_top_n,
            rpn_nms_thresh,
            score_thresh=rpn_score_thresh
        )

        box_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],
                output_size=7,
                sampling_ratio=2
        )

        resolution = box_roi_pool.output_size[0]
        representation_size = 1024

        box_head = LinearHead(
            out_channels * resolution ** 2,
            representation_size
        )

        representation_size = 1024
        box_predictor = FastRcnnPredictHead(
            representation_size,
            self.num_classes
        )

        self.roi_heads = RoiHeads(
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img
        )
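What the anchor reshaping near the top of this example produces, spelled out with its default inputs:

anchor_sizes = (32, 64, 128, 256, 512)
aspect_ratios = (0.5, 1.0, 2.0)

anchor_sizes = tuple((anchor,) for anchor in anchor_sizes)
aspect_ratios = (aspect_ratios,) * len(anchor_sizes)
print(anchor_sizes)   # ((32,), (64,), (128,), (256,), (512,))
print(aspect_ratios)  # ((0.5, 1.0, 2.0),) repeated five times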
Example #14
    def __init__(
            self,
            object_to_action: List[list],
            human_idx: int,
            # Backbone parameters
            backbone_name: str = "resnet50",
            pretrained: bool = True,
            # Pooler parameters
            output_size: int = 7,
            sampling_ratio: int = 2,
            # Box pair head parameters
            node_encoding_size: int = 1024,
            representation_size: int = 1024,
            num_classes: int = 117,
            box_score_thresh: float = 0.2,
            fg_iou_thresh: float = 0.5,
            num_iterations: int = 2,
            distributed: bool = False,
            # Transformation parameters
            min_size: int = 800,
            max_size: int = 1333,
            image_mean: Optional[List[float]] = None,
            image_std: Optional[List[float]] = None,
            postprocess: bool = True,
            # Preprocessing parameters
            box_nms_thresh: float = 0.5,
            max_human: int = 15,
            max_object: int = 15) -> None:

        detector = models.fasterrcnn_resnet_fpn(backbone_name,
                                                pretrained=pretrained)
        backbone = detector.backbone

        box_roi_pool = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                          output_size=output_size,
                                          sampling_ratio=sampling_ratio)

        box_pair_head = GraphHead(
            out_channels=backbone.out_channels,
            roi_pool_size=output_size,
            node_encoding_size=node_encoding_size,
            representation_size=representation_size,
            num_cls=num_classes,
            human_idx=human_idx,
            object_class_to_target_class=object_to_action,
            fg_iou_thresh=fg_iou_thresh,
            num_iter=num_iterations)

        box_pair_predictor = nn.Linear(representation_size * 2, num_classes)
        box_pair_suppressor = nn.Linear(representation_size * 2, 1)

        interaction_head = InteractionHead(
            box_roi_pool=box_roi_pool,
            box_pair_head=box_pair_head,
            box_pair_suppressor=box_pair_suppressor,
            box_pair_predictor=box_pair_predictor,
            num_classes=num_classes,
            human_idx=human_idx,
            box_nms_thresh=box_nms_thresh,
            box_score_thresh=box_score_thresh,
            max_human=max_human,
            max_object=max_object,
            distributed=distributed)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = HOINetworkTransform(min_size, max_size, image_mean,
                                        image_std)

        super().__init__(backbone, interaction_head, transform, postprocess)
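The box_roi_pool built here is the stock torchvision MultiScaleRoIAlign; a self-contained sketch of how it maps a feature dict plus boxes to fixed-size crops (all tensor shapes below are illustrative):

import torch
from torchvision.ops import MultiScaleRoIAlign

pooler = MultiScaleRoIAlign(featmap_names=['0', '1'], output_size=7, sampling_ratio=2)
features = {'0': torch.rand(1, 256, 64, 64), '1': torch.rand(1, 256, 32, 32)}
boxes = [torch.tensor([[10.0, 10.0, 50.0, 50.0]])]  # one xyxy box for the one image
image_sizes = [(512, 512)]                          # original (H, W) of each image
crops = pooler(features, boxes, image_sizes)
print(crops.shape)                                  # torch.Size([1, 256, 7, 7])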
Example #15
    def __init__(
            self,
            backbone,
            num_classes=None,
            # transform parameters
            min_size=800,
            max_size=1333,
            image_mean=None,
            image_std=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=2000,
            rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box parameters
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=512,
            box_positive_fraction=0.25,
            bbox_reg_weights=None):

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")

        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError(
                    "num_classes should be None when box_predictor is specified"
                )
        else:
            if box_predictor is None:
                raise ValueError(
                    "num_classes should not be None when box_predictor "
                    "is not specified")

        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            anchor_sizes = ((32, ), (64, ), (128, ), (256, ), (512, ))
            aspect_ratios = ((0.5, 1.0, 2.0), ) * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                    rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                    rpn_batch_size_per_image,
                                    rpn_positive_fraction, rpn_pre_nms_top_n,
                                    rpn_post_nms_top_n, rpn_nms_thresh)

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],
                output_size=7,
                sampling_ratio=2)

        if box_head is None:
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            box_head = TwoMLPHead(out_channels * resolution**2,
                                  representation_size)

        if box_predictor is None:
            representation_size = 1024
            box_predictor = FastRCNNPredictor(representation_size, num_classes)

        roi_heads = RoIHeads(
            # Box
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)

        super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
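A minimal usage sketch for this constructor, pairing it with the standard resnet_fpn_backbone helper (the pretrained/weights argument name varies across torchvision releases):

import torch
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torchvision.models.detection.faster_rcnn import FasterRCNN

backbone = resnet_fpn_backbone('resnet50', pretrained=False)  # exposes .out_channels
model = FasterRCNN(backbone, num_classes=91)
model.eval()
with torch.no_grad():
    predictions = model([torch.rand(3, 480, 640)])  # list of CHW images
print(predictions[0].keys())  # dict_keys(['boxes', 'labels', 'scores'])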
Example #16
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure that all args are properly defaulted (in case there are any new ones)
        base_architecture(args)

        rpn_anchor_generator = task.rpn_anchor_generator
        rpn_head = task.rpn_head
        box_roi_pool = task.box_roi_pool
        box_predictor = task.box_predictor
        box_head = task.box_head

        # setup backbone
        backbone = resnet_fpn_backbone(args.backbone, args.backbone_pretrained)

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)"
            )

        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        if task.num_classes > 0:
            if box_predictor is not None:
                raise ValueError("num_classes should be -1 when box_predictor is specified")
        else:
            if box_predictor is None:
                raise ValueError("num_classes should be > 0 when box_predictor is not specified")

        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0],
            )

        rpn_pre_nms_top_n = dict(training=args.rpn_pre_nms_top_n_train, testing=args.rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=args.rpn_post_nms_top_n_train, testing=args.rpn_post_nms_top_n_test)

        rpn = RPN(
            rpn_anchor_generator, rpn_head,
            args.rpn_fg_iou_thresh, args.rpn_bg_iou_thresh,
            args.rpn_batch_size_per_image, args.rpn_positive_fraction,
            rpn_pre_nms_top_n, rpn_post_nms_top_n, args.rpn_nms_thresh,
        )

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],  # string keys, matching the FPN feature dict
                output_size=7,
                sampling_ratio=2,
            )

        if box_head is None:
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            box_head = TwoMLPHead(
                out_channels * resolution ** 2,
                representation_size,
            )

        if box_predictor is None:
            representation_size = 1024
            box_predictor = FastRCNNPredictor(
                representation_size,
                task.num_classes,
            )

        roi_heads = RegionOfInterestHeads(
            # Box
            box_roi_pool, box_head, box_predictor,
            args.box_fg_iou_thresh, args.box_bg_iou_thresh,
            args.box_batch_size_per_image, args.box_positive_fraction,
            args.bbox_reg_weights, args.box_score_thresh,
            args.box_nms_thresh, args.box_detections_per_img,
        )

        if args.image_mean is None:
            args.image_mean = [0.485, 0.456, 0.406]
        if args.image_std is None:
            args.image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(
            args.min_size, args.max_size,
            args.image_mean, args.image_std,
        )

        return cls(backbone, rpn, roi_heads, transform)
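TwoMLPHead and FastRCNNPredictor above are the stock torchvision box head and predictor; a small shape-check sketch with illustrative sizes:

import torch
from torchvision.models.detection.faster_rcnn import TwoMLPHead, FastRCNNPredictor

out_channels, resolution, representation_size = 256, 7, 1024
box_head = TwoMLPHead(out_channels * resolution ** 2, representation_size)
box_predictor = FastRCNNPredictor(representation_size, num_classes=91)

pooled = torch.rand(8, out_channels, resolution, resolution)  # 8 pooled RoIs
box_features = box_head(pooled)                               # flattened -> (8, 1024)
class_logits, box_deltas = box_predictor(box_features)
print(class_logits.shape, box_deltas.shape)                   # (8, 91) and (8, 364)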
Example #17
    def __init__(
            self,
            backbone,
            num_classes=None,
            # transform parameters
            min_size=800,
            max_size=1333,
            image_mean=None,
            image_std=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=2000,
            rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box parameters
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=512,
            box_positive_fraction=0.25,
            bbox_reg_weights=None,
            # Mask parameters
            mask_roi_pool=None,
            mask_head=None,
            mask_predictor=None):

        assert isinstance(mask_roi_pool, (MultiScaleRoIAlign, type(None)))

        if num_classes is not None:
            if mask_predictor is not None:
                raise ValueError(
                    "num_classes should be None when mask_predictor is specified"
                )

        out_channels = backbone.out_channels

        if mask_roi_pool is None:
            mask_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],
                output_size=14,
                sampling_ratio=2)

        if mask_head is None:
            mask_layers = (256, 256, 256, 256)
            mask_dilation = 1
            mask_head = MaskRCNNHeads(out_channels, mask_layers, mask_dilation)

        if mask_predictor is None:
            mask_predictor_in_channels = 256  # == mask_layers[-1]
            mask_dim_reduced = 256
            mask_predictor = MaskRCNNPredictor(mask_predictor_in_channels,
                                               mask_dim_reduced, num_classes)

        super(MaskRCNNIA, self).__init__(
            backbone,
            num_classes,
            # transform parameters
            min_size,
            max_size,
            image_mean,
            image_std,
            # RPN-specific parameters
            rpn_anchor_generator,
            rpn_head,
            rpn_pre_nms_top_n_train,
            rpn_pre_nms_top_n_test,
            rpn_post_nms_top_n_train,
            rpn_post_nms_top_n_test,
            rpn_nms_thresh,
            rpn_fg_iou_thresh,
            rpn_bg_iou_thresh,
            rpn_batch_size_per_image,
            rpn_positive_fraction,
            # Box parameters
            box_roi_pool,
            box_head,
            box_predictor,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights)

        self.roi_heads.mask_roi_pool = mask_roi_pool
        self.roi_heads.mask_head = mask_head
        self.roi_heads.mask_predictor = mask_predictor
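MaskRCNNHeads and MaskRCNNPredictor come from torchvision.models.detection.mask_rcnn; a short sketch of the mask branch's shapes under the defaults chosen above (91 classes is assumed purely for illustration):

import torch
from torchvision.models.detection.mask_rcnn import MaskRCNNHeads, MaskRCNNPredictor

mask_head = MaskRCNNHeads(in_channels=256, layers=(256, 256, 256, 256), dilation=1)
mask_predictor = MaskRCNNPredictor(in_channels=256, dim_reduced=256, num_classes=91)

roi_features = torch.rand(4, 256, 14, 14)  # 4 RoIs from the 14x14 mask pooler
x = mask_head(roi_features)                # stacked 3x3 convs -> (4, 256, 14, 14)
mask_logits = mask_predictor(x)            # deconv upsamples -> (4, 91, 28, 28)
print(mask_logits.shape)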
Example #18
    def __init__(
            self,
            num_classes,
            # re-ID
            num_train_pids,
            cls_type="",
            in_level=["C5"],
            # Transform
            min_size=800,
            max_size=1333,
            image_mean=None,
            image_std=None,
            # RPN
            rpn_pre_nms_top_n_train=2000,
            rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.7,
            rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box
            box_score_thresh=0.05,
            box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=512,
            box_positive_fraction=0.25,
            bbox_reg_weights=None,
            # Misc
            eval_gt=False,
            display=False,
            cws=False):

        super(BSL, self).__init__()
        # ------- Backbone -------
        base_model, top_model = _split_backbone(backbone_name='resnet50',
                                                conv5_stride=2)
        return_layers = {
            'conv1': "C1",
            'conv2': "C2",
            'conv3': "C3",
            'conv4_3': "C4",
        }
        self.backbone = DetectorBackbone(base_model, return_layers)

        # ------- RPN -------
        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)
        rpn_kwargs = [
            rpn_fg_iou_thresh, rpn_bg_iou_thresh, rpn_batch_size_per_image,
            rpn_positive_fraction, rpn_pre_nms_top_n, rpn_post_nms_top_n,
            rpn_nms_thresh
        ]
        rpn_anchor_generator = AnchorGenerator(sizes=((8, 16, 32), ),
                                               aspect_ratios=((1, 2), ))
        self.RPN = RegionProposalNetwork(
            rpn_anchor_generator,
            RPNHead(1024,
                    rpn_anchor_generator.num_anchors_per_location()[0]),
            *rpn_kwargs)

        # ------- R-CNN -------
        roi_align = MultiScaleRoIAlign(featmap_names=["C4"],
                                       output_size=(14, 7),
                                       sampling_ratio=0)
        resolution_h, resolution_w = roi_align.output_size

        box_emb = EmbDet(1024, 256, resolutions=[resolution_h, resolution_w])
        box_predictor = FastRCNNPredictor(box_emb.representation_size,
                                          num_classes)

        box_kwargs = [
            # Faster R-CNN training
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            # Faster R-CNN inference
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img
        ]
        self.RCNN = RCNN(roi_align, box_emb, box_predictor, *box_kwargs)
        self.RCNN.cws = cws

        # ------- re-ID -------
        out_channels = 256
        in_ch_list = [2048, 1024, 512, 256, 256][:len(in_level)][::-1]
        reid_emb = EmbedReID(top_model,
                             roi_align,
                             featmap_names=in_level,
                             in_ch_list=in_ch_list,
                             out_ch=out_channels)
        reid_crit = nn.ModuleDict()
        for name, in_ch in zip(in_level, in_ch_list):
            reid_crit[name] = CriterionReID(cls_type, in_ch, num_train_pids)

        self.reid_head = ReIDHead(
            reid_emb,
            reid_crit,
            # PK sampling
            n_roi_per_gt=4,
            fg_iou_thresh=0.5)

        # -------- Others -------
        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]  # NOTE: RGB order is given here
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]

        self.transform = GeneralizedRCNNTransform(min_size, max_size,
                                                  image_mean, image_std)
        self.eval_gt = eval_gt
        self.display = display
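The GeneralizedRCNNTransform instantiated at the end of this constructor handles resizing, normalization, and padding; a minimal sketch of what it returns:

import torch
from torchvision.models.detection.transform import GeneralizedRCNNTransform

transform = GeneralizedRCNNTransform(min_size=800, max_size=1333,
                                     image_mean=[0.485, 0.456, 0.406],
                                     image_std=[0.229, 0.224, 0.225])
images, targets = transform([torch.rand(3, 600, 900)])  # targets stays None here
print(images.tensors.shape)   # normalized, resized, padded batch tensor
print(images.image_sizes)     # per-image sizes after resize, before padding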
Example #19
    def __init__(
        self,
        backbone,
        num_classes=None,
        # transform parameters
        min_size=None,
        max_size=1333,
        image_mean=None,
        image_std=None,
        # RPN parameters
        rpn_anchor_generator=None,
        rpn_head=None,
        rpn_pre_nms_top_n_train=2000,
        rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000,
        rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7,
        rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256,
        rpn_positive_fraction=0.5,
        rpn_score_thresh=0.0,
        # Box parameters
        box_roi_pool=None,
        box_head=None,
        box_predictor=None,
        box_score_thresh=0.05,
        box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5,
        box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512,
        box_positive_fraction=0.25,
        bbox_reg_weights=None,
        # keypoint parameters
        keypoint_roi_pool=None,
        keypoint_head=None,
        keypoint_predictor=None,
        num_keypoints=None,
    ):

        if not isinstance(keypoint_roi_pool, (MultiScaleRoIAlign, type(None))):
            raise TypeError(
                "keypoint_roi_pool should be of type MultiScaleRoIAlign or None instead of {type(keypoint_roi_pool)}"
            )
        if min_size is None:
            min_size = (640, 672, 704, 736, 768, 800)

        if num_keypoints is not None:
            if keypoint_predictor is not None:
                raise ValueError(
                    "num_keypoints should be None when keypoint_predictor is specified"
                )
        else:
            num_keypoints = 17

        out_channels = backbone.out_channels

        if keypoint_roi_pool is None:
            keypoint_roi_pool = MultiScaleRoIAlign(
                featmap_names=["0", "1", "2", "3"],
                output_size=14,
                sampling_ratio=2)

        if keypoint_head is None:
            keypoint_layers = tuple(512 for _ in range(8))
            keypoint_head = KeypointRCNNHeads(out_channels, keypoint_layers)

        if keypoint_predictor is None:
            keypoint_dim_reduced = 512  # == keypoint_layers[-1]
            keypoint_predictor = KeypointRCNNPredictor(keypoint_dim_reduced,
                                                       num_keypoints)

        super().__init__(
            backbone,
            num_classes,
            # transform parameters
            min_size,
            max_size,
            image_mean,
            image_std,
            # RPN-specific parameters
            rpn_anchor_generator,
            rpn_head,
            rpn_pre_nms_top_n_train,
            rpn_pre_nms_top_n_test,
            rpn_post_nms_top_n_train,
            rpn_post_nms_top_n_test,
            rpn_nms_thresh,
            rpn_fg_iou_thresh,
            rpn_bg_iou_thresh,
            rpn_batch_size_per_image,
            rpn_positive_fraction,
            rpn_score_thresh,
            # Box parameters
            box_roi_pool,
            box_head,
            box_predictor,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
        )

        self.roi_heads.keypoint_roi_pool = keypoint_roi_pool
        self.roi_heads.keypoint_head = keypoint_head
        self.roi_heads.keypoint_predictor = keypoint_predictor
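A minimal usage sketch for the keypoint variant, mirroring the Faster R-CNN example earlier (same backbone-helper version caveats; 17 keypoints is the COCO default):

import torch
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torchvision.models.detection.keypoint_rcnn import KeypointRCNN

backbone = resnet_fpn_backbone('resnet50', pretrained=False)
model = KeypointRCNN(backbone, num_classes=2, num_keypoints=17)  # person vs background
model.eval()
with torch.no_grad():
    out = model([torch.rand(3, 480, 640)])
print(out[0].keys())  # boxes, labels, scores, keypoints, keypoints_scores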
Example #20
    def __init__(
            self,
            backbone,
            num_ID,
            num_classes=2,
            version='v1',
            # transform parameters
            min_size=800,
            max_size=1333,
            image_mean=None,
            image_std=None,
            # RPN parameters
            rpn_anchor_generator=None,
            rpn_head=None,
            rpn_pre_nms_top_n_train=2000,
            rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000,
            rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.7,
            rpn_fg_iou_thresh=0.5,
            rpn_bg_iou_thresh=0.4,  # FIXME: these two IoU thresholds follow the paper "Towards Real-Time Multi-Object Tracking"
            rpn_batch_size_per_image=256,
            rpn_positive_fraction=0.5,
            # Box parameters
            box_roi_pool=None,
            box_head=None,
            box_predictor=None,
            box_score_thresh=0.05,
            box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.5,
            box_bg_iou_thresh=0.5,
            box_batch_size_per_image=256,
            box_positive_fraction=0.25,
            bbox_reg_weights=None,
            # Embedding parameters  # FIXME: parameters added for the embedding branch
            len_embeddings=128,
            embed_head=None,
            embed_extractor=None):

        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")

        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

        out_channels = backbone.out_channels

        # FIXME: anchor sizes were changed and only aspect-ratio-1/3 anchors are used,
        # following "Towards Real-Time Multi-Object Tracking"
        if rpn_anchor_generator is None:
            anchor_sizes = ((16, 22), (32, 45), (64, 90), (128, 181), (256, 362))
            aspect_ratios = ((1 / 3, ), ) * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                    rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                    rpn_batch_size_per_image,
                                    rpn_positive_fraction, rpn_pre_nms_top_n,
                                    rpn_post_nms_top_n, rpn_nms_thresh)

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(
                featmap_names=['0', '1', '2', '3'],  # string keys, matching the FPN output dict
                output_size=11,
                sampling_ratio=2)

        if box_head is None:
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            box_head = TwoMLPHead(out_channels * resolution**2,
                                  representation_size)

        emb_scale = math.sqrt(2) * math.log(num_ID - 1) if num_ID > 1 else 1

        # FIXME: v1 is the version currently in use
        if embed_head is None:
            if version == 'v1':
                resolution = box_roi_pool.output_size[0]
                representation_size = 1024
                embed_head = featureHead(out_channels * resolution**2,
                                         representation_size)
            if version == 'v2':
                embed_head = None

        if box_predictor is None:
            representation_size = 1024
            box_predictor = FastRCNNPredictor(representation_size, num_classes)

        if embed_extractor is None:
            representation_size = 1024
            embed_extractor = featureExtractor(representation_size,
                                               len_embeddings, emb_scale)

        roi_heads = JDE_RoIHeads(
            # Box
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img,
            len_embeddings,
            num_ID,
            embed_head,
            embed_extractor)
        roi_heads.version = version

        # FIXME: this block is copied verbatim from the Faster R-CNN code ##########
        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                             image_std)
        ###########################################################

        super(Jde_RCNN, self).__init__(backbone, rpn, roi_heads, transform)
        # FIXME: tracking-time attributes; not used during training
        self.version = version
        self.original_image_sizes = None
        self.preprocessed_images = None
        self.features = None
        self.box_features = None
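The emb_scale above is the JDE-style scale for the identity-embedding logits, s = sqrt(2) * ln(num_ID - 1); a quick numeric sanity check:

import math

def emb_scale(num_ids: int) -> float:
    # JDE embedding scale; falls back to 1.0 when there is at most one identity
    return math.sqrt(2) * math.log(num_ids - 1) if num_ids > 1 else 1.0

print(emb_scale(500))  # ~8.79 for a 500-identity training set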