def __init__( self, box_roi_pool, box_head, box_predictor, # Faster R-CNN training fg_iou_thresh, bg_iou_thresh, batch_size_per_image, positive_fraction, bbox_reg_weights, # Faster R-CNN inference score_thresh, nms_thresh, detections_per_img, out_channels, # Mask mask_roi_pool=None, mask_head=None, mask_predictor=None, keypoint_roi_pool=None, keypoint_head=None, keypoint_predictor=None, pose_mean=None, pose_stddev=None, threed_68_points=None, threed_5_points=None, bbox_x_factor=1.1, bbox_y_factor=1.1, expand_forehead=0.3, ): super(RoIHeads, self).__init__() self.box_similarity = box_ops.box_iou # assign ground-truth boxes for each proposal self.proposal_matcher = det_utils.Matcher( fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False) self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler( batch_size_per_image, positive_fraction) if bbox_reg_weights is None: bbox_reg_weights = (10.0, 10.0, 5.0, 5.0) self.box_coder = det_utils.BoxCoder(bbox_reg_weights) self.box_roi_pool = box_roi_pool self.box_head = box_head self.box_predictor = box_predictor num_classes = 2 self.class_roi_pool = MultiScaleRoIAlign( featmap_names=["0", "1", "2", "3"], output_size=7, sampling_ratio=2) resolution = box_roi_pool.output_size[0] representation_size = 1024 self.class_head = TwoMLPHead(out_channels * resolution**2, representation_size) self.class_predictor = FastRCNNClassPredictor(representation_size, num_classes) self.score_thresh = score_thresh self.nms_thresh = nms_thresh self.detections_per_img = detections_per_img self.mask_roi_pool = mask_roi_pool self.mask_head = mask_head self.mask_predictor = mask_predictor self.keypoint_roi_pool = keypoint_roi_pool self.keypoint_head = keypoint_head self.keypoint_predictor = keypoint_predictor self.pose_mean = pose_mean self.pose_stddev = pose_stddev self.threed_68_points = threed_68_points self.threed_5_points = threed_5_points self.bbox_x_factor = bbox_x_factor self.bbox_y_factor = bbox_y_factor self.expand_forehead = expand_forehead
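# Minimal sketch (not from this repo) of how the class-branch dimensions above fit
# together, using the torchvision classes of the same names; FastRCNNClassPredictor
# is repo-specific, so torchvision's FastRCNNPredictor stands in for it here.
import torch
from torchvision.ops import MultiScaleRoIAlign
from torchvision.models.detection.faster_rcnn import TwoMLPHead, FastRCNNPredictor

out_channels = 256  # FPN channel width
pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"],
                          output_size=7, sampling_ratio=2)
head = TwoMLPHead(out_channels * 7 ** 2, 1024)     # 256 * 49 -> 1024
predictor = FastRCNNPredictor(1024, num_classes=2)

feats = {k: torch.rand(1, out_channels, s, s)
         for k, s in zip(["0", "1", "2", "3"], [200, 100, 50, 25])}
boxes = [torch.tensor([[10., 10., 80., 120.]])]
x = pool(feats, boxes, image_shapes=[(800, 800)])  # -> [1, 256, 7, 7]
scores, deltas = predictor(head(x))                # [1, 2], [1, 8]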
def __init__( self, correlation_args, batch_norm, conv_channels, n_box_channels, roi_output_size, avg_box_features, hidden_size, input_length, n_layers, dropout, correlation_only, use_env_features, fixed_env, correlation_last_only, sum_lstm_layers, max_box_features=False, use_pre_conv=False, ): super().__init__() self.correlation_args = correlation_args self.batch_norm = batch_norm self.conv_channels = conv_channels self.n_box_channels = n_box_channels self.roi_output_size = roi_output_size self.avg_box_features = avg_box_features self.hidden_size = hidden_size self.input_length = input_length self.n_layers = n_layers self.output_size = 6 self.dropout = dropout self.correlation_only = correlation_only self.use_env_features = use_env_features self.fixed_env = fixed_env self.correlation_last_only = correlation_last_only self.sum_lstm_layers = sum_lstm_layers self.max_box_features = max_box_features locations_per_box = 1 if ( self.avg_box_features or self.max_box_features) else roi_output_size**2 multiplier = 2 if self.use_env_features else 1 self.input_size = 6 + (n_box_channels * locations_per_box * multiplier) self.roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3], output_size=roi_output_size, sampling_ratio=2) # layers inspired from https://github.com/ClementPinard/FlowNetPytorch/blob/master/models/FlowNetC.py self.conv_redir = conv(self.batch_norm, 256, 32, kernel_size=1, stride=1) in_planes = (self.correlation_args['patch_size']** 2) + (0 if self.correlation_only else 32) self.conv3_1 = conv(self.batch_norm, in_planes, self.conv_channels) self.conv4 = conv(self.batch_norm, self.conv_channels, self.conv_channels) self.conv4_1 = conv(self.batch_norm, self.conv_channels, self.n_box_channels) # recurrent layers self.encoder = nn.LSTM(self.input_size, self.hidden_size, batch_first=True, num_layers=n_layers, dropout=dropout) self.attn = nn.Linear(self.hidden_size + self.input_size, self.input_length) self.attn_combine = nn.Linear(self.hidden_size + self.output_size, self.hidden_size) self.decoder = nn.LSTM(self.input_size, self.hidden_size, batch_first=True, num_layers=n_layers, dropout=dropout) self.linear = nn.Linear( self.hidden_size if self.sum_lstm_layers else self.hidden_size * self.n_layers, self.output_size)
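# Sketch of the encoder input-width bookkeeping used above: each timestep
# concatenates a 6-dim box/motion vector with the pooled correlation features
# (one value per RoI location unless they are averaged/maxed away).
# Numbers are illustrative, not from a config file.
n_box_channels, roi_output_size = 64, 7
avg_box_features = max_box_features = False
use_env_features = True

locations_per_box = 1 if (avg_box_features or max_box_features) else roi_output_size ** 2  # 49
multiplier = 2 if use_env_features else 1          # box features + environment features
input_size = 6 + n_box_channels * locations_per_box * multiplier
print(input_size)                                  # 6 + 64 * 49 * 2 = 6278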
def __init__(self, num_classes, # re-ID num_train_pids, cls_type="", cat_c4=False, # Transform parameters min_size=800, max_size=1333, image_mean=None, image_std=None, # RPN parameters rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000, rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, rpn_nms_thresh=0.7, rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3, rpn_batch_size_per_image=256, rpn_positive_fraction=0.5, # Box parameters box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100, # box training box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5, box_batch_size_per_image=512, box_positive_fraction=0.25, bbox_reg_weights=None, # Misc eval_gt=False, display=False, cws=False, ): super(OIM, self).__init__() # ------- Backbone ------- stem, top = _split_backbone('resnet50', load_bgr=True) top.representation_size = 2048 self.backbone = stem # ------- RPN ------- rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test) rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test) rpn_kwargs = [ rpn_fg_iou_thresh, rpn_bg_iou_thresh, rpn_batch_size_per_image, rpn_positive_fraction, rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh ] rpn_anchor_generator = AnchorGenerator( sizes=((8, 16, 32),), aspect_ratios=((1, 2),)) self.rpn = RegionProposalNetwork( rpn_anchor_generator, RPNHead(in_channels=1024, num_anchors=rpn_anchor_generator.num_anchors_per_location()[0]), *rpn_kwargs ) # ------- Box ------- self.roi_align = MultiScaleRoIAlign( featmap_names=["C4"], output_size=(14, 7), sampling_ratio=0 ) representation_size = top.representation_size box_predictor = FastRCNNPredictor(representation_size, num_classes) box_kwargs = [ # Faster R-CNN training box_fg_iou_thresh, box_bg_iou_thresh, box_batch_size_per_image, box_positive_fraction, bbox_reg_weights, # Faster R-CNN inference box_score_thresh, box_nms_thresh, box_detections_per_img ] embedding_head = ExtractReIDFeat( featmap_names=['C4', 'C5'] if cat_c4 else ['C5'], in_channels=[1024, 2048] if cat_c4 else [2048], dim=256 ) reid_loss = CriterionReID( cls_type, 256, num_train_pids ) feat_head = RCNNConvHead(top) self.roi_heads = RoIHeads( embedding_head, reid_loss, self.roi_align, feat_head, box_predictor, *box_kwargs ) self.roi_heads.cws = cws self.req_pid = -1 if cls_type == "oim" else 0 self.reid_time = 0 # ------- Misc ------- if image_mean is None: image_mean = [0.485, 0.456, 0.406] # NOTE: RGB order is given here if image_std is None: image_std = [0.229, 0.224, 0.225] self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std) self.eval_gt = eval_gt self.display = display
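# Quick check of the single-level anchor layout the RPN above is built on:
# 3 sizes x 2 aspect ratios = 6 anchors per C4 location.
from torchvision.models.detection.rpn import AnchorGenerator
gen = AnchorGenerator(sizes=((8, 16, 32),), aspect_ratios=((1, 2),))
print(gen.num_anchors_per_location())  # [6]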
def __init__(self,
             backbone,
             num_classes=None,
             num_pids=5532,
             num_cq_size=5000,
             # transform parameters
             min_size=900, max_size=1500,
             image_mean=None, image_std=None,
             # Anchor settings:
             anchor_scales=None, anchor_ratios=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=12000, rpn_pre_nms_top_n_test=6000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=300,
             rpn_nms_thresh=0.7,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             # Box parameters
             rcnn_bbox_bn=True,
             box_roi_pool=None, feat_head=None, box_predictor=None,
             box_score_thresh=0.0, box_nms_thresh=0.4, box_detections_per_img=300,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.1,
             box_batch_size_per_image=128, box_positive_fraction=0.5,
             bbox_reg_weights=None,
             # ReID parameters
             embedding_head=None, reid_loss=None):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            'backbone should contain an attribute out_channels '
            'specifying the number of output channels (assumed to be the '
            'same for all the levels)')
    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))
    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                'num_classes should be None when box_predictor is specified')
    else:
        if box_predictor is None:
            raise ValueError(
                'num_classes should not be None when box_predictor '
                'is not specified')

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        if anchor_scales is None:
            anchor_scales = ((32, 64, 128, 256, 512), )
        if anchor_ratios is None:
            anchor_ratios = ((0.5, 1.0, 2.0), )
        rpn_anchor_generator = AnchorGenerator(anchor_scales, anchor_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels,
            rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)
    rpn = self._set_rpn(rpn_anchor_generator, rpn_head,
                        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                        rpn_batch_size_per_image, rpn_positive_fraction,
                        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=['feat_res4'],
                                          output_size=14,
                                          sampling_ratio=2)
    if feat_head is None:
        raise ValueError('feat_head should be specified manually.')
    if box_predictor is None:
        box_predictor = CoordRegressor(2048, num_classes, rcnn_bbox_bn)
    if embedding_head is None:
        embedding_head = NormAwareEmbeddingProj(
            featmap_names=['feat_res4', 'feat_res5'],
            in_channels=[1024, 2048],
            dim=256)
    if reid_loss is None:
        reid_loss = OIMLoss(256, num_pids, num_cq_size, 0.5, 30.0)

    roi_heads = self._set_roi_heads(
        embedding_head, reid_loss, box_roi_pool, feat_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction, bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)

    super(FasterRCNN_NormAware, self).__init__(backbone, rpn, roi_heads, transform)
def __init__(self, correlation_args, batch_norm, conv_channels, n_box_channels, roi_output_size, avg_box_features, hidden_size, input_length, n_layers, dropout, correlation_only, use_env_features, fixed_env, correlation_last_only, sum_lstm_layers, refine_correlation, max_box_features, use_roi_align, use_pre_conv): super().__init__(correlation_args, batch_norm, conv_channels, n_box_channels, roi_output_size, avg_box_features, hidden_size, input_length, n_layers, dropout, correlation_only, use_env_features, fixed_env, correlation_last_only, sum_lstm_layers, max_box_features) assert correlation_args['stride'] == 1 self.refine_correlation = refine_correlation self.use_roi_align = use_roi_align self.use_pre_conv = use_pre_conv self.conv_reduce = conv(self.batch_norm, correlation_args['patch_size']**2, 32, kernel_size=1, stride=1) self.roi_output_size_ext = roi_output_size + ( (correlation_args['patch_size'] - 1) * correlation_args['dilation_patch']) self.roi_output_size_env = roi_output_size * 3 self.roi_output_size_env_ext = \ self.roi_output_size_env + ((correlation_args['patch_size'] - 1) * correlation_args['dilation_patch']) self.roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3], output_size=self.roi_output_size, sampling_ratio=2) self.roi_pool_ext = MultiScaleRoIAlign( featmap_names=[0, 1, 2, 3], output_size=self.roi_output_size_ext, sampling_ratio=2) self.roi_pool_env_ext = MultiScaleRoIAlign( featmap_names=[0, 1, 2, 3], output_size=self.roi_output_size_env_ext, sampling_ratio=2) if self.fixed_env: locations_per_box = 1 if (self.avg_box_features or self.max_box_features) else ( roi_output_size * 3)**2 else: locations_per_box = 1 if ( self.avg_box_features or self.max_box_features) else roi_output_size**2 multiplier = 2 if self.use_env_features else 1 self.input_size = 6 + (n_box_channels * locations_per_box * multiplier) if self.use_pre_conv: self.pre_conv = conv(self.batch_norm, 256, 128) # layers inspired from https://github.com/ClementPinard/FlowNetPytorch/blob/master/models/FlowNetC.py self.conv_redir = conv(self.batch_norm, 256, 32, kernel_size=1, stride=1) in_planes = (self.correlation_args['patch_size']** 2) + (0 if self.correlation_only else 32) self.conv3_1 = conv(self.batch_norm, in_planes, self.conv_channels) self.conv4 = conv(self.batch_norm, self.conv_channels, self.conv_channels) self.conv4_1 = conv(self.batch_norm, self.conv_channels, self.n_box_channels) # recurrent layers self.encoder = nn.LSTM(self.input_size, self.hidden_size, batch_first=True, num_layers=n_layers, dropout=dropout) self.attn = nn.Linear(self.hidden_size + self.input_size, self.input_length) self.attn_combine = nn.Linear(self.hidden_size + self.output_size, self.hidden_size) self.decoder = nn.LSTM(self.input_size, self.hidden_size, batch_first=True, num_layers=n_layers, dropout=dropout) self.linear = nn.Linear( self.hidden_size if self.sum_lstm_layers else self.hidden_size * self.n_layers, self.output_size)
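# The extended pooling windows above, spelled out: the RoI (and the 3x-sized
# environment crop) is padded with enough context to evaluate every correlation
# displacement at stride 1. Values are illustrative.
roi_output_size, patch_size, dilation_patch = 7, 9, 2
roi_output_size_ext = roi_output_size + (patch_size - 1) * dilation_patch          # 23
roi_output_size_env = roi_output_size * 3                                          # 21
roi_output_size_env_ext = roi_output_size_env + (patch_size - 1) * dilation_patch  # 37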
def __init__(self, backbone, num_classes=None,
             # transform parameters
             min_size=800, max_size=1333,      # min/max image sizes enforced when resizing during preprocessing
             image_mean=None, image_std=None,  # mean and std used for normalization during preprocessing
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,    # number of proposals kept before NMS in the RPN (ranked by score)
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,  # number of proposals kept after NMS in the RPN
             rpn_nms_thresh=0.7,  # IoU threshold used for NMS in the RPN
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,  # IoU thresholds for sampling positive/negative examples for the RPN loss
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,  # number of samples and positive fraction for the RPN loss
             # Box parameters
             box_roi_pool=None, box_head=None, box_predictor=None,
             # drop low-scoring predictions; NMS threshold in Fast R-CNN; keep the top 100 detections ranked by score
             box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,  # IoU thresholds for sampling positive/negative examples for the Fast R-CNN loss
             box_batch_size_per_image=512, box_positive_fraction=0.25,  # number of samples and positive fraction for the Fast R-CNN loss
             bbox_reg_weights=None):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)"
        )
    assert isinstance(rpn_anchor_generator, (AnchorsGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError("num_classes should be None when box_predictor "
                             "is specified")
    else:
        if box_predictor is None:
            raise ValueError("num_classes should not be None when box_predictor "
                             "is not specified")

    # number of channels of the feature maps used for prediction
    out_channels = backbone.out_channels

    # if no anchor generator is given, build the default one for resnet50_fpn
    if rpn_anchor_generator is None:
        anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        rpn_anchor_generator = AnchorsGenerator(
            anchor_sizes, aspect_ratios
        )

    # the RPN head that slides over each feature map and predicts objectness and box deltas
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels, rpn_anchor_generator.num_anchors_per_location()[0]
        )

    # defaults: rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
    # rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000
    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    # assemble the full RPN
    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    # Multi-scale RoIAlign pooling
    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(
            featmap_names=['0', '1', '2', '3'],  # which feature maps to pool from
            output_size=[7, 7],
            sampling_ratio=2)

    # the two fully connected layers after RoI pooling in Fast R-CNN
    if box_head is None:
        resolution = box_roi_pool.output_size[0]  # defaults to 7
        representation_size = 1024
        box_head = TwoMLPHead(
            out_channels * resolution ** 2,
            representation_size
        )

    # prediction layers on top of the box_head output
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(
            representation_size,
            num_classes)

    # combine RoI pooling, box_head and box_predictor
    roi_heads = RoIHeads(
        # box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]

    # normalization, resizing and batching of the input data
    transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)

    super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
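# A standalone smoke test of the construction above, assuming the local
# FasterRCNN/AnchorsGenerator mirror the torchvision classes (shown here with
# the torchvision equivalents; resnet_fpn_backbone's signature varies slightly
# across torchvision versions).
import torch
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

backbone = resnet_fpn_backbone('resnet50', False)   # out_channels == 256
model = FasterRCNN(backbone, num_classes=91).eval()
with torch.no_grad():
    preds = model([torch.rand(3, 600, 800)])
print(sorted(preds[0]))  # ['boxes', 'labels', 'scores']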
def training(args):
    # Random seed
    random.seed(42)

    # Device setting
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data pre-setting
    dat = pd.read_csv(os.path.join(args.data_path, 'train_df.csv'))
    index_list = list(range(len(dat)))
    random.shuffle(index_list)
    valid_count = int(len(index_list) * args.split)
    train_df = dat.iloc[index_list[:-valid_count]]
    valid_df = dat.iloc[index_list[-valid_count:]]

    # Transform setting
    transforms_dict = {
        'train': A.Compose(
            [
                A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=30, p=0.3),
                A.HorizontalFlip(p=0.3),
                A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.3),
                A.RandomBrightnessContrast(p=0.3),
                ToTensorV2()
            ],
            bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']),
            keypoint_params=A.KeypointParams(format='xy', remove_invisible=False, angle_in_degrees=True)),
        'valid': A.Compose(
            [ToTensorV2()],
            bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']),
            keypoint_params=A.KeypointParams(format='xy', remove_invisible=False, angle_in_degrees=True))
    }

    # PyTorch dataloader setting
    dataset_dict = {
        'train': KeypointDataset(os.path.join(args.data_path, 'train_imgs/'), train_df, transforms_dict['train']),
        'valid': KeypointDataset(os.path.join(args.data_path, 'train_imgs/'), valid_df, transforms_dict['valid']),
    }
    dataloader_dict = {
        'train': DataLoader(dataset_dict['train'], batch_size=args.batch_size, shuffle=True,
                            num_workers=args.num_workers, collate_fn=collate_fn),
        # no need to shuffle the validation split
        'valid': DataLoader(dataset_dict['valid'], batch_size=args.batch_size, shuffle=False,
                            num_workers=args.num_workers, collate_fn=collate_fn),
    }

    # Model setting
    backbone = resnet_fpn_backbone('resnet101', pretrained=True)
    roi_pooler = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                    output_size=7, sampling_ratio=2)
    keypoint_roi_pooler = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                             output_size=14, sampling_ratio=2)
    model = KeypointRCNN(backbone, num_classes=2, num_keypoints=24,
                         box_roi_pool=roi_pooler, keypoint_roi_pool=keypoint_roi_pooler)
    model = model.to(device)

    # Optimizer setting
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.w_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
                                  patience=len(dataloader_dict['train']) / 1.5)

    # Resume
    start_epoch = 0
    if args.resume:
        print('resume!')
        checkpoint = torch.load(args.file_name, map_location='cpu')
        start_epoch = checkpoint['epoch'] + 1
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        model = model.to(device)

    # Train start
    best_val_rmse = None
    for epoch in range(start_epoch, args.num_epochs):
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            if phase == 'valid':
                print('Validation start...')
                model.eval()
                val_rmse = 0
            for i, (images, targets) in enumerate(tqdm(dataloader_dict[phase])):
                # Optimizer setting
                optimizer.zero_grad()

                # Input, output setting
                images = list(image.to(device) for image in images)
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

                with torch.set_grad_enabled(phase == 'train'):
                    # in train mode the model returns a loss dict,
                    # in eval mode a list of per-image detections
                    outputs = model(images, targets)

                if phase == 'train':
                    loss = sum(v for v in outputs.values())
                    loss.backward()
                    clip_grad_norm_(model.parameters(), args.grad_clip)
                    optimizer.step()

                    if (i + 1) % 100 == 0:
                        print(f'| epoch: {epoch} | lr: {optimizer.param_groups[0]["lr"]} | loss: {loss.item():.4f}', end=' | ')
                        for k, v in outputs.items():
                            print(f'{k[5:]}: {v.item():.4f}', end=' | ')
                        print()

                if phase == 'valid':
                    for j, det in enumerate(outputs):
                        pred_ = det['keypoints'][0][:, :2].detach().cpu().numpy().reshape(-1)
                        target_ = targets[j]['keypoints'][0][:, :2].cpu().numpy().reshape(-1)
                        val_rmse += np.sqrt(((pred_ - target_) ** 2).mean())

            if phase == 'valid':
                val_rmse /= len(dataloader_dict[phase])
                print(f'Validation RMSE: {val_rmse}')
                # ReduceLROnPlateau has to be stepped on the monitored metric
                scheduler.step(val_rmse)

                if not best_val_rmse or val_rmse < best_val_rmse:
                    print('Checkpoint saving...')
                    torch.save(
                        {
                            'epoch': epoch,
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                        }, args.file_name)
                    best_val_rmse = val_rmse
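# collate_fn is referenced above but not defined in this snippet; the usual
# detection collate (as in the torchvision reference scripts) simply zips the
# batch so that images and targets stay variable-sized lists:
def collate_fn(batch):
    return tuple(zip(*batch))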
def __init__(self, out_channels, num_classes, input_mode, acf_head,
             fg_iou_thresh=0.5, bg_iou_thresh=0.5,
             batch_size_per_image=512, positive_fraction=0.25,
             bbox_reg_weights=None,
             box_score_thresh=0.05, box_nms_thresh=0.5,
             box_detections_per_img=100):
    super(RoIHeadsExtend, self).__init__()
    self.in_channels = out_channels
    self.input_mode = input_mode
    self.score_thresh = box_score_thresh
    self.nms_thresh = box_nms_thresh
    self.detections_per_img = box_detections_per_img
    self.fg_iou_thresh = fg_iou_thresh
    self.bg_iou_thresh = bg_iou_thresh
    self.batch_size_per_image = batch_size_per_image
    self.positive_fraction = positive_fraction
    self.num_classes = num_classes

    # Detection
    self.box_similarity = box_ops.box_iou
    # assign ground-truth boxes for each proposal
    self.proposal_matcher = det_utils.Matcher(fg_iou_thresh, bg_iou_thresh,
                                              allow_low_quality_matches=False)
    self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
        batch_size_per_image, positive_fraction)
    if bbox_reg_weights is None:
        bbox_reg_weights = (10., 10., 5., 5.)
    self.box_coder = det_utils.BoxCoder(bbox_reg_weights)
    self.box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                           output_size=7, sampling_ratio=2)
    representation_size = 1024
    resolution = self.box_roi_pool.output_size[0]
    self.box_head = TwoMLPHead(out_channels * resolution**2, representation_size)
    self.box_predictor = FastRCNNPredictor(representation_size, num_classes)

    # Segmentation
    self.shared_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                              output_size=14, sampling_ratio=2)
    resolution = self.shared_roi_pool.output_size[0]
    mask_layers = (256, 256, 256, 256, 256, 256, 256, 256)
    mask_dilation = 1
    self.mask_head = MaskRCNNHeads(out_channels, mask_layers, mask_dilation)
    mask_predictor_in_channels = 256  # == mask_layers[-1]
    mask_dim_reduced = 256
    self.mask_predictor = MaskRCNNPredictor(mask_predictor_in_channels,
                                            mask_dim_reduced, num_classes)
    self.with_paf_branch = True
    if self.with_paf_branch:
        self.paf_head = MaskRCNNHeads(out_channels, mask_layers, mask_dilation)
        self.paf_predictor = MaskRCNNPredictor(mask_predictor_in_channels,
                                               mask_dim_reduced,
                                               2 * (num_classes - 1))

    if self.input_mode == config.INPUT_RGBD:
        self.attention_block = ContextBlock(256, 2)

    self.global_feature_dim = 256
    self.with_3d_keypoints = True
    self.with_axis_keypoints = False
    self.regress_axis = False
    self.estimate_norm_vector = False
    if acf_head == 'endpoints':
        self.with_axis_keypoints = True
    elif acf_head == 'scatters':
        self.regress_axis = True
    elif acf_head == 'norm_vector':
        self.estimate_norm_vector = True
    else:
        raise ValueError("acf_head must be one of 'endpoints', 'scatters' "
                         "or 'norm_vector', got %r" % (acf_head,))

    keypoint_layers = (256, ) * 4
    self.keypoint_dim_reduced = keypoint_layers[-1]
    if self.with_3d_keypoints:
        self.vote_keypoint_head = Vote_Kpoints_head(self.global_feature_dim,
                                                    keypoint_layers, "conv2d")
        self.vote_keypoint_predictor = Vote_Kpoints_Predictor(
            self.keypoint_dim_reduced, 3 * (num_classes - 1))
    if self.with_axis_keypoints:
        self.orientation_keypoint_head = Vote_Kpoints_head(
            self.global_feature_dim, keypoint_layers, "conv2d")
        self.orientation_keypoint_predictor = Vote_Kpoints_Predictor(
            self.keypoint_dim_reduced, 6 * (num_classes - 1))
    if self.regress_axis:
        self.axis_head = Vote_Kpoints_head(self.global_feature_dim,
                                           keypoint_layers, "conv2d")
        self.axis_predictor = Vote_Kpoints_Predictor(
            self.keypoint_dim_reduced, 4 * (num_classes - 1))
    if self.estimate_norm_vector:
        self.norm_vector_head = Vote_Kpoints_head(self.global_feature_dim,
                                                  keypoint_layers, "conv2d")
        self.norm_vector_predictor = Vote_Kpoints_Predictor(
            self.keypoint_dim_reduced, 3 * (num_classes - 1))
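# Channel bookkeeping for the auxiliary predictors above; num_classes includes
# the background class, so the per-object-class widths use (num_classes - 1).
# The geometric readings are inferred from the widths, not stated in the code.
num_classes = 22                       # illustrative
paf_channels = 2 * (num_classes - 1)   # presumably a 2-D direction field per class
kp3d_channels = 3 * (num_classes - 1)  # presumably xyz votes per class
axis_channels = 6 * (num_classes - 1)  # presumably two 3-D endpoints per class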
def __init__( self, backbone, num_ID, num_classes=2, len_embeddings=128, # transform parameters min_size=720, max_size=960, image_mean=None, image_std=None, # RPN parameters rpn_anchor_generator=None, rpn_head=None, rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000, rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, rpn_nms_thresh=0.7, rpn_fg_iou_thresh=0.5, rpn_bg_iou_thresh=0.4, rpn_batch_size_per_image=256, rpn_positive_fraction=0.5, # Box parameters box_roi_pool=None, box_head=None, box_predictor=None, box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100, box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5, box_batch_size_per_image=512, box_positive_fraction=0.25, bbox_reg_weights=None): if not hasattr(backbone, "out_channels"): raise ValueError( "backbone should contain an attribute out_channels " "specifying the number of output channels (assumed to be the " "same for all the levels)") assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None))) assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None))) out_channels = backbone.out_channels if rpn_anchor_generator is None: anchor_sizes = ((16, 22), (32, 45), (64, 90), (128, 181), (256, 362)) aspect_ratios = ((1 / 3, ), ) * len(anchor_sizes) rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios) if rpn_head is None: rpn_head = RPNHead( out_channels, rpn_anchor_generator.num_anchors_per_location()[0]) rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test) rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test) rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head, rpn_fg_iou_thresh, rpn_bg_iou_thresh, rpn_batch_size_per_image, rpn_positive_fraction, rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh) if box_roi_pool is None: box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3], output_size=7, sampling_ratio=2) if box_head is None: resolution = box_roi_pool.output_size[0] representation_size = 1024 box_head = TwoMLPHead(out_channels * resolution**2, representation_size) emb_scale = math.sqrt(2) * math.log(num_ID - 1) if num_ID > 1 else 1 if box_predictor is None: representation_size = 1024 box_predictor = JDEPredictor(representation_size, num_classes, len_embeddings, emb_scale) roi_heads = JDE_RoIHeads( # Box box_roi_pool, box_head, box_predictor, box_fg_iou_thresh, box_bg_iou_thresh, box_batch_size_per_image, box_positive_fraction, bbox_reg_weights, box_score_thresh, box_nms_thresh, box_detections_per_img, len_embeddings, num_ID) if image_mean is None: image_mean = [0.485, 0.456, 0.406] if image_std is None: image_std = [0.229, 0.224, 0.225] transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std) super(Jde_RCNN, self).__init__(backbone, rpn, roi_heads, transform) self.eval_embed = False
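# The JDE embedding scale above, evaluated for a concrete ID count:
import math
num_ID = 500
emb_scale = math.sqrt(2) * math.log(num_ID - 1) if num_ID > 1 else 1
print(round(emb_scale, 2))  # ~8.79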
def __init__(self, config):
    super(Mask_Rcnn, self).__init__()
    self.config = config
    self.Mean = torch.tensor(config.Mean, dtype=torch.float32)
    self.anchors = []
    self.num_anchor = []
    for i in range(5):
        self.num_anchor.append(
            len(config.anchor_scales[i]) * len(config.anchor_ratios[i]))
        stride = 4 * 2**i
        print(stride, self.config.anchor_scales[i],
              self.config.anchor_ratios[i])
        anchors = get_anchors(np.ceil(self.config.img_max / stride + 1),
                              self.config.anchor_scales[i],
                              self.config.anchor_ratios[i],
                              stride=stride)
        print(anchors.shape)
        self.anchors.append(anchors)
    self.ATC = AnchorTargetCreator(
        n_sample=config.rpn_n_sample,
        pos_iou_thresh=config.rpn_pos_iou_thresh,
        neg_iou_thresh=config.rpn_neg_iou_thresh,
        pos_ratio=config.rpn_pos_ratio)
    self.PC = ProposalCreator(nms_thresh=config.roi_nms_thresh,
                              n_train_pre_nms=config.roi_train_pre_nms,
                              n_train_post_nms=config.roi_train_post_nms,
                              n_test_pre_nms=config.roi_test_pre_nms,
                              n_test_post_nms=config.roi_test_post_nms,
                              min_size=config.roi_min_size)
    self.PTC_1 = ProposalTargetCreator_box(
        n_sample=config.fast_n_sample,
        pos_ratio=config.fast_pos_ratio,
        pos_iou_thresh=config.fast_pos_iou_thresh,
        neg_iou_thresh_hi=config.fast_neg_iou_thresh_hi,
        neg_iou_thresh_lo=config.fast_neg_iou_thresh_lo)
    self.PTC_2 = ProposalTargetCreator_box(
        n_sample=config.fast_n_sample,
        pos_ratio=config.fast_pos_ratio,
        pos_iou_thresh=0.6,
        neg_iou_thresh_hi=0.6,
        neg_iou_thresh_lo=config.fast_neg_iou_thresh_lo)
    self.PTC = ProposalTargetCreator(
        n_sample=config.fast_n_sample,
        pos_ratio=config.fast_pos_ratio,
        pos_iou_thresh=0.7,
        neg_iou_thresh_hi=0.7,
        neg_iou_thresh_lo=config.fast_neg_iou_thresh_lo)
    self.features = resnet50()
    self.fpn = FPN_net([256, 512, 1024, 2048], 256,
                       extra_blocks=LastLevelMaxPool())
    self.rpn = RPN_net(256, self.num_anchor[0])
    self.roialign_7 = MultiScaleRoIAlign(
        ['feat0', 'feat1', 'feat2', 'feat3'], 7, 2)
    self.roialign_14 = MultiScaleRoIAlign(
        ['feat0', 'feat1', 'feat2', 'feat3'], 14, 2)
    # self.roialign_28 = RoIAlign((28, 28), 1.0, 2)
    self.fast = Fast_net(config.num_cls, 256 * 7 * 7, 1024)
    self.fast_2 = Fast_net(config.num_cls, 256 * 7 * 7, 1024)
    self.fast_3 = Fast_net(config.num_cls, 256 * 7 * 7, 1024)
    self.mask_net = Mask_net(256, config.num_cls)
    self.a = 0
    self.b = 0
    self.c = 0
    self.d = 0
    self.fast_num = 0
    self.fast_num_P = 0
    self.loc_std1 = [1. / 10, 1. / 10, 1. / 5, 1. / 5]
    self.loc_std2 = [1. / 20, 1. / 20, 1. / 10, 1. / 10]
    self.loc_std3 = [1. / 30, 1. / 30, 1. / 15, 1. / 15]
    self.loss_weights = [1.0, 0.5, 0.25]
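# The per-level stride/grid bookkeeping used in the anchor loop above:
import numpy as np
img_max = 1024  # illustrative config.img_max
for i in range(5):
    stride = 4 * 2 ** i
    grid = int(np.ceil(img_max / stride + 1))
    print(i, stride, grid)  # level 0: stride 4, 257; ...; level 4: stride 64, 17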
def __init__( self, object_to_action, human_idx, # Backbone parameters backbone_name="resnet50", pretrained=True, # Pooler parameters output_size=7, sampling_ratio=2, # Box pair head parameters node_encoding_size=1024, representation_size=1024, num_classes=117, fg_iou_thresh=0.5, num_iterations=1, # Transformation parameters min_size=800, max_size=1333, image_mean=None, image_std=None, postprocess=True, # Preprocessing parameters box_nms_thresh=0.5, max_human=15, max_object=15): backbone = models.fasterrcnn_resnet_fpn(backbone_name, pretrained=pretrained).backbone box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3], output_size=output_size, sampling_ratio=sampling_ratio) box_pair_head = GraphHead( out_channels=backbone.out_channels, roi_pool_size=output_size, node_encoding_size=node_encoding_size, representation_size=representation_size, num_cls=num_classes, human_idx=human_idx, object_class_to_target_class=object_to_action, fg_iou_thresh=fg_iou_thresh, num_iter=num_iterations) box_pair_predictor = nn.Linear(representation_size * 2, num_classes) interaction_head = InteractionHead( box_roi_pool=box_roi_pool, box_pair_head=box_pair_head, box_pair_predictor=box_pair_predictor, num_classes=num_classes, human_idx=human_idx, box_nms_thresh=box_nms_thresh, max_human=max_human, max_object=max_object) if image_mean is None: image_mean = [0.485, 0.456, 0.406] if image_std is None: image_std = [0.229, 0.224, 0.225] transform = HOINetworkTransform(min_size, max_size, image_mean, image_std) super().__init__(backbone, interaction_head, transform, postprocess)
def __init__(self, backbone, num_classes=2,
             # Faster and Mask R-CNN
             min_size=512, max_size=512,
             image_mean=None, image_std=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=400, rpn_pre_nms_top_n_test=400,
             rpn_post_nms_top_n_train=200, rpn_post_nms_top_n_test=200,
             rpn_nms_thresh=0.75,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.75,
             # Box parameters
             box_roi_pool=None, box_head=None, box_predictor=None,
             box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
             box_fg_iou_thresh=0.75, box_bg_iou_thresh=0.5,
             box_batch_size_per_image=256, box_positive_fraction=0.75,
             bbox_reg_weights=None,
             # Mask parameters
             mask_roi_pool=None, mask_head=None, mask_predictor=None,
             # Alex - SSM
             box_score_thresh_classifier=-0.01,
             box_nms_thresh_classifier=0.25,
             box_detections_per_img_s2new=8,
             # Alex - Mask+Box Features extractor
             box_pool_s2=None, box_head_s2=None, box_predictor_s2=None,
             mask_pool_s2=None, mask_head_s2=None, mask_predictor_s2=None,
             # Alex - Affinity model
             x_stages=3, num_classes_img=3,
             sieve_layer=None, s2classifier=None,
             num_affinities=256, affinity=None, s2new_classifier=None,
             **kwargs):
    out_channels = backbone.out_channels

    # Mask features branch
    # Classification branch
    if box_pool_s2 is None:
        box_pool_s2 = MultiScaleRoIAlign(
            featmap_names=['0'],  # single feature map
            output_size=7,
            sampling_ratio=2)
    if box_head_s2 is None:
        resolution = box_pool_s2.output_size[0]
        representation_size = 128
        box_head_s2 = TwoMLPHead(out_channels * resolution**2,
                                 representation_size)
    if box_predictor_s2 is None:
        representation_size = 128
        box_predictor_s2 = FastRCNNPredictor(representation_size, num_classes)

    if mask_pool_s2 is None:
        mask_pool_s2 = MultiScaleRoIAlign(
            featmap_names=['0'],  # Alex: the key of the feature map
            output_size=14,
            sampling_ratio=2)
    if mask_head_s2 is None:
        mask_layers = (out_channels, )
        mask_dilation = 1
        mask_head_s2 = MaskRCNNHeads(out_channels, mask_layers, mask_dilation)
    # add mask predictor: upsample + bn + relu
    if mask_predictor_s2 is None:
        in_channels = mask_head_s2[-2].out_channels
        out_channels = in_channels
        mask_predictor_s2 = MaskRCNNPredictorTruncated(
            in_channels, out_channels, mask_dilation)

    # Affinity layer
    num_feature_maps = mask_predictor_s2.conv_reduce.out_channels
    num_reduce_feature_maps = int(num_feature_maps / 2)
    if sieve_layer is None:
        sieve_layer = MaskFeaturesSieve(
            num_feature_maps=num_feature_maps,
            num_reduce_feature_maps=num_reduce_feature_maps,
            h=28, w=28,
            apply_linearity=False, final=False)
    affinity_layer = AffinityLayer(
        sieve_layer,
        affinity_matrix_size=box_detections_per_img_s2new,
        x_stages=x_stages,
        num_features=num_feature_maps,
        num_affinities=num_affinities)

    # Image classification branch
    if s2classifier is None:
        s2classifier = ImageClassificationLayerFromMaskFeatures(
            affinity_feature_size=num_feature_maps,
            num_classes_img=num_classes_img)

    # instantiate Mask R-CNN:
    # the affinity and image classification modules are passed on to the Generalized RCNN
    kwargs.update(affinity=affinity_layer, s2new_classifier=s2classifier)
    super(AffinityModel, self).__init__(
        backbone, num_classes,
        # transform parameters
        min_size, max_size, image_mean, image_std,
        # RPN parameters
        rpn_anchor_generator, rpn_head,
        rpn_pre_nms_top_n_train, rpn_pre_nms_top_n_test,
        rpn_post_nms_top_n_train, rpn_post_nms_top_n_test,
        rpn_nms_thresh,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        # Box parameters
        box_roi_pool, box_head, box_predictor,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction, bbox_reg_weights,
        # Mask parameters
        mask_roi_pool=None, mask_head=None, mask_predictor=None,
        **kwargs)

    # Alex - SSM
    self.roi_heads.score_thresh_classifier = box_score_thresh_classifier
    self.roi_heads.nms_thresh_classifier = box_nms_thresh_classifier
    self.roi_heads.detections_per_img_s2new = box_detections_per_img_s2new
    self.roi_heads.box_pool_s2 = box_pool_s2
    self.roi_heads.box_head_s2 = box_head_s2
    self.roi_heads.box_predictor_s2 = box_predictor_s2
    # Alex - Mask Features extractor
    self.roi_heads.mask_pool_s2 = mask_pool_s2
    self.roi_heads.mask_head_s2 = mask_head_s2
    self.roi_heads.mask_predictor_s2 = mask_predictor_s2
def __init__(self, config,
             # backbone,
             # neck,
             # head,
             min_size=800, max_size=1333,
             preserve_aspect_ratio=True,
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
             rpn_nms_thresh=0.7,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             rpn_score_thresh=0.0,
             # Box parameters
             box_roi_pool=None, box_head=None, box_predictor=None,
             box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
             box_batch_size_per_image=512, box_positive_fraction=0.25,
             bbox_reg_weights=None,
             anchor_sizes=(32, 64, 128, 256, 512),
             aspect_ratios=(0.5, 1.0, 2.0)) -> None:
    super().__init__(config)
    self.num_classes = 91
    self.preserve_aspect_ratio = preserve_aspect_ratio
    # honour the min_size/max_size arguments instead of hard-coding 800/1333
    self.transform = RcnnTransform(min_size, max_size, None, None)
    self.backbone = resnet50()
    self.neck = FasterRcnnNeck(config, self.backbone.channels)

    anchor_sizes = tuple((anchor,) for anchor in anchor_sizes)
    aspect_ratios = (aspect_ratios,) * len(anchor_sizes)
    rpn_anchor_generator = AnchorGenerator(
        anchor_sizes, aspect_ratios
    )

    out_channels = self.neck.channels[-1]
    rpn_head = RpnHead(
        out_channels, rpn_anchor_generator.num_anchors_per_location()[0]
    )
    rpn_pre_nms_top_n = {
        'training': rpn_pre_nms_top_n_train,
        'testing': rpn_pre_nms_top_n_test
    }
    rpn_post_nms_top_n = {
        'training': rpn_post_nms_top_n_train,
        'testing': rpn_post_nms_top_n_test
    }
    self.rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh,
        score_thresh=rpn_score_thresh
    )

    box_roi_pool = MultiScaleRoIAlign(
        featmap_names=['0', '1', '2', '3'],
        output_size=7,
        sampling_ratio=2
    )
    resolution = box_roi_pool.output_size[0]
    representation_size = 1024
    box_head = LinearHead(
        out_channels * resolution ** 2, representation_size
    )
    representation_size = 1024
    box_predictor = FastRcnnPredictHead(
        representation_size, self.num_classes
    )
    self.roi_heads = RoiHeads(
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img
    )
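# The anchor expansion above, spelled out: one size tuple and one shared
# ratio set per FPN level.
anchor_sizes = (32, 64, 128, 256, 512)
aspect_ratios = (0.5, 1.0, 2.0)
anchor_sizes = tuple((s,) for s in anchor_sizes)       # ((32,), ..., (512,))
aspect_ratios = (aspect_ratios,) * len(anchor_sizes)   # 5 x (0.5, 1.0, 2.0)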
def __init__( self, object_to_action: List[list], human_idx: int, # Backbone parameters backbone_name: str = "resnet50", pretrained: bool = True, # Pooler parameters output_size: int = 7, sampling_ratio: int = 2, # Box pair head parameters node_encoding_size: int = 1024, representation_size: int = 1024, num_classes: int = 117, box_score_thresh: float = 0.2, fg_iou_thresh: float = 0.5, num_iterations: int = 2, distributed: bool = False, # Transformation parameters min_size: int = 800, max_size: int = 1333, image_mean: Optional[List[float]] = None, image_std: Optional[List[float]] = None, postprocess: bool = True, # Preprocessing parameters box_nms_thresh: float = 0.5, max_human: int = 15, max_object: int = 15) -> None: detector = models.fasterrcnn_resnet_fpn(backbone_name, pretrained=pretrained) backbone = detector.backbone box_roi_pool = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=output_size, sampling_ratio=sampling_ratio) box_pair_head = GraphHead( out_channels=backbone.out_channels, roi_pool_size=output_size, node_encoding_size=node_encoding_size, representation_size=representation_size, num_cls=num_classes, human_idx=human_idx, object_class_to_target_class=object_to_action, fg_iou_thresh=fg_iou_thresh, num_iter=num_iterations) box_pair_predictor = nn.Linear(representation_size * 2, num_classes) box_pair_suppressor = nn.Linear(representation_size * 2, 1) interaction_head = InteractionHead( box_roi_pool=box_roi_pool, box_pair_head=box_pair_head, box_pair_suppressor=box_pair_suppressor, box_pair_predictor=box_pair_predictor, num_classes=num_classes, human_idx=human_idx, box_nms_thresh=box_nms_thresh, box_score_thresh=box_score_thresh, max_human=max_human, max_object=max_object, distributed=distributed) if image_mean is None: image_mean = [0.485, 0.456, 0.406] if image_std is None: image_std = [0.229, 0.224, 0.225] transform = HOINetworkTransform(min_size, max_size, image_mean, image_std) super().__init__(backbone, interaction_head, transform, postprocess)
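# Sketch of the pairwise score heads wired above: the suppressor emits one
# logit per human-object pair and the predictor one logit per interaction
# class. The sigmoid product below is only one plausible fusion; the actual
# combination lives inside InteractionHead.
import torch
import torch.nn as nn

representation_size, num_classes = 1024, 117
box_pair_predictor = nn.Linear(representation_size * 2, num_classes)
box_pair_suppressor = nn.Linear(representation_size * 2, 1)

pair_feats = torch.rand(8, representation_size * 2)      # 8 candidate pairs
scores = torch.sigmoid(box_pair_suppressor(pair_feats)) * \
         torch.sigmoid(box_pair_predictor(pair_feats))   # [8, 117]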
def __init__( self, backbone, num_classes=None, # transform parameters min_size=800, max_size=1333, image_mean=None, image_std=None, # RPN parameters rpn_anchor_generator=None, rpn_head=None, rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000, rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, rpn_nms_thresh=0.7, rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3, rpn_batch_size_per_image=256, rpn_positive_fraction=0.5, # Box parameters box_roi_pool=None, box_head=None, box_predictor=None, box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100, box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5, box_batch_size_per_image=512, box_positive_fraction=0.25, bbox_reg_weights=None): if not hasattr(backbone, "out_channels"): raise ValueError( "backbone should contain an attribute out_channels " "specifying the number of output channels (assumed to be the " "same for all the levels)") assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None))) assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None))) if num_classes is not None: if box_predictor is not None: raise ValueError( "num_classes should be None when box_predictor is specified" ) else: if box_predictor is None: raise ValueError( "num_classes should not be None when box_predictor " "is not specified") out_channels = backbone.out_channels if rpn_anchor_generator is None: anchor_sizes = ((32, ), (64, ), (128, ), (256, ), (512, )) aspect_ratios = ((0.5, 1.0, 2.0), ) * len(anchor_sizes) rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios) if rpn_head is None: rpn_head = RPNHead( out_channels, rpn_anchor_generator.num_anchors_per_location()[0]) rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test) rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test) rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head, rpn_fg_iou_thresh, rpn_bg_iou_thresh, rpn_batch_size_per_image, rpn_positive_fraction, rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh) if box_roi_pool is None: box_roi_pool = MultiScaleRoIAlign( featmap_names=['0', '1', '2', '3'], output_size=7, sampling_ratio=2) if box_head is None: resolution = box_roi_pool.output_size[0] representation_size = 1024 box_head = TwoMLPHead(out_channels * resolution**2, representation_size) if box_predictor is None: representation_size = 1024 box_predictor = FastRCNNPredictor(representation_size, num_classes) roi_heads = RoIHeads( # Box box_roi_pool, box_head, box_predictor, box_fg_iou_thresh, box_bg_iou_thresh, box_batch_size_per_image, box_positive_fraction, bbox_reg_weights, box_score_thresh, box_nms_thresh, box_detections_per_img) if image_mean is None: image_mean = [0.485, 0.456, 0.406] if image_std is None: image_std = [0.229, 0.224, 0.225] transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std) super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
def build_model(cls, args, task): """Build a new model instance.""" # make sure that all args are properly defaulted (in case there are any new ones) base_architecture(args) rpn_anchor_generator = task.rpn_anchor_generator rpn_head = task.rpn_head box_roi_pool = task.box_roi_pool box_predictor = task.box_predictor box_head = task.box_head # setup backbone backbone = resnet_fpn_backbone(args.backbone, args.backbone_pretrained) if not hasattr(backbone, "out_channels"): raise ValueError( "backbone should contain an attribute out_channels " "specifying the number of output channels (assumed to be the " "same for all the levels)" ) assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None))) assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None))) if task.num_classes > 0: if box_predictor is not None: raise ValueError("num_classes should be -1 when box_predictor is specified") else: if box_predictor is None: raise ValueError("num_classes should be > 0 when box_predictor is not specified") out_channels = backbone.out_channels if rpn_anchor_generator is None: anchor_sizes = ((32,), (64,), (128,), (256,), (512,)) aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes) rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios) if rpn_head is None: rpn_head = RPNHead( out_channels, rpn_anchor_generator.num_anchors_per_location()[0], ) rpn_pre_nms_top_n = dict(training=args.rpn_pre_nms_top_n_train, testing=args.rpn_pre_nms_top_n_test) rpn_post_nms_top_n = dict(training=args.rpn_post_nms_top_n_train, testing=args.rpn_post_nms_top_n_test) rpn = RPN( rpn_anchor_generator, rpn_head, args.rpn_fg_iou_thresh, args.rpn_bg_iou_thresh, args.rpn_batch_size_per_image, args.rpn_positive_fraction, rpn_pre_nms_top_n, rpn_post_nms_top_n, args.rpn_nms_thresh, ) if box_roi_pool is None: box_roi_pool = MultiScaleRoIAlign( featmap_names=[0, 1, 2, 3], output_size=7, sampling_ratio=2, ) if box_head is None: resolution = box_roi_pool.output_size[0] representation_size = 1024 box_head = TwoMLPHead( out_channels * resolution ** 2, representation_size, ) if box_predictor is None: representation_size = 1024 box_predictor = FastRCNNPredictor( representation_size, task.num_classes, ) roi_heads = RegionOfInterestHeads( # Box box_roi_pool, box_head, box_predictor, args.box_fg_iou_thresh, args.box_bg_iou_thresh, args.box_batch_size_per_image, args.box_positive_fraction, args.bbox_reg_weights, args.box_score_thresh, args.box_nms_thresh, args.box_detections_per_img, ) if args.image_mean is None: args.image_mean = [0.485, 0.456, 0.406] if args.image_std is None: args.image_std = [0.229, 0.224, 0.225] transform = GeneralizedRCNNTransform( args.min_size, args.max_size, args.image_mean, args.image_std, ) return cls(backbone, rpn, roi_heads, transform)
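# A minimal args object carrying just the fields build_model reads; the values
# here mirror common torchvision defaults, whereas the real defaults are set
# by base_architecture(args).
from types import SimpleNamespace

args = SimpleNamespace(
    backbone='resnet50', backbone_pretrained=True,
    min_size=800, max_size=1333, image_mean=None, image_std=None,
    rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
    rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
    rpn_nms_thresh=0.7, rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
    rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
    box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
    box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
    box_batch_size_per_image=512, box_positive_fraction=0.25,
    bbox_reg_weights=None,
)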
def __init__( self, backbone, num_classes=None, # transform parameters min_size=800, max_size=1333, image_mean=None, image_std=None, # RPN parameters rpn_anchor_generator=None, rpn_head=None, rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000, rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, rpn_nms_thresh=0.7, rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3, rpn_batch_size_per_image=256, rpn_positive_fraction=0.5, # Box parameters box_roi_pool=None, box_head=None, box_predictor=None, box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100, box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5, box_batch_size_per_image=512, box_positive_fraction=0.25, bbox_reg_weights=None, # Mask parameters mask_roi_pool=None, mask_head=None, mask_predictor=None): assert isinstance(mask_roi_pool, (MultiScaleRoIAlign, type(None))) if num_classes is not None: if mask_predictor is not None: raise ValueError( "num_classes should be None when mask_predictor is specified" ) out_channels = backbone.out_channels if mask_roi_pool is None: mask_roi_pool = MultiScaleRoIAlign( featmap_names=['0', '1', '2', '3'], output_size=14, sampling_ratio=2) if mask_head is None: mask_layers = (256, 256, 256, 256) mask_dilation = 1 mask_head = MaskRCNNHeads(out_channels, mask_layers, mask_dilation) if mask_predictor is None: mask_predictor_in_channels = 256 # == mask_layers[-1] mask_dim_reduced = 256 mask_predictor = MaskRCNNPredictor(mask_predictor_in_channels, mask_dim_reduced, num_classes) super(MaskRCNNIA, self).__init__( backbone, num_classes, # transform parameters min_size, max_size, image_mean, image_std, # RPN-specific parameters rpn_anchor_generator, rpn_head, rpn_pre_nms_top_n_train, rpn_pre_nms_top_n_test, rpn_post_nms_top_n_train, rpn_post_nms_top_n_test, rpn_nms_thresh, rpn_fg_iou_thresh, rpn_bg_iou_thresh, rpn_batch_size_per_image, rpn_positive_fraction, # Box parameters box_roi_pool, box_head, box_predictor, box_score_thresh, box_nms_thresh, box_detections_per_img, box_fg_iou_thresh, box_bg_iou_thresh, box_batch_size_per_image, box_positive_fraction, bbox_reg_weights) self.roi_heads.mask_roi_pool = mask_roi_pool self.roi_heads.mask_head = mask_head self.roi_heads.mask_predictor = mask_predictor
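# Shape flow of the default mask branch above, checked with the torchvision
# classes of the same names: 14x14 pooled RoIs come out as 28x28 mask logits.
import torch
from torchvision.models.detection.mask_rcnn import MaskRCNNHeads, MaskRCNNPredictor

head = MaskRCNNHeads(256, (256, 256, 256, 256), 1)
pred = MaskRCNNPredictor(256, 256, num_classes=5)
x = torch.rand(2, 256, 14, 14)  # output of the 14x14 mask pooler
print(pred(head(x)).shape)      # torch.Size([2, 5, 28, 28])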
def __init__( self, num_classes, # re-ID num_train_pids, cls_type="", in_level=["C5"], # Transform min_size=800, max_size=1333, image_mean=None, image_std=None, # RPN rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000, rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, rpn_nms_thresh=0.7, rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3, rpn_batch_size_per_image=256, rpn_positive_fraction=0.5, # Box box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100, box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5, box_batch_size_per_image=512, box_positive_fraction=0.25, bbox_reg_weights=None, # Misc eval_gt=False, display=False, cws=False): super(BSL, self).__init__() # ------- Backbone ------- base_model, top_model = _split_backbone(backbone_name='resnet50', conv5_stride=2) return_layers = { 'conv1': "C1", 'conv2': "C2", 'conv3': "C3", 'conv4_3': "C4", } self.backbone = DetectorBackbone(base_model, return_layers) # ------- RPN ------- rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train, testing=rpn_pre_nms_top_n_test) rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train, testing=rpn_post_nms_top_n_test) rpn_kwargs = [ rpn_fg_iou_thresh, rpn_bg_iou_thresh, rpn_batch_size_per_image, rpn_positive_fraction, rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh ] rpn_anchor_generator = AnchorGenerator(sizes=((8, 16, 32), ), aspect_ratios=((1, 2), )) self.RPN = RegionProposalNetwork( rpn_anchor_generator, RPNHead(1024, rpn_anchor_generator.num_anchors_per_location()[0]), *rpn_kwargs) # ------- R-CNN ------- roi_align = MultiScaleRoIAlign(featmap_names=["C4"], output_size=(14, 7), sampling_ratio=0) resolution_h, resolution_w = roi_align.output_size[ 0], roi_align.output_size[1] box_emb = EmbDet(1024, 256, resolutions=[resolution_h, resolution_w]) box_predictor = FastRCNNPredictor(box_emb.representation_size, num_classes) box_kwargs = [ # Faster R-CNN training box_fg_iou_thresh, box_bg_iou_thresh, box_batch_size_per_image, box_positive_fraction, bbox_reg_weights, # Faster R-CNN inference box_score_thresh, box_nms_thresh, box_detections_per_img ] self.RCNN = RCNN(roi_align, box_emb, box_predictor, *box_kwargs) self.RCNN.cws = cws # ------- re-ID ------- out_channels = 256 in_ch_list = [2048, 1024, 512, 256, 256][:len(in_level)][::-1] reid_emb = EmbedReID(top_model, roi_align, featmap_names=in_level, in_ch_list=in_ch_list, out_ch=out_channels) reid_crit = nn.ModuleDict() for name, in_ch in zip(in_level, in_ch_list): reid_crit[name] = CriterionReID(cls_type, in_ch, num_train_pids) self.reid_head = ReIDHead( reid_emb, reid_crit, # PK sampling n_roi_per_gt=4, fg_iou_thresh=0.5) # -------- Others ------- if image_mean is None: image_mean = [0.485, 0.456, 0.406] # NOTE: RGB order is given here if image_std is None: image_std = [0.229, 0.224, 0.225] self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std) self.eval_gt = eval_gt self.display = display
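# The in_ch_list slicing above, spelled out for a two-level re-ID head:
in_level = ["C4", "C5"]
in_ch_list = [2048, 1024, 512, 256, 256][:len(in_level)][::-1]
print(in_ch_list)  # [1024, 2048] -> C4 pairs with 1024 channels, C5 with 2048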
def __init__(
        self,
        backbone,
        num_classes=None,
        # transform parameters
        min_size=None, max_size=1333,
        image_mean=None, image_std=None,
        # RPN parameters
        rpn_anchor_generator=None, rpn_head=None,
        rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
        rpn_score_thresh=0.0,
        # Box parameters
        box_roi_pool=None, box_head=None, box_predictor=None,
        box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
        box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512, box_positive_fraction=0.25,
        bbox_reg_weights=None,
        # keypoint parameters
        keypoint_roi_pool=None, keypoint_head=None, keypoint_predictor=None,
        num_keypoints=None,
):
    if not isinstance(keypoint_roi_pool, (MultiScaleRoIAlign, type(None))):
        raise TypeError(
            f"keypoint_roi_pool should be of type MultiScaleRoIAlign or None instead of {type(keypoint_roi_pool)}"
        )
    if min_size is None:
        min_size = (640, 672, 704, 736, 768, 800)

    if num_keypoints is not None:
        if keypoint_predictor is not None:
            raise ValueError(
                "num_keypoints should be None when keypoint_predictor is specified"
            )
    else:
        num_keypoints = 17

    out_channels = backbone.out_channels

    if keypoint_roi_pool is None:
        keypoint_roi_pool = MultiScaleRoIAlign(
            featmap_names=["0", "1", "2", "3"],
            output_size=14, sampling_ratio=2)

    if keypoint_head is None:
        keypoint_layers = tuple(512 for _ in range(8))
        keypoint_head = KeypointRCNNHeads(out_channels, keypoint_layers)

    if keypoint_predictor is None:
        keypoint_dim_reduced = 512  # == keypoint_layers[-1]
        keypoint_predictor = KeypointRCNNPredictor(keypoint_dim_reduced,
                                                   num_keypoints)

    super().__init__(
        backbone, num_classes,
        # transform parameters
        min_size, max_size, image_mean, image_std,
        # RPN-specific parameters
        rpn_anchor_generator, rpn_head,
        rpn_pre_nms_top_n_train, rpn_pre_nms_top_n_test,
        rpn_post_nms_top_n_train, rpn_post_nms_top_n_test,
        rpn_nms_thresh,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_score_thresh,
        # Box parameters
        box_roi_pool, box_head, box_predictor,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction, bbox_reg_weights,
    )

    self.roi_heads.keypoint_roi_pool = keypoint_roi_pool
    self.roi_heads.keypoint_head = keypoint_head
    self.roi_heads.keypoint_predictor = keypoint_predictor
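# Standalone check of the keypoint-branch defaults above, using the
# torchvision classes of the same names: 14x14 pooled RoIs become 56x56
# per-keypoint heatmaps after the deconv + 2x upsampling in the predictor.
import torch
from torchvision.models.detection.keypoint_rcnn import (KeypointRCNNHeads,
                                                        KeypointRCNNPredictor)

head = KeypointRCNNHeads(256, tuple(512 for _ in range(8)))
pred = KeypointRCNNPredictor(512, 17)
x = torch.rand(2, 256, 14, 14)
print(pred(head(x)).shape)  # torch.Size([2, 17, 56, 56])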
def __init__(self, backbone, num_ID, num_classes=2, version='v1',
             # transform parameters
             min_size=800, max_size=1333,
             image_mean=None, image_std=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
             rpn_nms_thresh=0.7,
             # FIXME: these two thresholds follow the paper
             # "Towards Real-Time Multi-Object Tracking"
             rpn_fg_iou_thresh=0.5, rpn_bg_iou_thresh=0.4,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             # Box parameters
             box_roi_pool=None, box_head=None, box_predictor=None,
             box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
             box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
             box_batch_size_per_image=256, box_positive_fraction=0.25,
             bbox_reg_weights=None,
             # Embedding parameters  # FIXME: newly added parameters
             len_embeddings=128, embed_head=None, embed_extractor=None):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")
    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    out_channels = backbone.out_channels

    # FIXME: changed the anchor sizes and only use aspect-ratio-1/3 anchors,
    # following "Towards Real-Time Multi-Object Tracking"
    if rpn_anchor_generator is None:
        anchor_sizes = ((16, 22), (32, 45), (64, 90), (128, 181), (256, 362))
        aspect_ratios = ((1 / 3, ), ) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels,
            rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                rpn_batch_size_per_image,
                                rpn_positive_fraction,
                                rpn_pre_nms_top_n, rpn_post_nms_top_n,
                                rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                          output_size=11,
                                          sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution**2,
                              representation_size)

    emb_scale = math.sqrt(2) * math.log(num_ID - 1) if num_ID > 1 else 1

    # FIXME: v1 is used at the moment
    if embed_head is None:
        if version == 'v1':
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            embed_head = featureHead(out_channels * resolution**2,
                                     representation_size)
        if version == 'v2':
            embed_head = None
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(representation_size, num_classes)
    if embed_extractor is None:
        representation_size = 1024
        embed_extractor = featureExtractor(representation_size,
                                           len_embeddings, emb_scale)

    roi_heads = JDE_RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        len_embeddings, num_ID,
        embed_head, embed_extractor)
    roi_heads.version = version

    # FIXME: this part is copied verbatim from the Faster R-CNN code
    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)

    super(Jde_RCNN, self).__init__(backbone, rpn, roi_heads, transform)

    # FIXME: state used during tracking; not involved in training
    self.version = version
    self.original_image_sizes = None
    self.preprocessed_images = None
    self.features = None
    self.box_features = None