def project_masks_on_boxes(gt_masks, boxes, matched_idxs, M):
    """
    Crop and resize ground-truth masks to the regions given by ``boxes``.

    Every box is paired with the mask selected by the matching entry of
    ``matched_idxs``; that region is resampled with ``roi_align`` onto an
    ``M x M`` grid so it can serve as a target for the mask loss.
    """
    # One roi row per box: (index of the mask to sample from, box coordinates).
    mask_index = matched_idxs.to(boxes)[:, None]
    rois = torch.cat([mask_index, boxes], dim=1)
    # Add a singleton channel axis and match the dtype/device of the rois.
    single_channel_masks = gt_masks[:, None].to(rois)
    pooled = roi_align(single_channel_masks, rois, (M, M), 1)
    # Drop the channel axis that was added above.
    return pooled[:, 0]
def get_yolo_feature_vec(self, coords):
    """
    Pool a single feature vector for one box from the current feature map.

    Args:
        coords: four box coordinates (any sequence ``torch.Tensor`` accepts),
            presumably (x1, y1, x2, y2) in image pixels -- confirm with caller.

    Returns:
        numpy.ndarray: the average-pooled roi features with singleton
        dimensions removed.
    """
    feature_map = self.get_feature_map()
    # Ratio between the configured image size and the feature map's dim 2.
    ratio = self.img_size / feature_map.size()[2]
    # Prepend batch index 0 to form the (1, 5) roi row. Build it on the feature
    # map's device rather than assuming CUDA, so CPU / multi-GPU setups work.
    roi_row = torch.cat((torch.Tensor([0]), torch.Tensor(coords))).view(1, 5)
    roi_row = roi_row.to(feature_map.device)
    with torch.no_grad():
        roi = roi_align(feature_map, roi_row, (3, 3), spatial_scale=1 / ratio)
        vec = F.adaptive_avg_pool2d(roi, (1, 1))
    return np.squeeze(vec.cpu().detach().numpy())
def __call__(self, feats, masks_to_concat, num_obj):
    """
    Pair the feature tensor with each object's (resized) mask.

    The masks are interpolated to the spatial size of ``feats``; for every
    object index below ``num_obj`` the features and that object's mask are
    concatenated along dim 0 and given a leading singleton axis.

    Returns:
        list: one concatenated tensor per object.
    """
    resized_masks = F.interpolate(masks_to_concat.unsqueeze(0), size=feats.shape[-2:])
    per_object_masks = [resized_masks[:, k, ...] for k in range(num_obj)]

    # NOTE(review): the roi-aligned instance features are computed here but are
    # not part of the return value -- confirm whether they should be.
    pooled_per_object = []
    for mask in per_object_masks:
        boxes = extract_bboxes(mask, self.dilate)
        pooled_per_object.append(roi_align(feats, boxes, self.spatial_size))

    stacked = []
    for mask in per_object_masks:
        stacked.append(torch.cat((feats, mask), dim=0).unsqueeze(0))
    return stacked
def forward(self, input, targets):
    """
    Pool the features under each target box and run them through the head.

    Args:
        input: 4-D feature tensor; its last two dims are used as (h, w).
        targets: 2-D tensor. Column 0 becomes the roi batch index and columns
            2..5 are scaled by (w, h, w, h) and passed to ``xywh2xyxy`` --
            presumably normalized (x, y, w, h); confirm with the caller.

    Returns:
        Output of ``self.linear`` applied to the pooled, squeezed features.
    """
    # Indexing with a list copies, so the caller's ``targets`` is left untouched
    # by the in-place scaling below.
    boxes = targets[:, [0, 2, 3, 4, 5]]
    _, _, h, w = input.shape
    boxes[:, [2, 4]] *= h
    boxes[:, [1, 3]] *= w
    # roi_align takes integer output sizes; the means are 0-dim float tensors,
    # so convert them explicitly and keep the crop at least 1x1.
    o_h = max(1, int(torch.mean(boxes[:, 4])))
    o_w = max(1, int(torch.mean(boxes[:, 3])))
    boxes[:, 1:] = xywh2xyxy(boxes[:, 1:])
    feat = roi_align(input, boxes, output_size=(o_h, o_w))
    out = self.bn(feat)
    out = self.act(out)
    out = self.pooling(out)
    out = self.linear(out.squeeze())
    return out
def extract_features(args):
    """
    Run the model over the dataset and store roi-pooled box features on disk.

    For every batch the model output is pooled to 1x1 per box with
    ``roi_align`` and written into ``<args.output_dir>/data.npy``, a float32
    array of shape (dataset size, args.num_boxes, out_channels).

    Args:
        args: namespace providing at least ``arch``, ``output_dir``,
            ``num_boxes`` and ``batch_size`` (plus whatever ``get_model``,
            ``get_bbox`` and ``get_dataloader`` read).

    Raises:
        ValueError: if ``args.arch`` is not one of the supported backbones.
    """
    model = get_model(args)
    bboxes, keys = get_bbox(args)
    loader = get_dataloader(args, keys)
    N = len(loader.dataset)

    # Channel count of the backbone output; fail early and clearly for an
    # unknown arch instead of hitting an unbound ``out_channels`` later on.
    channels_by_arch = {'resnet152': 2048, 'vgg16': 512}
    if args.arch not in channels_by_arch:
        raise ValueError(
            "Unsupported arch {!r}; expected one of {}".format(
                args.arch, sorted(channels_by_arch)))
    out_channels = channels_by_arch[args.arch]

    fp = open_memmap(
        os.path.join(args.output_dir, 'data.npy'),
        mode='w+',
        dtype=np.float32,
        shape=(N, args.num_boxes, out_channels),
    )
    with torch.no_grad():
        for i, images in tqdm(enumerate(loader), total=len(loader)):
            images = images.cuda()
            output = model(images)
            current_batch_size = images.shape[0]
            # Offset of this batch inside the dataset (the last batch may be short).
            current_index = i * args.batch_size
            current_boxes = bboxes[current_index: current_index + current_batch_size]
            current_boxes = [b.cuda() for b in current_boxes]
            output = roi_align(output, current_boxes, (1, 1))
            fp[current_index: current_index + current_batch_size] = output.view(
                current_batch_size, args.num_boxes, out_channels).cpu().numpy()
    print(fp[N - 1])
    # Dropping the last reference releases the memory-mapped array.
    del fp
    loader.dataset.save_indices(args.output_dir)
def forward(self, input, rois):
    """
    Apply ``roi_align`` with this module's configured pooling parameters.

    Args:
        input: NCHW images
        rois: Bx5 boxes. First column is the index into N. The other 4
            columns are xyxy.
    """
    assert rois.dim() == 2
    assert rois.size(1) == 5
    # The boxes are cast to the dtype of the input before pooling.
    typed_rois = rois.to(dtype=input.dtype)
    pooled = roi_align(
        input,
        typed_rois,
        self.output_size,
        self.spatial_scale,
        self.sampling_ratio,
        self.aligned,
    )
    return pooled
def crop_tracklets(self, boxes, frame, ber=None):
    """
    Crop a square window around every box from ``frame``.

    Args:
        boxes: array indexed as (x_min, y_min, x_max, y_max) per row.
        frame: image tensor without batch axis (one is added for roi_align).
        ber: box expansion ratio; falls back to ``self.ber`` when None.

    Returns:
        tuple: (crops, new_boxes, box_scales) -- the ``self.cs`` x ``self.cs``
        crops, the (n, 5) roi array used to produce them, and the side length
        of each square window.
    """
    expansion = self.ber if ber is None else ber

    # Re-express the corner boxes as (x center, y center, width, height/width).
    xysr = np.zeros(boxes.shape)
    xysr[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
    xysr[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
    xysr[:, 2] = boxes[:, 2] - boxes[:, 0]
    # The small epsilon avoids a division by zero for zero-width boxes.
    xysr[:, 3] = (boxes[:, 3] - boxes[:, 1]) / (xysr[:, 2] + 1e-07)

    center_x, center_y = xysr[:, 0], xysr[:, 1]
    widths = xysr[:, 2]
    heights = xysr[:, 2] * xysr[:, 3]
    # Square window: the larger of width and height, then expanded.
    box_scales = np.maximum(widths, heights)
    box_scales = box_scales * expansion
    half_side = box_scales / 2

    # Column 0 stays zero: it is the batch index for roi_align (single frame).
    new_boxes = np.zeros([len(xysr), 5])
    new_boxes[:, 1] = center_x - half_side
    new_boxes[:, 2] = center_y - half_side
    new_boxes[:, 3] = center_x + half_side
    new_boxes[:, 4] = center_y + half_side

    torch_boxes = torch.from_numpy(new_boxes).float().to(self.device)
    crops = roi_align(frame.unsqueeze(0), torch_boxes, (self.cs, self.cs))
    return crops, new_boxes, box_scales
def extract_patch_from_frame(image, coordinates, output_shape):
    """
    Cut one patch per image with ``roi_align``.

    This should be the inverse operation to translate_and_scale_patch.

    Args:
        image: batched image tensor; patch ``i`` is taken from image ``i``.
        coordinates: rows of (x_center, y_center, w, h).
        output_shape: spatial size of the returned patches.
    """
    # NOTE(review): the x extent is scaled by image.shape[-2] and the y extent
    # by image.shape[-1], exactly as in the original -- confirm this matches
    # translate_and_scale_patch.
    half_extent_x = image.shape[-2] * coordinates[:, 2] / 2
    half_extent_y = image.shape[-1] * coordinates[:, 3] / 2

    # Convert (x_center, y_center, w, h) into (minx, miny, maxx, maxy).
    corners = torch.zeros_like(coordinates)
    corners[:, 0] = coordinates[:, 0] - half_extent_x
    corners[:, 1] = coordinates[:, 1] - half_extent_y
    corners[:, 2] = coordinates[:, 0] + half_extent_x
    corners[:, 3] = coordinates[:, 1] + half_extent_y

    # Row i samples from image i, so the batch-index column is just 0..n-1.
    image_index = torch.arange(
        corners.shape[0], dtype=corners.dtype, device=corners.device
    ).view(-1, 1)
    rois = torch.cat((image_index, corners), dim=1)
    return roi_align(image, rois, output_shape, aligned=True)
def MultiScaleRoiAlign(self, fpn_feat_list, proposals, P=7):
    """
    Pool every proposal from the pyramid level that matches its size.

    Args:
        fpn_feat_list: feature maps, one per pyramid level.
        proposals: list of per-image box tensors with columns (x1, y1, x2, y2).
        P: side length of the pooled output.

    Returns:
        Tensor of shape (total proposals, 256 * P * P) on ``self.device``.
    """
    proposals = torch.cat(proposals)
    x1, y1, x2, y2 = proposals.T
    box_w = torch.abs(x1 - x2)
    box_h = torch.abs(y1 - y2)

    # k = floor(4 + log2(sqrt(w * h) / 224)), clipped to [2, 5], then shifted so
    # that it indexes fpn_feat_list from 0.
    level_of_box = (4 + torch.log2(torch.sqrt(box_w * box_h) / 224)).floor().clamp(min=2, max=5)
    level_of_box -= 2

    feature_vectors = torch.zeros((len(proposals), 256 * P * P)).to(self.device)
    for lvl, level_feat in enumerate(fpn_feat_list):
        selected = torch.where(level_of_box == lvl)
        # NOTE(review): assumes exactly 200 proposals per image when deriving
        # the image index from the flat proposal index.
        img_id = selected[0] // 200
        rois = torch.cat((img_id.unsqueeze(dim=1), proposals[selected]), dim=1)
        # Rescale from image coordinates (hard-coded 1088 x 800) to this level.
        rois[:, (1, 3)] *= level_feat.shape[-1] / 1088  # x-coords
        rois[:, (2, 4)] *= level_feat.shape[-2] / 800  # y-coords
        pooled = ops.roi_align(level_feat, boxes=rois, output_size=P)
        pooled = torch.flatten(pooled, -3, -1)
        feature_vectors[selected] = pooled.to(self.device)
    return feature_vectors
def get_local_features(self, features, boxes, picture_width, picture_height):
    """
    Pool one feature vector per image from the region given by its box.

    Box ``i`` is pooled from ``features[i]`` (the batch index is simply the
    row number). Rows whose first coordinate is -1 are treated as "no box"
    and receive the globally average-pooled features of their image instead.

    Args:
        features: 4-D feature tensor.
        boxes: (N, 4) tensor of box coordinates in picture units, one row per
            image; ``boxes[:, 0] == -1`` marks a missing box.
        picture_width: divisor used for box columns 1 and 3.
        picture_height: divisor used for box columns 0 and 2.

    Returns:
        Tensor of shape (N, C). The batch axis is always kept, also for N == 1.
    """
    # NOTE(review): names kept from the original; shape[3] scales columns 0/2
    # and shape[2] scales columns 1/3 -- confirm the intended axis order.
    features_heights = features.shape[3]
    features_width = features.shape[2]

    boxes_copy = boxes.clone()
    boxes_copy[:, 0] = (boxes_copy[:, 0] * features_heights) / picture_height
    boxes_copy[:, 2] = (boxes_copy[:, 2] * features_heights) / picture_height
    boxes_copy[:, 1] = (boxes_copy[:, 1] * features_width) / picture_width
    boxes_copy[:, 3] = (boxes_copy[:, 3] * features_width) / picture_width

    # Build the batch-index column on the boxes' device rather than assuming CUDA.
    batch = torch.arange(boxes_copy.shape[0], device=boxes_copy.device).unsqueeze(1).float()
    box_input = torch.cat((batch, boxes_copy), dim=1)

    # flatten(1) drops only the two 1x1 spatial axes. A bare squeeze() would
    # also drop the batch axis for a single box and break the masking below.
    pooled = ops.roi_align(features, box_input, (1, 1)).flatten(1)
    missing_box = boxes[:, 0] == -1
    global_pooled = F.adaptive_avg_pool2d(features, (1, 1)).flatten(1)
    pooled[missing_box, :] = global_pooled[missing_box, :]
    return pooled
def project_masks_on_boxes(true_masks: Tensor, boxes: Tensor, matched_indexes: Tensor, M: int) -> Tensor:
    """
    Resample the ground-truth mask under each box onto an ``M x M`` grid.

    Args:
        true_masks: stack of segmentation masks.
        boxes: one box per row; each is cropped from the mask selected by the
            matching entry of ``matched_indexes``.
        matched_indexes: index into ``true_masks`` for every box.
        M: side length of the output masks.

    Returns:
        The cropped and resized masks, one per box, used as loss targets.
    """
    index_column = matched_indexes.to(boxes)
    index_column = index_column[:, None]
    rois = torch.cat([index_column, boxes], dim=1)
    # roi_align wants a channel axis and inputs that match the rois' dtype/device.
    masks_with_channel = true_masks[:, None].to(rois)
    aligned = roi_align(masks_with_channel, rois, (M, M), 1.)
    return aligned[:, 0]
def adaptive_feature_pooling(scaled_features, rois):
    """
    Pool the same rois from every pyramid level and fuse them by element-wise max.

    Args:
        scaled_features: sequence of feature maps; level ``i`` is addressed with
            the rois scaled by ``2**i * 28``.
        rois: box tensor in the common (unscaled) reference frame; all boxes are
            taken from the first image of each feature map.

    Returns:
        Tensor with the 14x14 pooled features, max-fused over all levels.

    Raises:
        ValueError: if ``scaled_features`` is empty.
    """
    base_size = 28
    roi_size = 14
    if len(scaled_features) == 0:
        raise ValueError("scaled_features must contain at least one level")

    rois_pool = None
    for i in range(len(scaled_features) - 1, -1, -1):
        # Scale from the ORIGINAL rois for every level. Re-assigning ``rois``
        # here would compound the factors from one level to the next.
        level_rois = rois * 2**i * base_size
        pooled = ops.roi_align(input=scaled_features[i],
                               boxes=[level_rois],
                               output_size=roi_size)
        rois_pool = pooled if rois_pool is None else torch.maximum(rois_pool, pooled)
    return rois_pool
def __call__(self, pred__, feature):
    """
    Crop an enlarged square region around each detection.

    The raw predictions are filtered with ``non_max_suppression_refine``; for
    each image at most the first 20 boxes are kept. Every box is replaced by a
    square centred on it with side ``2 * max(w, h)``, clamped to the tensor
    bounds, and cropped to 32x32 with ``roi_align``.

    Args:
        pred__: raw predictions; after NMS each entry has the box as its first
            four columns (x1, y1, x2, y2).
        feature: sequence whose first element is the 4-D tensor to crop from.

    Returns:
        tuple: (crops, boxes) -- the pooled crops and the per-image list of
        square boxes that produced them.
    """
    feature = feature[0]
    preds = non_max_suppression_refine(pred__, self.conf_thres, self.iou_thres, classes=None)
    # Dim 2 is the height and dim 3 the width of an NCHW tensor; y must be
    # clamped against the height and x against the width.
    pic_h, pic_w = feature.shape[2], feature.shape[3]

    boxes = []
    for pred_ in preds:
        pred = pred_[:20, 0:4]
        # Images without detections contribute an empty box tensor.
        prd = torch.zeros_like(pred).to(feature.device)
        if pred.shape[0] != 0:
            w = pred[:, 2] - pred[:, 0]
            h = pred[:, 3] - pred[:, 1]
            c_x = (pred[:, 2] + pred[:, 0]) / 2
            c_y = (pred[:, 3] + pred[:, 1]) / 2
            half_side = torch.max(w, h) * 2 / 2
            prd[:, 0] = (c_x - half_side).clamp(0, pic_w - 1)
            prd[:, 1] = (c_y - half_side).clamp(0, pic_h - 1)
            prd[:, 2] = (c_x + half_side).clamp(0, pic_w - 1)
            prd[:, 3] = (c_y + half_side).clamp(0, pic_h - 1)
        boxes.append(prd)

    per_fear = ops.roi_align(feature, boxes, [32, 32])
    return per_fear, boxes
def crop_tracklets(self, pre_locations, frame):
    """
    Crop a square window around every a-priori object location.

    Args:
        pre_locations: mapping of object id -> state; the first four entries of
            each state are read as (x_min, y_min, x_max, y_max). May be empty.
        frame: image tensor without batch axis (one is added for roi_align).

    Returns:
        tuple: (crops, new_boxes, box_ids, box_scales) -- the ``self.cs`` x
        ``self.cs`` crops, the (n, 5) roi array, the object ids in row order
        and the side length of each square window.
    """
    start = time.time()

    box_ids = list(pre_locations)
    box_list = [pre_locations[obj_id][:4] for obj_id in box_ids]
    # reshape keeps the array 2-D even when there are no objects to crop.
    boxes = np.array(box_list).reshape(-1, 4)

    # Re-express the corner boxes as (x center, y center, width, height/width).
    xysr = np.zeros(boxes.shape)
    xysr[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
    xysr[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
    xysr[:, 2] = boxes[:, 2] - boxes[:, 0]
    # Epsilon guards against zero-width boxes.
    xysr[:, 3] = (boxes[:, 3] - boxes[:, 1]) / (xysr[:, 2] + 1e-07)

    # Square window: the smaller of width and height, then expanded by self.ber.
    box_scales = np.minimum(xysr[:, 2], xysr[:, 2] * xysr[:, 3])
    box_scales = box_scales * self.ber
    half_side = box_scales / 2

    # Column 0 stays zero: it is the batch index for roi_align (single frame).
    new_boxes = np.zeros([len(xysr), 5])
    new_boxes[:, 1] = xysr[:, 0] - half_side
    new_boxes[:, 3] = xysr[:, 0] + half_side
    new_boxes[:, 2] = xysr[:, 1] - half_side
    new_boxes[:, 4] = xysr[:, 1] + half_side

    torch_boxes = torch.from_numpy(new_boxes).float().to(self.device)
    crops = roi_align(frame.unsqueeze(0), torch_boxes, (self.cs, self.cs))

    self.time_metrics['pre_localize and align'] += time.time() - start
    return crops, new_boxes, box_ids, box_scales
def main():
    """
    Print the result of the TF crop-and-resize helper next to torchvision's
    ``roi_align`` for the same box on a noisy 32x32 ramp image.
    """
    import torch
    import numpy as np
    import random
    from detectron2.structures import Boxes
    from detectron2.modeling.poolers import ROIPooler
    from torchvision.ops import roi_align

    # Deterministic test image: a ramp 0..1023 plus uniform noise.
    np.random.seed(4)
    noise = np.random.uniform(0, 1, [32, 32, 1])
    img = np.arange(32 * 32).reshape(32, 32, 1).astype("float32")
    img += noise

    # TensorFlow path.
    tf_inputs = tf.reshape(tf.convert_to_tensor(img, tf.float32), [1, 1, 32, 32, 1])
    tf_boxes = tf.constant([[[1, 1, 17, 17]]], dtype=tf.float32)
    pooled = _selective_crop_and_resize(
        tf_inputs,
        tf_boxes,
        tf.constant([[0]]),
        tf.constant([[31, 31]], tf.float32),
        5, 5, 1, 0.5, True)
    print(pooled[0, 0, :, :, 0])

    # torchvision path: HWC -> NCHW image, and a single (index, x1, y1, x2, y2) roi.
    chw = img.transpose(2, 0, 1)[None, :, :].astype("float32")
    torch_inputs = torch.from_numpy(chw)
    roi_row = np.array([0, 1, 1, 17, 17]).astype("float32")
    rois = torch.from_numpy(roi_row)[None, :]
    output = roi_align(torch_inputs, rois, (5, 5), 1, 0, True)
    output = output.permute(0, 2, 3, 1)
    print(output[0, :, :, 0])
def compute_mask_loss(self, mask_predict, positive_gt_idx, box_predicts, targets):
    """
    Compute the mask loss between predicted masks and box-aligned targets.

    For every image the ground-truth masks of the positive matches are cropped
    to the predicted boxes with ``roi_align`` and resized to the resolution of
    the predicted masks; the predictions for the matched class are compared to
    them with ``self.bce``.

    Args:
        mask_predict: per-image predicted masks, indexed as (sample, class, ...).
        positive_gt_idx: per-image indices of the matched ground-truth entries.
        box_predicts: per-image predicted boxes, one per positive sample.
        targets: dict with 'mask', 'target' and 'batch_len'; column 0 of
            'target' is read as the class index.
    """
    per_image_len = targets['batch_len']
    masks_per_image = targets['mask'].split(per_image_len)
    rows_per_image = targets['target'].split(per_image_len)

    pred_chunks = []
    target_chunks = []
    per_image = zip(masks_per_image, rows_per_image, mask_predict, positive_gt_idx, box_predicts)
    for gt_masks, gt_rows, pred_masks, pos_idx, pred_boxes in per_image:
        class_ids = gt_rows[:, 0].long()[pos_idx]
        matched_masks = gt_masks[pos_idx]
        # Keep only the mask channel of each sample's matched class.
        pred_masks = pred_masks[range(len(pred_masks)), class_ids, :]
        # Sample k is cropped from matched mask k, so the roi index is 0..n-1.
        row_index = torch.arange(len(pred_boxes), device=pred_boxes.device)[:, None]
        rois = torch.cat([row_index, pred_boxes], dim=-1)
        side = pred_masks.shape[-1]
        aligned_target = roi_align(matched_masks[:, None, :, :], rois, (side, side), 1.)[:, 0]
        target_chunks.append(aligned_target)
        pred_chunks.append(pred_masks)

    all_predictions = torch.cat(pred_chunks)
    all_targets = torch.cat(target_chunks)
    return self.bce(all_predictions, all_targets)
def roi_pooler(fpn_fms, rois, stride, pool_shape, pooler_type):
    """
    Pool every roi from the FPN level it is assigned to.

    Args:
        fpn_fms: feature maps, one per level.
        rois: roi tensor passed to ``roi_align`` (and to the level assignment).
        stride: stride of each level; must be consecutive powers of two.
        pool_shape: (height, width) of the pooled output.
        pooler_type: "ROIAlign" (aligned=False) or "ROIAlignV2" (aligned=True).

    Raises:
        ValueError: for any other ``pooler_type``.
    """
    if pooler_type not in ("ROIAlign", "ROIAlignV2"):
        raise ValueError("Unknown pooler type: {}".format(pooler_type))
    pooler_aligned = pooler_type == "ROIAlignV2"

    assert len(fpn_fms) == len(stride)
    min_level = int(math.log2(stride[0]))
    max_level = int(math.log2(stride[-1]))
    assert (len(stride) == max_level - min_level + 1)

    level_assignments = assign_boxes_to_levels(rois, min_level, max_level, 224, 4)

    first_map = fpn_fms[0]
    pooled_shape = (len(rois), first_map.shape[1], pool_shape[0], pool_shape[1])
    output = torch.zeros(pooled_shape, dtype=first_map.dtype, device=first_map.device)

    for level, (fm_level, scale_level) in enumerate(zip(fpn_fms, stride)):
        in_level = level_assignments == level
        inds = torch.nonzero(in_level, as_tuple=False).squeeze(1)
        output[inds] = roi_align(
            fm_level,
            rois[inds],
            pool_shape,
            spatial_scale=1.0/scale_level,
            sampling_ratio=-1,
            aligned=pooler_aligned,
        )
    return output
def torchvision_roi_align(self, features, proposals, spatial_levels):
    """
    Pool each proposal from the feature level named in ``spatial_levels``.

    Args:
        features: per-level feature tensors without batch axis.
        proposals: box tensor, one row per proposal.
        spatial_levels: level index of every proposal.

    Returns:
        CUDA tensor of shape (number of proposals, 256, 7, 7).
    """
    output_features = torch.zeros((len(proposals), 256, 7, 7)).cuda()
    for level, scale in enumerate(self.pooler_scales):
        idxs = torch.where(spatial_levels == level)[0]
        if len(idxs) == 0:
            # Nothing assigned to this level.
            continue
        level_feature = features[level].unsqueeze(0)
        level_boxes = [proposals[idxs]]
        pooled = ops.roi_align(level_feature,
                               level_boxes,
                               output_size=self.output_size,
                               spatial_scale=scale,
                               sampling_ratio=2)
        output_features[idxs, :, :, :] += pooled
    return output_features
def test_2():
    """Check pooled box-pair features against masked roi_align on the union boxes."""
    num_pairs = 256

    def random_rois():
        # Random (x1, y1, w, h) turned into corners, with a zero batch index column.
        corners = torch.rand(num_pairs, 4) * 256
        corners[:, 2:] += corners[:, :2]
        return torch.cat([torch.zeros(num_pairs, 1), corners], 1)

    f = torch.rand(1, 3, 512, 512)
    boxes_h = random_rois()
    boxes_o = random_rois()

    # Smallest box enclosing each (human, object) pair.
    boxes_union = torch.zeros_like(boxes_h)
    for col in (1, 2):
        boxes_union[:, col] = torch.min(boxes_h[:, col], boxes_o[:, col])
    for col in (3, 4):
        boxes_union[:, col] = torch.max(boxes_h[:, col], boxes_o[:, col])

    m = MaskedBoxPairPool(
        output_size=7,
        spatial_scale=[1.0],
        sampling_ratio=4
    )
    # Pooled box pair features from the module under test.
    out1 = m([f], [boxes_h[:, 1:]], [boxes_o[:, 1:]])

    # Reference: mask the feature map per pair, then pool the union box from
    # the matching masked copy.
    masks = m.construct_masks_for_box_pairs(f, 0, boxes_h, boxes_o)
    f_stacked = f[boxes_union[:, 0].long()] * masks
    boxes_union[:, 0] = torch.arange(num_pairs)
    out2 = roi_align(f_stacked, boxes_union,
                     output_size=(7, 7),
                     spatial_scale=1.0,
                     sampling_ratio=4)

    # The two feature maps should be exactly the same.
    assert out1.shape == out2.shape, \
        "Inconsistent feature map size"
    matched = torch.eq(out1, out2).sum()
    total = torch.as_tensor(out1.shape).prod()
    print("Feature maps are {}% matched.".format(100 * matched / total))
def forward(self, features, rois):
    """
    Pool every roi from its assigned feature level, image by image.

    Args:
        features: list of batched feature maps, one per level.
        rois: per-image roi tensors; columns 1..4 hold the box and column 6 the
            level index.

    Returns:
        tuple: (flattened features of shape (batch * n_roi, -1), batch_size,
        n_roi, c, h, w).
    """
    batch_size, _, height_0, width_0 = features[0].size()
    num_levels = len(features)

    per_image = []
    for b in range(batch_size):
        image_rois = rois[b]
        level_chunks = []
        # Walk the levels from last to first.
        for i in reversed(range(num_levels)):
            on_level = image_rois[:, 6] == i
            if torch.sum(on_level) == 0:
                continue
            level_rois = image_rois[on_level]
            rois_cords = self.resize_rois(level_rois[:, 1:5], features[i], height_0, width_0)
            # The boxes are passed as a one-element list (one image in the batch).
            pooled = roi_align(features[i][b:b+1], [rois_cords],
                               output_size=(self.aligned_width, self.aligned_height))
            pooled = F.avg_pool2d(pooled, kernel_size=2, stride=1)
            level_chunks.append(pooled)
        image_feats = torch.unsqueeze(torch.cat(level_chunks, dim=0), dim=0)
        per_image.append(image_feats)

    batch_pooled_feats = torch.cat(per_image, dim=0)
    batch_size, n_roi, c, h, w = batch_pooled_feats.size()
    batch_pooled_feats = batch_pooled_feats.view(batch_size * n_roi, -1)
    return batch_pooled_feats, batch_size, n_roi, c, h, w
def forward(self, x, coords):
    """
    Sample the feature map ``x`` at the given point coordinates.

    Each point is turned into a degenerate roi (both corners equal) and pooled
    to 1x1 with ``roi_align``.

    Args:
        x: batched feature map.
        coords: tensor of shape (batch_size, num_points, 2); the two coordinate
            columns are swapped before being used as roi corners.

    Returns:
        Tensor of shape (batch_size * num_points, -1), rows ordered batch-major
        like ``coords.reshape(-1, 2)``.
    """
    self.cshape = coords.shape
    batch_size, num_points = self.cshape[0], self.cshape[1]
    ctx_kwarg = self._ctx_kwarg(coords)

    coords = torch.reshape(coords, shape=(-1, 2))
    # Reverse the column order; the index lives on the coords' device instead
    # of a hard-coded CUDA device.
    idx = torch.arange(coords.shape[1] - 1, -1, -1, device=coords.device)
    coords = torch.index_select(coords, 1, idx)
    if self.extraction_method == 'ROIAlign':
        coords = coords - 0.5 / self.spatial_scale
    # Degenerate box: both corners are the (possibly shifted) point.
    rois = torch.cat((coords, coords), dim=1)

    bi = torch.arange(start=0, end=batch_size, step=1, **ctx_kwarg)
    # Rows are batch-major (all points of image 0, then image 1, ...), so each
    # batch index must be repeated num_points times in a row. Tensor.repeat
    # would tile 0,1,..,0,1,.. and pair points with the wrong image.
    bi = bi.repeat_interleave(num_points)
    bi = torch.reshape(bi, shape=(-1, 1)).type(torch.float32)
    rois = torch.cat((bi.to(rois.device), rois.float()), dim=1)

    w = roi_align(x, rois, (1, 1), spatial_scale=self.spatial_scale)
    w = torch.reshape(w, shape=(w.shape[0], -1))
    return w
def skip_track(track_path, tracker, det_step = 1, srr = 0, ber = 1, PLOT = True):
    """
    Track objects through a sequence, alternating between full-frame detection
    and crop-based localization.

    On frames where ``frame_num % det_step < init_frames`` the detector is run
    and its detections are matched to the tracker's predicted objects; on all
    other frames each predicted object is cropped with ``roi_align`` and
    refined by the localizer.

    Args:
        track_path: passed to ``load_all_frames`` to obtain the frames.
        tracker: object providing ``predict``, ``objs``, ``update``, ``add``
            and ``remove``.
        det_step: detection period in frames; also used as the number of
            frames an object may go undetected before it is removed.
        srr: weight of the localizer's scale/ratio output; (1 - srr) is the
            weight of the prior scale/ratio.
        ber: box expansion ratio applied to the crop windows.
        PLOT: when true, ``plot`` is called for every frame.

    Returns:
        tuple: (final_output, frames per second, time_metrics), where
        final_output holds, per frame, a list of dicts with keys "id",
        "class_num" and "bbox".

    NOTE(review): the original indentation of this function was lost; the
    nesting below was reconstructed from the statement order and should be
    checked against the upstream source (in particular where the two
    "removals" sections and the final ``empty_cache`` sit).
    """
    init_frames = 3
    fsld_max = det_step
    # CUDA for PyTorch
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.cuda.empty_cache()
    # get CNNs
    # NOTE(review): detector/localizer are assigned inside this function, so
    # they are locals and reading them here raises before the assignment; the
    # bare except therefore always ends up calling load_models.
    try:
        detector
        localizer
    except:
        detector,localizer = load_models(device)
    localizer.eval()
    # Loop Setup
    frames,n_frames = load_all_frames(track_path,det_step,init_frames,cutoff = None)
    frame_num = 0 # iteration counter
    next_obj_id = 0 # next id for a new object (incremented during tracking)
    fsld = {} # fsld[id] stores frames since last detected for object id
    all_tracks = {} # stores states for each object
    all_classes = {} # stores class evidence for each object
    # for keeping track of what's using up time
    time_metrics = {
        "gpu_load":0,
        "predict":0,
        "pre_localize and align":0,
        "localize":0,
        "post_localize":0,
        "detect":0,
        "parse":0,
        "match":0,
        "match2":0,
        "update":0,
        "add and remove":0,
        "store":0,
        "plot":0
        }
    # 3. Main Loop
    for (frame,dim,original_im) in frames:
        # 1. Move image to GPU
        start = time.time()
        frame = frame.to(device,non_blocking = True)
        # dim is only needed on detection frames
        if frame_num % det_step < init_frames:
            dim = dim.to(device,non_blocking = True)
        time_metrics['gpu_load'] += time.time() - start
        # 2. Predict next object locations
        start = time.time()
        try: # in the case that there are no active objects will throw exception
            tracker.predict()
            pre_locations = tracker.objs()
        except:
            pre_locations = []
        time_metrics['predict'] += time.time() - start
        if frame_num % det_step < init_frames: #Use YOLO
            # 3a. YOLO detect
            detections = detector.detect2(frame,dim)
            torch.cuda.synchronize(device)
            time_metrics['detect'] += time.time() - start
            start = time.time()
            detections = detections.cpu()
            time_metrics['gpu_load'] += time.time() - start
            # postprocess detections
            start = time.time()
            detections = parse_detections(detections)
            time_metrics['parse'] += time.time() - start
            # 4a. Match, using Hungarian Algorithm
            start = time.time()
            pre_ids = []
            pre_loc = []
            for id in pre_locations:
                pre_ids.append(id)
                pre_loc.append(pre_locations[id])
            pre_loc = np.array(pre_loc)
            # matchings[i] = [a,b] where a is index of pre_loc and b is index of detection
            matchings = match_hungarian(pre_loc,detections[:,:4],iou_cutoff = 0.05)
            time_metrics['match'] += time.time() - start
            # 5a. Update tracked objects
            start = time.time()
            update_array = np.zeros([len(matchings),4])
            update_ids = []
            for i in range(len(matchings)):
                a = matchings[i,0] # index of pre_loc
                b = matchings[i,1] # index of detections
                update_array[i,:] = detections[b,:4]
                update_ids.append(pre_ids[a])
                fsld[pre_ids[a]] = 0 # fsld = 0 since this id was detected this frame
            if len(update_array) > 0:
                tracker.update(update_array,update_ids)
            time_metrics['update'] += time.time() - start
            # 6a. For each detection not in matchings, add a new object
            start = time.time()
            new_array = np.zeros([len(detections) - len(matchings),4])
            new_ids = []
            cur_row = 0
            for i in range(len(detections)):
                if len(matchings) == 0 or i not in matchings[:,1]:
                    new_ids.append(next_obj_id)
                    new_array[cur_row,:] = detections[i,:4]
                    fsld[next_obj_id] = 0
                    # one 7-value state row per frame; 13 class-evidence counters
                    all_tracks[next_obj_id] = np.zeros([n_frames,7])
                    all_classes[next_obj_id] = np.zeros(13)
                    next_obj_id += 1
                    cur_row += 1
            if len(new_array) > 0:
                tracker.add(new_array,new_ids)
            # 7a. For each untracked object, increment fsld
            for i in range(len(pre_ids)):
                # the except path covers matchings that cannot be indexed with [:,0]
                try:
                    if i not in matchings[:,0]:
                        fsld[pre_ids[i]] += 1
                except:
                    fsld[pre_ids[i]] += 1
            # 8a. remove lost objects
            removals = []
            for id in pre_ids:
                if fsld[id] > fsld_max:
                    removals.append(id)
            if len(removals) > 0:
                tracker.remove(removals)
            time_metrics['add and remove'] += time.time() - start
        elif True: # use Resnet
            # 3b. crop tracked objects from image
            start = time.time()
            # use predicted states to crop relevant portions of frame
            box_ids = []
            box_list = []
            # convert to array
            for id in pre_locations:
                box_ids.append(id)
                box_list.append(pre_locations[id][:4])
            boxes = np.array(box_list)
            # convert xysr boxes into xmin xmax ymin ymax
            # first row of zeros is batch index (batch is size 0) for ROI align
            new_boxes = np.zeros([len(boxes),5])
            # use the smaller of s and s x r for both dimensions, so the crop is square
            box_scales = np.min(np.stack((boxes[:,2],boxes[:,2]*boxes[:,3]),axis = 1),axis = 1)
            #expand box slightly
            box_scales = box_scales * ber# box expansion ratio
            new_boxes[:,1] = boxes[:,0] - box_scales/2
            new_boxes[:,3] = boxes[:,0] + box_scales/2
            new_boxes[:,2] = boxes[:,1] - box_scales/2
            new_boxes[:,4] = boxes[:,1] + box_scales/2
            torch_boxes = torch.from_numpy(new_boxes).float().to(device)
            if True: # mask other bboxes
                # these boxes are not square
                rect_boxes = np.zeros([len(boxes),4])
                rect_boxes[:,0] = boxes[:,0] - boxes[:,2] / 2.0
                rect_boxes[:,1] = boxes[:,1] - boxes[:,2] * boxes[:,3] / 2.0
                rect_boxes[:,2] = boxes[:,0] + boxes[:,2] / 2.0
                rect_boxes[:,3] = boxes[:,1] + boxes[:,2] * boxes[:,3] / 2.0
                rect_boxes = rect_boxes.astype(int)
                # blank out every object, then make one frame copy per object
                frame_copy = frame.clone()
                for rec in rect_boxes:
                    frame_copy[:,rec[1]:rec[3],rec[0]:rec[2]] = 0
                frame_copy = frame_copy.unsqueeze(0).repeat(len(boxes),1,1,1)
                # in each crop, replace active box with correct pixels
                for i in range(len(rect_boxes)):
                    torch_boxes[i,0] = i # so images are indexed correctly
                    rec = rect_boxes[i]
                    frame_copy[i,:,rec[1]:rec[3],rec[0]:rec[2]] = frame[:,rec[1]:rec[3],rec[0]:rec[2]]
            else:
                frame_copy = frame.unsqueeze(0)
            # crop using roi align
            crops = roi_align(frame_copy,torch_boxes,(224,224))
            time_metrics['pre_localize and align'] += time.time() - start
            # 4b. Localize objects using localizer
            start= time.time()
            cls_out,reg_out = localizer(crops)
            torch.cuda.synchronize()
            time_metrics['localize'] += time.time() - start
            start = time.time()
            if False:
                test_outputs(reg_out,crops)
            # store class predictions
            highest_conf,cls_preds = torch.max(cls_out,1)
            for i in range(len(cls_preds)):
                all_classes[box_ids[i]][cls_preds[i].item()] += 1
            # 5b. convert to global image coordinates
            # these detections are relative to crops - convert to global image coords
            wer = 3 # window expansion ratio, was set during training
            detections = (reg_out* 224*wer - 224*(wer-1)/2)
            detections = detections.data.cpu()
            # add in original box offsets and scale outputs by original box scales
            detections[:,0] = detections[:,0]*box_scales/224 + new_boxes[:,1]
            detections[:,2] = detections[:,2]*box_scales/224 + new_boxes[:,1]
            detections[:,1] = detections[:,1]*box_scales/224 + new_boxes[:,2]
            detections[:,3] = detections[:,3]*box_scales/224 + new_boxes[:,2]
            # convert into xysr form
            output = np.zeros([len(detections),4])
            output[:,0] = (detections[:,0] + detections[:,2]) / 2.0
            output[:,1] = (detections[:,1] + detections[:,3]) / 2.0
            output[:,2] = (detections[:,2] - detections[:,0])
            output[:,3] = (detections[:,3] - detections[:,1]) / output[:,2]
            #lastly, replace scale and ratio with original values
            ## NOTE this is kind of a cludgey fix and ideally localizer should be better
            output[:,2:4] = srr*output[:,2:4] + (1-srr)*boxes[:,2:4]
            time_metrics['post_localize'] += time.time() - start
            detections = output
            # 6b. Update tracker
            start = time.time()
            # map regressed bboxes directly to objects for update step
            tracker.update(output,box_ids)
            time_metrics['update'] += time.time() - start
            # 7b. increment all fslds
            # NOTE(review): pre_ids is only assigned in the detection branch, so
            # this uses the ids collected on the most recent detection frame.
            for i in range(len(pre_ids)):
                fsld[pre_ids[i]] += 1
            # Low confidence removals
            if True:
                removals = []
                locations = tracker.objs()
                for i in range(len(box_ids)):
                    if highest_conf[i] < 3 and box_ids[i] in locations:
                        removals.append(box_ids[i])
                        print("Removed low confidence object")
                tracker.remove(removals)
        # IOU suppression on overlapping bounding boxes
        if True:
            removals = []
            locations = tracker.objs()
            for i in locations:
                for j in locations:
                    if i != j:
                        iou_metric = iou(locations[i],locations[j])
                        if iou_metric > 0.5:
                            # determine which object has been around longer
                            # NOTE(review): this compares array lengths, not
                            # accumulated evidence -- confirm this is intended.
                            if len(all_classes[i]) > len(all_classes[j]):
                                removals.append(j)
                            else:
                                removals.append(i)
            removals = list(set(removals))
            tracker.remove(removals)
        # 9. Get all object locations and store in output dict
        start = time.time()
        post_locations = tracker.objs()
        for id in post_locations:
            all_tracks[id][frame_num,:] = post_locations[id][:7]
        time_metrics['store'] += time.time() - start
        # 10. Plot
        start = time.time()
        if PLOT:
            # class_dict is not defined in this function; presumably module-level
            plot(original_im,detections,post_locations,all_classes,class_dict,frame = frame_num)
        time_metrics['plot'] += time.time() - start
        # increment frame counter
        if frame_num % 1000 == 0:
            print("Finished frame {}".format(frame_num))
        frame_num += 1
        torch.cuda.empty_cache()
    cv2.destroyAllWindows()
    del frames
    # total time is the sum over all timed operations
    total_time = 0
    for key in time_metrics:
        total_time += time_metrics[key]
    if False:
        print("Finished file {} for det_step {}".format(track_path,det_step))
        print("\n\nTotal Framerate: {:.2f} fps".format(n_frames/total_time))
        print("---------- per operation ----------")
        for key in time_metrics:
            print("{:.3f}s ({:.2f}%) on {}".format(time_metrics[key],time_metrics[key]/total_time*100,key))
    #write final output
    final_output = []
    for frame in range(n_frames):
        frame_objs = []
        for id in all_tracks:
            bbox = all_tracks[id][frame]
            # rows whose first value is 0 are skipped (no state stored for this frame)
            if bbox[0] != 0:
                obj_dict = {}
                obj_dict["id"] = id
                obj_dict["class_num"] = np.argmax(all_classes[id])
                # convert (x, y, s, r) back to corner coordinates
                x0 = bbox[0] - bbox[2]/2.0
                x1 = bbox[0] + bbox[2]/2.0
                y0 = bbox[1] - bbox[2]*bbox[3]/2.0
                y1 = bbox[1] + bbox[2]*bbox[3]/2.0
                obj_dict["bbox"] = np.array([x0,y0,x1,y1])
                frame_objs.append(obj_dict)
        final_output.append(frame_objs)
    return final_output, n_frames/total_time, time_metrics
def script_fn(input, rois, pool_size):
    # type: (Tensor, Tensor, int) -> Tensor
    """Run roi_align with spatial scale 1.0 and return the first pooled roi."""
    pooled = ops.roi_align(input, rois, pool_size, 1.0)
    first_roi = pooled[0]
    return first_roi
def forward(
    self,
    x: Dict[str, Tensor],
    boxes: List[Tensor],
    image_shapes: List[Tuple[int, int]],
) -> Tensor:
    """
    Pool every box from the feature level selected for it.

    Args:
        x (OrderedDict[Tensor]): feature maps for each level. They are assumed
            to have all the same number of channels, but they can have
            different sizes.
        boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling
            operation, in (x1, y1, x2, y2) format and in the image reference
            size, not the feature map reference. The coordinate must satisfy
            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        image_shapes (List[Tuple[height, width]]): the sizes of each image
            before they have been fed to a CNN to obtain feature maps. This
            allows us to infer the scale factor for each one of the levels to
            be pooled.

    Returns:
        result (Tensor)
    """
    # Keep only the feature maps this pooler was configured for.
    selected_maps = []
    for name, fmap in x.items():
        if name in self.featmap_names:
            selected_maps.append(fmap)
    num_levels = len(selected_maps)

    rois = self.convert_to_roi_format(boxes)
    # Scales are set up on first use.
    if self.scales is None:
        self.setup_scales(selected_maps, image_shapes)
    scales = self.scales
    assert scales is not None

    # With a single level there is nothing to dispatch.
    if num_levels == 1:
        return roi_align(
            selected_maps[0],
            rois,
            output_size=self.output_size,
            spatial_scale=scales[0],
            sampling_ratio=self.sampling_ratio,
        )

    mapper = self.map_levels
    assert mapper is not None
    levels = mapper(boxes)

    first_map = selected_maps[0]
    num_rois = len(rois)
    num_channels = first_map.shape[1]
    dtype, device = first_map.dtype, first_map.device
    result = torch.zeros(
        (num_rois, num_channels,) + self.output_size,
        dtype=dtype,
        device=device,
    )

    tracing_results = []
    for level, (per_level_feature, scale) in enumerate(zip(selected_maps, scales)):
        idx_in_level = torch.where(levels == level)[0]
        rois_per_level = rois[idx_in_level]
        result_idx_in_level = roi_align(
            per_level_feature,
            rois_per_level,
            output_size=self.output_size,
            spatial_scale=scale,
            sampling_ratio=self.sampling_ratio,
        )
        if torchvision._is_tracing():
            tracing_results.append(result_idx_in_level.to(dtype))
        else:
            # Under autocast the per-level outputs may come back in different
            # dtypes. The copy below is in-place, which autocast does not
            # touch, so the pooled values are cast to result's dtype by hand.
            result[idx_in_level] = result_idx_in_level.to(result.dtype)

    if torchvision._is_tracing():
        result = _onnx_merge_levels(levels, tracing_results)
    return result
box_scales = np.min(np.stack( (boxes[:, 2], boxes[:, 2] * boxes[:, 3]), axis=1), axis=1) #/2.0 #expand box slightly ber = 1 box_scales = box_scales * ber # box expansion ratio new_boxes[:, 1] = boxes[:, 0] - box_scales / 2 new_boxes[:, 3] = boxes[:, 0] + box_scales / 2 new_boxes[:, 2] = boxes[:, 1] - box_scales / 2 new_boxes[:, 4] = boxes[:, 1] + box_scales / 2 torch_boxes = torch.from_numpy(new_boxes).float().to(device) # crop using roi align crops = roi_align(frame.unsqueeze(0), torch_boxes, (224, 224)) # 4b. Localize objects using localizer cls_out, reg_out = localizer(crops) torch.cuda.synchronize() # 5b. convert to global image coordinates # these detections are relative to crops - convert to global image coords wer = 3 # window expansion ratio, was set during training detections = (reg_out * 224 * wer - 224 * (wer - 1) / 2) detections = detections.data.cpu() # add in original box offsets and scale outputs by original box scales
def test_qroialign(self):
    """Check that quantized RoIAlign stays close to the float version."""
    pool_size = 5
    img_size = 10
    n_channels = 2
    num_imgs = 1
    dtype = torch.float

    def make_rois(num_rois=1000):
        # Columns are (batch_index, x1, y1, x2, y2).
        boxes = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype)
        boxes[:, 0] = torch.randint(0, num_imgs, size=(num_rois,))  # set batch index
        boxes[:, 3:] += boxes[:, 1:3]  # make sure boxes aren't degenerate
        return boxes

    quant_params = ((1, 0), (2, 10), (0.1, 50))
    quant_dtypes = (torch.qint8, torch.quint8, torch.qint32)

    for aligned in (True, False):
        for scale, zero_point in quant_params:
            for qdtype in quant_dtypes:
                float_x = torch.randint(50, 100, size=(num_imgs, n_channels, img_size, img_size)).to(dtype)
                qx = torch.quantize_per_tensor(float_x, scale=scale, zero_point=zero_point, dtype=qdtype)

                float_rois = make_rois()
                qrois = torch.quantize_per_tensor(float_rois, scale=scale, zero_point=zero_point, dtype=qdtype)

                # Feed the float op the exact values the quantized op sees.
                x = qx.dequantize()
                rois = qrois.dequantize()

                y = ops.roi_align(
                    x,
                    rois,
                    output_size=pool_size,
                    spatial_scale=1,
                    sampling_ratio=-1,
                    aligned=aligned,
                )
                qy = ops.roi_align(
                    qx,
                    qrois,
                    output_size=pool_size,
                    spatial_scale=1,
                    sampling_ratio=-1,
                    aligned=aligned,
                )

                # qy is itself quantized, so information may have been lost
                # when it was produced. Quantize y as well for a fair
                # comparison.
                quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype)

                # Ideally the two outputs match exactly, which is the case
                # with (scale, zero) == (1, 0).
                if (qy == quantized_float_y).all():
                    continue

                # The two RoIAlign procedures do not compute exactly the same
                # thing, so rounding may shift an output by one quantization
                # step. For example with (scale, zero) = (2, 10), 45.00000...
                # is quantized to 44 but 45.00000001 is rounded to 46.
                # Check that:
                # - such discrepancies are very rare (less than 5%)
                # - any difference between the two outputs is == scale
                mismatch_idx = torch.where(qy != quantized_float_y)
                num_mismatch = mismatch_idx[0].numel()
                self.assertTrue(num_mismatch / qy.numel() < .05)

                q_vals = qy[mismatch_idx].dequantize()
                ref_vals = quantized_float_y[mismatch_idx].dequantize()
                abs_diff = torch.abs(q_vals - ref_vals)
                expected_diff = torch.full_like(abs_diff, fill_value=scale)
                torch.testing.assert_close(abs_diff, expected_diff, rtol=1e-5, atol=1e-5)

    # A quantized input holding more than one image must be rejected.
    x = torch.randint(50, 100, size=(2, 3, 10, 10)).to(dtype)
    qx = torch.quantize_per_tensor(x, scale=1, zero_point=0, dtype=torch.qint8)
    rois = make_rois(10)
    qrois = torch.quantize_per_tensor(rois, scale=1, zero_point=0, dtype=torch.qint8)
    with self.assertRaisesRegex(RuntimeError, "Only one image per batch is allowed"):
        ops.roi_align(qx, qrois, output_size=pool_size)
def forward(self, images: torch.Tensor, boxes_coordinate: torch.Tensor, transcripts: torch.Tensor,
            src_key_padding_mask: torch.Tensor):
    '''
    Fuse the visual embedding of every text segment with its transcript embedding.

    :param images: whole images as a 4-D tensor whose last two axes are height and width,
            i.e. (B, C, H, W), where B is batch size and C is the number of image channels.
    :param boxes_coordinate: boxes coordinate, shape is (B, N, 8), where N is the number of segments
            per document and 8 is coordinates (x1, y1, x2, y2, x3, y3, x4, y4). Only (x1, y1) and
            (x3, y3) are used, as the two corners of the region of interest.
    :param transcripts: text segments, shape is (B, N, T, D), where T is the max length of transcripts,
            D is dimension of model.
    :param src_key_padding_mask: text padding mask, shape is (B*N, T), True for padding value.
            If provided, specified padding elements in the key will be ignored by the attention.
            This is a binary mask. When the value is True, the corresponding value on the attention
            layer of Transformer will be filled with -inf.
    :return: set of nodes X, shape is (B*N, T, D)
    '''
    B, N, T, D = transcripts.shape

    # Height of the input image, needed to map box coordinates onto the feature map.
    origin_H = images.shape[2]

    # image embedding produced by the cnn backbone
    images = self.cnn(images)
    H = images.shape[2]

    # rois for roi pooling, shape (B, N, 5); 5 means (batch_index, x0, y0, x1, y1)
    rois_batch = torch.zeros(B, N, 5, device=images.device)
    rois_batch[:, :, 1:5] = boxes_coordinate[:, :, [0, 1, 4, 5]]
    rois_batch[:, :, 0] = torch.arange(B, device=images.device).view(B, 1)

    # The height ratio is used for both axes.
    spatial_scale = float(H / origin_H)

    # use roi pooling to get image segments
    # (B*N, C, roi_pooling_size, roi_pooling_size)
    if self.roi_pooling_mode == 'roi_align':
        image_segments = roi_align(images, rois_batch.view(-1, 5), self.roi_pooling_size, spatial_scale)
    else:
        image_segments = roi_pool(images, rois_batch.view(-1, 5), self.roi_pooling_size, spatial_scale)

    # (B*N, D, 1, 1)
    image_segments = F.relu(self.bn(self.conv(image_segments)))

    # (B*N, D)
    # Squeeze only the two spatial axes. An argument-less squeeze() would also
    # remove the B*N axis when there is a single segment (or the D axis when
    # D == 1) and break the expand_as() below.
    image_segments = image_segments.squeeze(3).squeeze(2)

    # (B*N, 1, D)
    image_segments = image_segments.unsqueeze(dim=1)

    # add positional embedding
    transcripts_segments = self.pe_droput(
        transcripts + self.position_embedding[:, :, :transcripts.size(2), :])
    # (B*N, T, D)
    transcripts_segments = transcripts_segments.reshape(B * N, T, D)

    # (B*N, T, D)
    image_segments = image_segments.expand_as(transcripts_segments)

    # here we first add image embedding and text embedding together,
    # then as the input of transformer to get a non-local fusion features, different from paper process.
    out = image_segments + transcripts_segments

    # (T, B*N, D)
    out = out.transpose(0, 1).contiguous()

    # (T, B*N, D)
    out = self.transformer_encoder(
        out, src_key_padding_mask=src_key_padding_mask)

    # (B*N, T, D)
    out = out.transpose(0, 1).contiguous()
    out = self.norm(out)
    out = F.dropout(out, p=self.dropout, training=self.training)

    return out
def forward(self, input):
    """Pool RoI features at every scale and score them.

    Args:
        input: a tuple whose layout depends on ``self.training``.
            Training: ``(intmd_fea, image, flang, bbox, pred_anchor, args)``.
            Otherwise: ``(intmd_fea, image, flang, seg_bbox, args)``.
            ``intmd_fea`` is indexed per scale, ``flang`` per image.

    Returns:
        Training: ``(cam, cam_rv, bi_score, scores)``; otherwise
        ``(cam, cam_rv, bi_score)``. ``cam``, ``cam_rv`` and ``bi_score``
        are lists with one entry per scale; ``scores`` holds the sampled
        RoI labels (1 = foreground, 0 = background).

    Raises:
        ValueError: in training, if an image has neither foreground nor
            background candidate RoIs.
    """
    # args = get_args()
    if self.training:
        (intmd_fea, image, flang, bbox, pred_anchor, args) = input
        anchors_full = get_archors_full(args)
        batch_size = args.batch_size
        # n_neg=3
        roi_feat_all = []
        scores = []
        # iou_all=best_n_list
        roi_batch_all = []
        label_batch_all = []
        lang_all = []
        # A RoI is foreground when its best overlap is >= FG_THRESH and
        # background when it lies in [BG_THRESH_LO, BG_THRESH_HI).
        FG_THRESH = 0.5
        BG_THRESH_HI = 0.5
        BG_THRESH_LO = 0.00
        # Per image and per scale, 8 RoIs are sampled; 2 of them are
        # foreground when both kinds are available.
        fg_rois_per_image = 2
        rois_per_image = 8
        for scale_ii in range(len(pred_anchor)):
            # grid: args.size divided by the stride; grid_size: the stride
            # itself (32, 16, 8, ... as scale_ii grows).
            grid, grid_size = args.size // (32 // (2**scale_ii)), 32 // (
                2**scale_ii)
            # Three consecutive anchors belong to each scale.
            anchor_idxs = [x + 3 * scale_ii for x in [0, 1, 2]]
            anchors = [anchors_full[i] for i in anchor_idxs]
            # scaled_anchors = torch.from_numpy(np.asarray([(x[0] / (args.anchor_imsize / grid), \
            #     x[1] / (args.anchor_imsize / grid)) for x in anchors])).float()
            ws = np.asarray([
                np.round(x[0] * grid_size / (args.anchor_imsize / grid))
                for x in anchors
            ])
            hs = np.asarray([
                np.round(x[1] * grid_size / (args.anchor_imsize / grid))
                for x in anchors
            ])
            x_ctr, y_ctr = (grid_size - 1) * 0.5, (grid_size - 1) * 0.5
            scaled_anchors = torch.from_numpy(
                _mkanchors(ws, hs, x_ctr, y_ctr)).float().cuda()
            # Keep the first four channels of the prediction as box deltas.
            bbox_deltas = pred_anchor[scale_ii][:, :, :4, :, :]
            feat_height, feat_width = grid, grid
            # One shift per grid cell, spaced grid_size apart along each axis.
            shift_x = np.arange(0, feat_width) * grid_size
            shift_y = np.arange(0, feat_height) * grid_size
            shift_x, shift_y = np.meshgrid(shift_x, shift_y)
            shifts = torch.from_numpy(
                np.vstack(
                    (shift_x.ravel(), shift_y.ravel(), shift_x.ravel(),
                     shift_y.ravel())).transpose())
            shifts = shifts.contiguous().type_as(bbox_deltas).float()
            A = 3
            K = shifts.size(0)
            # self._anchors = self._anchors.type_as(scores)
            # anchors = self._anchors.view(1, A, 4) + shifts.view(1, K, 4).permute(1, 0, 2).contiguous()
            # Broadcast the A base anchors over the K shifts and share the
            # resulting K * A anchors across the batch.
            anchors = scaled_anchors.view(1, A, 4) + shifts.view(K, 1, 4)
            anchors = anchors.view(1, K * A, 4).expand(batch_size, K * A, 4)
            bbox_deltas = bbox_deltas.permute(0, 1, 3, 4, 2).contiguous()
            bbox_deltas = bbox_deltas.view(batch_size, -1, 4)
            proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size,
                                           grid_size)
            # xyxy
            proposals = clip_boxes(proposals, args.size, batch_size)
            gt_boxes = bbox.clone().unsqueeze(1)  #xyxy
            # gt_boxes_append = gt_boxes.new(gt_boxes.size()).zero_()
            # gt_boxes_append[:, :, 1:5] = gt_boxes[:, :, :4]
            # Include ground-truth boxes in the set of candidate rois
            all_rois = torch.cat([proposals, gt_boxes], 1)
            overlaps = bbox_overlaps_batch(all_rois, gt_boxes)
            max_overlaps, gt_assignment = torch.max(overlaps, 2)
            batch_size = overlaps.size(0)
            num_proposal = overlaps.size(1)
            num_boxes_per_img = overlaps.size(2)
            offset = torch.arange(0, batch_size) * gt_boxes.size(1)
            offset = offset.view(-1, 1).type_as(gt_assignment) + gt_assignment
            # `labels` is a view into gt_boxes, so the fill below also
            # overwrites column 3 of gt_boxes. all_rois and overlaps were
            # already computed from the unmodified values above.
            labels = gt_boxes[:, :, 3]
            labels[:, :] = 1.
            labels = labels.contiguous().view(-1)[offset.view(-1)] \
                .view(batch_size, -1)
            # labels = torch.ones(batch_size,1).cuda()
            # roi_size=(scale_ii+1)*7
            labels_batch = labels.new(batch_size, rois_per_image).zero_()
            # Each RoI row is (batch_index, x1, y1, x2, y2).
            rois_batch = all_rois.new(batch_size, rois_per_image, 5).zero_()
            lang_batch = []
            for i in range(batch_size):
                fg_inds = torch.nonzero(
                    max_overlaps[i] >= FG_THRESH).view(-1)
                fg_num_rois = fg_inds.numel()
                # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
                bg_inds = torch.nonzero(
                    (max_overlaps[i] < BG_THRESH_HI)
                    & (max_overlaps[i] >= BG_THRESH_LO)).view(-1)
                bg_num_rois = bg_inds.numel()
                if fg_num_rois > 0 and bg_num_rois > 0:
                    # sampling fg
                    fg_rois_per_this_image = fg_rois_per_image  #min(fg_rois_per_image, fg_num_rois)
                    # torch.randperm seems has a bug on multi-gpu setting that cause the segfault.
                    # See https://github.com/pytorch/pytorch/issues/1868 for more details.
                    # use numpy instead.
                    # rand_num = torch.randperm(fg_num_rois).long().cuda()
                    if fg_rois_per_image < fg_num_rois:
                        # Enough candidates: draw without replacement.
                        rand_num = torch.from_numpy(
                            np.random.permutation(fg_num_rois)).type_as(
                                gt_boxes).long()
                        fg_inds = fg_inds[
                            rand_num[:fg_rois_per_this_image]]
                    else:
                        # Too few candidates: draw with replacement so the
                        # foreground quota is still filled.
                        rand_num = torch.from_numpy(
                            np.random.choice(
                                fg_num_rois,
                                fg_rois_per_image,
                                replace=True)).type_as(gt_boxes).long()
                        fg_inds = fg_inds[rand_num]
                    # sampling bg
                    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
                    # Seems torch.rand has a bug, it will generate very large number and make an error.
                    # We use numpy rand instead.
                    # rand_num = (torch.rand(bg_rois_per_this_image) * bg_num_rois).long().cuda()
                    rand_num = np.floor(
                        np.random.rand(bg_rois_per_this_image) * bg_num_rois)
                    rand_num = torch.from_numpy(rand_num).type_as(
                        gt_boxes).long()
                    bg_inds = bg_inds[rand_num]
                elif fg_num_rois > 0 and bg_num_rois == 0:
                    # sampling fg
                    # rand_num = torch.floor(torch.rand(rois_per_image) * fg_num_rois).long().cuda()
                    rand_num = np.floor(
                        np.random.rand(rois_per_image) * fg_num_rois)
                    rand_num = torch.from_numpy(rand_num).type_as(
                        gt_boxes).long()
                    fg_inds = fg_inds[rand_num]
                    fg_rois_per_this_image = rois_per_image
                    bg_rois_per_this_image = 0
                elif bg_num_rois > 0 and fg_num_rois == 0:
                    # sampling bg
                    # rand_num = torch.floor(torch.rand(rois_per_image) * bg_num_rois).long().cuda()
                    rand_num = np.floor(
                        np.random.rand(rois_per_image) * bg_num_rois)
                    rand_num = torch.from_numpy(rand_num).type_as(
                        gt_boxes).long()
                    bg_inds = bg_inds[rand_num]
                    bg_rois_per_this_image = rois_per_image
                    fg_rois_per_this_image = 0
                else:
                    raise ValueError(
                        "bg_num_rois = 0 and fg_num_rois = 0, this should not happen!"
                    )
                # The indices that we're selecting (both fg and bg)
                keep_inds = torch.cat([fg_inds, bg_inds], 0)
                # Select sampled values from various arrays:
                labels_batch[i].copy_(labels[i][keep_inds])
                # Clamp labels for the background RoIs to 0
                if fg_rois_per_this_image < rois_per_image:
                    labels_batch[i][fg_rois_per_this_image:] = 0
                rois_batch[i, :, 1:] = all_rois[i][keep_inds]
                rois_batch[i, :, 0] = i
                # Repeat the image's language feature once per sampled RoI.
                lang_batch.append(torch.stack([flang[i]] * rois_per_image))
            roi_batch_all.append(rois_batch)
            label_batch_all.append(labels_batch)
            lang_all.append(torch.stack(lang_batch))
        # for i in range(batch_size):
        # Merge the RoIs, labels and language features sampled at every scale.
        roi_batch_all = torch.cat(roi_batch_all)
        label_batch_all = torch.cat(label_batch_all)
        flang = torch.cat(lang_all)
        for scale_ii in range(len(intmd_fea)):
            grid, grid_size = args.size // (32 // (2**scale_ii)), 32 // (
                2**scale_ii)
            # The pooled output grows with the scale index: 7, 14, 21, ...
            roi_size = (scale_ii + 1) * 7
            feat_map = intmd_fea[scale_ii]
            # roi_scale=torch.cat([roi_batch_all.view(-1, 5)[:,0].unsqueeze(1),roi_batch_all.view(-1, 5)[:,1:]/grid_size],dim=1)
            # The feature map is pooled with spatial_scale 1 / grid_size; the
            # image is pooled with roi_align's default spatial_scale.
            roi_feat = roi_align(feat_map, roi_batch_all.view(-1, 5),
                                 [roi_size, roi_size], 1. / grid_size)
            roi_img = roi_align(image, roi_batch_all.view(-1, 5),
                                [roi_size, roi_size])
            roi_feat_all.append(torch.cat([roi_img, roi_feat], dim=1))
            scores.append(label_batch_all.view(-1))
        cam, cam_rv, bi_score = [], [], []
        for ii in range(len(roi_feat_all)):
            # output=self.fcn_out._modules[str(ii)](roi_feat_all[ii])
            emb = self.fcn_emb(roi_feat_all[ii])
            output = self.fcn_out(emb)
            cam.append(output)
            cam_rv.append(self.PCM(output, emb, flang))
            # Global average pooling of the map gives one score per RoI.
            bi_score.append(
                F.adaptive_avg_pool2d(cam[ii], (1, 1)).squeeze())
        return cam, cam_rv, bi_score, scores
    else:
        (intmd_fea, image, flang, seg_bbox, args) = input
        batch_size = seg_bbox.size(0)
        # feats = seg_bbox.unsqueeze(0)
        # One RoI per image: (batch_index, box coordinates from seg_bbox).
        rois_batch = seg_bbox.new(batch_size, 5).zero_()
        for ii in range(batch_size):
            rois_batch[ii, 1:] = seg_bbox[ii]
            rois_batch[ii, 0] = ii
        roi_feat_all = []
        for scale_ii in range(len(intmd_fea)):
            grid, grid_size = args.size // (32 // (2**scale_ii)), 32 // (
                2**scale_ii)
            roi_size = (scale_ii + 1) * 7
            # for ii in range(batch_size):
            #[x.unsqueeze(0) for x in seg_bbox[scale_ii]]
            feat_map = intmd_fea[scale_ii]
            roi_feat = roi_align(feat_map, rois_batch, [roi_size, roi_size],
                                 1. / grid_size)
            roi_img = roi_align(image, rois_batch, [roi_size, roi_size])
            roi_feat_all.append(torch.cat([roi_img, roi_feat], dim=1))
        cam, cam_rv, bi_score = [], [], []
        for ii in range(len(roi_feat_all)):
            # output=self.fcn_out._modules[str(ii)](roi_feat_all[ii])
            emb = self.fcn_emb(roi_feat_all[ii])
            output = self.fcn_out(emb)
            cam.append(output)
            cam_rv.append(self.PCM(output, emb, flang))
            bi_score.append(
                F.adaptive_avg_pool2d(cam[ii], (1, 1)).squeeze())
        return cam, cam_rv, bi_score
def forward(self, x, boxes, image_shapes):
    # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]]) -> Tensor
    """
    Pool every box from the feature level selected for it.

    Arguments:
        x (OrderedDict[Tensor]): feature maps for each level. They are assumed to have
            all the same number of channels, but they can have different sizes.
        boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
            (x1, y1, x2, y2) format and in the image reference size, not the feature map
            reference.
        image_shapes (List[Tuple[height, width]]): the sizes of each image before they
            have been fed to a CNN to obtain feature maps. This allows us to infer the
            scale factor for each one of the levels to be pooled.
    Returns:
        result (Tensor)
    """
    # Keep only the feature maps this pooler was configured to use.
    x_filtered = []
    for k, v in x.items():
        if k in self.featmap_names:
            x_filtered.append(v)
    num_levels = len(x_filtered)
    rois = self.convert_to_roi_format(boxes)

    # The per-level scales are computed on first use and then reused.
    if self.scales is None:
        self.setup_scales(x_filtered, image_shapes)

    scales = self.scales
    assert scales is not None

    # A single level needs no box-to-level assignment.
    if num_levels == 1:
        return roi_align(
            x_filtered[0], rois,
            output_size=self.output_size,
            spatial_scale=scales[0],
            sampling_ratio=self.sampling_ratio
        )

    mapper = self.map_levels
    assert mapper is not None

    levels = mapper(boxes)

    num_rois = len(rois)
    num_channels = x_filtered[0].shape[1]

    dtype, device = x_filtered[0].dtype, x_filtered[0].device
    result = torch.zeros(
        (num_rois, num_channels,) + self.output_size,
        dtype=dtype,
        device=device,
    )

    tracing_results = []
    for level, (per_level_feature, scale) in enumerate(zip(x_filtered, scales)):
        idx_in_level = torch.nonzero(levels == level).squeeze(1)
        rois_per_level = rois[idx_in_level]

        result_idx_in_level = roi_align(
            per_level_feature, rois_per_level,
            output_size=self.output_size,
            spatial_scale=scale, sampling_ratio=self.sampling_ratio)

        if torchvision._is_tracing():
            tracing_results.append(result_idx_in_level.to(dtype))
        else:
            # `result` takes its dtype from the first level while
            # `result_idx_in_level` takes it from the current one. With
            # autocast active, different layers may emit different dtypes.
            # Cast explicitly: the assignment is in-place and autocast only
            # affects out-of-place ops.
            result[idx_in_level] = result_idx_in_level.to(result.dtype)

    if torchvision._is_tracing():
        result = _onnx_merge_levels(levels, tracing_results)

    return result
def _PyramidRoI_Feat(self, feat_maps, rois, im_info):
    ''' roi pool on pyramid feature maps

    Each RoI is assigned to one pyramid level in [2, 5] according to its
    size, pooled from ``feat_maps[level - 2]``, and the pooled features are
    returned in the original RoI order.

    Args:
        feat_maps: sequence of feature maps; index 0 serves level 2,
            index 3 serves level 5.
        rois: tensor whose rows are read as (batch_index, x1, y1, x2, y2).
        im_info: image metadata; ``im_info[0][0]`` is used as the image
            extent that ``feat_maps[i].size(2)`` is compared with to get
            the spatial scale.

    Returns:
        The pooled RoI features, one row per RoI.

    Raises:
        ValueError: if ``cfg.POOLING_MODE`` is not 'crop', 'align' or 'pool'.
    '''
    # do roi pooling based on predicted rois
    h = rois.data[:, 4] - rois.data[:, 2] + 1
    w = rois.data[:, 3] - rois.data[:, 1] + 1
    # NOTE(review): the FPN paper assigns levels with floor(k0 + log2(...));
    # this uses the natural log with rounding. Left unchanged because
    # altering it would change level assignment for existing checkpoints.
    roi_level = torch.log(torch.sqrt(h * w) / 224.0)
    roi_level = torch.round(roi_level + 4)
    roi_level[roi_level < 2] = 2
    roi_level[roi_level > 5] = 5
    # roi_level.fill_(5)

    if cfg.POOLING_MODE == 'crop':
        # NOTE: need to add pyramid
        # NOTE(review): `base_feat` is not defined inside this method, so
        # this branch only works if it resolves at module level - confirm.
        grid_xy = _affine_grid_gen(rois, base_feat.size()[2:], self.grid_size)
        grid_yx = torch.stack(
            [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
            3).contiguous()
        roi_pool_feat = self.RCNN_roi_crop(base_feat,
                                           Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            roi_pool_feat = F.max_pool2d(roi_pool_feat, 2, 2)

    elif cfg.POOLING_MODE in ('align', 'pool'):
        use_align = cfg.POOLING_MODE == 'align'
        roi_pool_feats = []
        box_to_levels = []
        for i, l in enumerate(range(2, 6)):
            # view(-1) keeps the index 1-D even when a single RoI lands on
            # this level. A bare squeeze() would give a 0-dim index, which
            # drops the RoI axis in rois[idx_l] and cannot be concatenated.
            idx_l = (roi_level == l).nonzero().view(-1)
            if idx_l.numel() == 0:
                continue
            box_to_levels.append(idx_l)
            scale = feat_maps[i].size(2) / im_info[0][0]
            if use_align:
                feat = roi_align(feat_maps[i], rois[idx_l],
                                 (cfg.POOLING_SIZE, cfg.POOLING_SIZE),
                                 spatial_scale=scale, sampling_ratio=0)
            else:
                feat = self.RCNN_roi_pool(feat_maps[i], rois[idx_l], scale)
            roi_pool_feats.append(feat)

        roi_pool_feat = torch.cat(roi_pool_feats, 0)
        box_to_level = torch.cat(box_to_levels, 0)
        # Features were gathered level by level; sorting the gathered RoI
        # indices restores the original RoI order.
        _, order = torch.sort(box_to_level)
        roi_pool_feat = roi_pool_feat[order]

    else:
        raise ValueError(
            'Unsupported cfg.POOLING_MODE: {!r}'.format(cfg.POOLING_MODE))

    return roi_pool_feat