def convert_boxlist(maskrcnn_boxlist):
    """Rebuild a BoxList from `maskrcnn_boxlist`, copying tensor, size, mode
    and every extra field onto the new instance."""
    converted = BoxList(
        maskrcnn_boxlist.bbox, maskrcnn_boxlist.size, maskrcnn_boxlist.mode
    )
    for field_name in maskrcnn_boxlist.fields():
        converted.add_field(field_name, maskrcnn_boxlist.get_field(field_name))
    return converted
def forward_for_single_feature_map(self, anchors, objectness, box_regression):
    """Select top-scoring RPN proposals for one feature-map level.

    Arguments:
        anchors: list[BoxList], one per image in the batch
        objectness: tensor of size N, A, H, W
        box_regression: tensor of size N, A * 4, H, W

    Returns:
        list[BoxList]: per-image proposals with an "objectness" score field,
        clipped to the image, small boxes removed, then NMS'd down to at most
        `self.post_nms_top_n` boxes.
    """
    device = objectness.device
    N, A, H, W = objectness.shape

    # put in the same format as anchors: flatten (A, H, W) per image so a
    # single top-k can rank all anchors of this level together
    objectness = objectness.permute(0, 2, 3, 1).reshape(N, -1)
    objectness = objectness.sigmoid()
    box_regression = box_regression.view(N, -1, 4, H, W).permute(0, 3, 4, 1, 2)
    box_regression = box_regression.reshape(N, -1, 4)

    # keep at most pre_nms_top_n candidates per image before NMS
    num_anchors = A * H * W
    pre_nms_top_n = min(self.pre_nms_top_n, num_anchors)
    objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True)

    # gather the regression deltas and anchors of the selected candidates
    batch_idx = torch.arange(N, device=device)[:, None]
    box_regression = box_regression[batch_idx, topk_idx]
    image_shapes = [box.size for box in anchors]
    concat_anchors = torch.cat([a.bbox for a in anchors], dim=0)
    concat_anchors = concat_anchors.reshape(N, -1, 4)[batch_idx, topk_idx]

    # decode deltas against their anchors into absolute xyxy proposals
    proposals = self.box_coder.decode(box_regression.view(-1, 4), concat_anchors.view(-1, 4))
    proposals = proposals.view(N, -1, 4)

    result = []
    for proposal, score, im_shape in zip(proposals, objectness, image_shapes):
        boxlist = BoxList(proposal, im_shape, mode="xyxy")
        boxlist.add_field("objectness", score)
        # clip only (remove_empty=False): degenerate boxes are dropped by
        # remove_small_boxes below instead
        boxlist = boxlist.clip_to_image(remove_empty=False)
        boxlist = remove_small_boxes(boxlist, self.min_size)
        boxlist = boxlist_nms(
            boxlist,
            self.nms_thresh,
            max_proposals=self.post_nms_top_n,
            score_field="objectness",
        )
        result.append(boxlist)
    return result
def get_objects(self, idx, im_w, im_h):
    """Return detected object boxes (with a "scores" field) for sample `idx`,
    or an empty BoxList when no detections are available."""
    null_boxes = self.return_null_box(im_w, im_h)
    if not hasattr(self, 'det_objects'):
        return null_boxes
    boxes, box_score = self.det_objects[idx]
    if len(box_score) == 0:
        return null_boxes
    # detections are stored as xywh; convert to the xyxy convention
    coords = torch.as_tensor(boxes).reshape(-1, 4)
    obj_boxes = BoxList(coords, (im_w, im_h), mode="xywh").convert("xyxy")
    obj_boxes.add_field("scores", torch.as_tensor(box_score))
    return obj_boxes
def prepare_pooled_feature(x_pooled, boxes, detach=True):
    """Split the stacked pooled features back per image and attach each chunk
    to a fresh BoxList as the "pooled_feature" field.

    `x_pooled` rows are assumed to be ordered image-by-image, matching the
    concatenation order of `boxes`. When `detach` is True the features are
    detached from the autograd graph first.
    """
    if detach:
        x_pooled = x_pooled.detach()
    counts = [len(b) for b in boxes]
    feature_chunks = x_pooled.split(counts, dim=0)
    results = []
    for chunk, source_box in zip(feature_chunks, boxes):
        out = BoxList(source_box.bbox, source_box.size, mode="xyxy")
        out.add_field("pooled_feature", chunk)
        results.append(out)
    return results
def construct_mm_proposals(imgs):
    """Build one full-frame proposal per entry of `imgs` for memory mimicking.

    NOTE(review): as in the original, the list repeats a single shared BoxList
    object — safe only if callers never mutate the proposals in place.
    """
    dim2, dim3 = imgs.shape[2], imgs.shape[3]
    full_frame = torch.tensor(
        [[0., 0., dim2, dim3]], dtype=torch.float32, device=imgs.device
    )
    proposal = BoxList(full_frame, [imgs.size(2), imgs.size(3)])
    return [proposal] * len(imgs)
def __getitem__(self, idx):
    """Load one COCO sample: the image, a BoxList target carrying "labels"
    and "masks" fields, and the dataset index."""
    img, anno = super(COCODataset, self).__getitem__(idx)

    # filter crowd annotations
    # TODO might be better to add an extra field
    anno = [obj for obj in anno if obj["iscrowd"] == 0]

    # reshape guards against the no-boxes case
    box_coords = torch.as_tensor([obj["bbox"] for obj in anno]).reshape(-1, 4)
    target = BoxList(box_coords, img.size, mode="xywh").convert("xyxy")

    labels = torch.tensor(
        [self.json_category_id_to_contiguous_id[obj["category_id"]] for obj in anno]
    )
    target.add_field("labels", labels)

    segmentations = SegmentationMask([obj["segmentation"] for obj in anno], img.size)
    target.add_field("masks", segmentations)

    target = target.clip_to_image(remove_empty=True)

    if self.transforms is not None:
        img, target = self.transforms(img, target)

    return img, target, idx
def prepare_boxlist(self, boxes, scores, image_shape):
    """Flatten per-class detections into a single BoxList.

    `boxes` has shape (#detections, 4 * #classes): each row holds one box per
    object class (background included) originating from the same proposal.
    `scores` has shape (#detections, #classes), with `scores[i, j]`
    corresponding to the box `boxes[i, j * 4:(j + 1) * 4]`.

    Both are flattened so that row k of the result pairs one class-specific
    box with its confidence, stored in the "scores" field.
    """
    flat_boxes = boxes.reshape(-1, 4)
    flat_scores = scores.reshape(-1)
    result = BoxList(flat_boxes, image_shape, mode="xyxy")
    result.add_field("scores", flat_scores)
    return result
def __getitem__(self, idx): _, clip_info = self.clips_info[idx] # mov_id is the id in self.movie_info mov_id, timestamp = clip_info # movie_id is the human-readable youtube id. movie_id, movie_size = self.movie_info[mov_id] video_data = self._decode_video_data(movie_id, timestamp) im_w, im_h = movie_size if self.det_persons is None: # Note: During training, we only use gt. Thus we should not provide box file, # otherwise we will use only box file instead. boxes, packed_act = self.anns[idx] boxes_tensor = torch.as_tensor(boxes, dtype=torch.float32).reshape( -1, 4) # guard against no boxes boxes = BoxList(boxes_tensor, (im_w, im_h), mode="xywh").convert("xyxy") # Decode the packed bits from uint8 to one hot, since AVA has 80 classes, # it can be exactly denoted with 10 bytes, otherwise we may need to discard some bits. one_hot_label = np.unpackbits(packed_act, axis=1) one_hot_label = torch.as_tensor(one_hot_label, dtype=torch.uint8) boxes.add_field("labels", one_hot_label) else: boxes, box_score = self.det_persons[idx] boxes_tensor = torch.as_tensor(boxes).reshape(-1, 4) boxes = BoxList(boxes_tensor, (im_w, im_h), mode="xywh").convert("xyxy") boxes = boxes.clip_to_image(remove_empty=True) # extra fields extras = {} if self.transforms is not None: video_data, boxes, transform_randoms = self.transforms( video_data, boxes) slow_video, fast_video = video_data objects = None if self.det_objects is not None: objects = self.get_objects(idx, im_w, im_h) if self.object_transforms is not None: objects = self.object_transforms(objects, transform_randoms) # add infos neccessary for memory feature extras["movie_id"] = movie_id extras["timestamp"] = timestamp return slow_video, fast_video, boxes, objects, extras, idx return video_data, boxes, idx, movie_id, timestamp
def forward(self, x, boxes):
    """
    Arguments:
        x (Tensor): the mask logits
        boxes (list[BoxList]): bounding boxes that are used as
            reference, one for each image

    Returns:
        results (list[BoxList]): one BoxList for each image, containing
            the extra field mask
    """
    mask_prob = x.sigmoid()

    # keep only the mask channel matching each box's predicted class
    labels = torch.cat([b.get_field("labels") for b in boxes])
    row_idx = torch.arange(x.shape[0], device=labels.device)
    mask_prob = mask_prob[row_idx, labels][:, None]

    # split the flat batch of masks back into per-image chunks
    per_image_counts = [len(b) for b in boxes]
    mask_prob = mask_prob.split(per_image_counts, dim=0)

    if self.masker:
        mask_prob = self.masker(mask_prob, boxes)

    results = []
    for prob, source_box in zip(mask_prob, boxes):
        out = BoxList(source_box.bbox, source_box.size, mode="xyxy")
        for field in source_box.fields():
            out.add_field(field, source_box.get_field(field))
        out.add_field("mask", prob)
        results.append(out)
    return results
def forward(self, image_list, feature_maps):
    """Generate anchors for every image at every feature-map level.

    Returns a list (per image) of lists (per level) of BoxLists, each tagged
    with visibility via `add_visibility_to`.
    """
    grid_sizes = [fmap.shape[-2:] for fmap in feature_maps]
    per_level_anchors = self.grid_anchors(grid_sizes)
    anchors = []
    for image_height, image_width in image_list.image_sizes:
        per_image = []
        for level_anchors in per_level_anchors:
            # BoxList size is (width, height), the reverse of image_sizes
            boxlist = BoxList(level_anchors, (image_width, image_height), mode="xyxy")
            self.add_visibility_to(boxlist)
            per_image.append(boxlist)
        anchors.append(per_image)
    return anchors
def get_groundtruth(self, index):
    """Parse the VOC XML annotation for `index` into an xyxy BoxList with
    "labels" and "difficult" fields."""
    img_id = self.ids[index]
    parsed = self._preprocess_annotation(
        ET.parse(self._annopath % img_id).getroot()
    )
    height, width = parsed["im_info"]
    target = BoxList(parsed["boxes"], (width, height), mode="xyxy")
    target.add_field("labels", parsed["labels"])
    target.add_field("difficult", parsed["difficult"])
    return target
def _get_target(self, index):
    """Parse one VOC annotation file into (BoxList, list of class names)."""
    ann = self.parse_voc_xml(ET.parse(self.annotations[index]).getroot())
    objs = ann['annotation']['object']
    size = ann['annotation']['size']
    # single-object files parse to a dict rather than a list
    if not isinstance(objs, list):
        objs = [objs]
    bboxs = []
    classes = []
    for obj in objs:
        box = obj['bndbox']
        bboxs.append(
            (int(box['xmin']), int(box['ymin']), int(box['xmax']), int(box['ymax']))
        )
        classes.append(obj['name'])
    target_raw = BoxList(
        bboxs, (int(size['width']), int(size['height'])), mode='xyxy'
    )
    return target_raw, classes
def filter_results(self, boxlist, num_classes): """Returns bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). """ # unwrap the boxlist to avoid additional overhead. # if we had multi-class NMS, we could perform this directly on the boxlist boxes = boxlist.bbox.reshape(-1, num_classes * 4) scores = boxlist.get_field("scores").reshape(-1, num_classes) device = scores.device result = [] # Apply threshold on detection probabilities and apply NMS # Skip j = 0, because it's the background class inds_all = scores > self.score_thresh for j in range(1, num_classes): inds = inds_all[:, j].nonzero().squeeze(1) scores_j = scores[inds, j] boxes_j = boxes[inds, j * 4:(j + 1) * 4] boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") boxlist_for_class.add_field("scores", scores_j) boxlist_for_class = boxlist_nms(boxlist_for_class, self.nms, score_field="scores") num_labels = len(boxlist_for_class) boxlist_for_class.add_field( "labels", torch.full((num_labels, ), j, dtype=torch.int64, device=device)) result.append(boxlist_for_class) result = cat_boxlist(result) number_of_detections = len(result) # Limit to max_per_image detections **over all classes** if number_of_detections > self.detections_per_img > 0: cls_scores = result.get_field("scores") image_thresh, _ = torch.kthvalue( cls_scores.cpu(), number_of_detections - self.detections_per_img + 1) keep = cls_scores >= image_thresh.item() keep = torch.nonzero(keep).squeeze(1) result = result[keep] return result
def evaluate_box_proposals(
    predictions, dataset, thresholds=None, area="all", limit=None
):
    """Evaluate detection proposal recall metrics. This function is a much
    faster alternative to the official COCO API recall evaluation code.
    However, it produces slightly different results.

    Arguments:
        predictions: per-image BoxLists with an "objectness" field
        dataset: COCO-style dataset providing `id_to_img_map` and `coco`
        thresholds: IoU thresholds to evaluate (default: 0.5..0.95 step 0.05)
        area: one of the keys of `areas` below, restricting gt box sizes
        limit: if set, evaluate at most this many top proposals per image

    Returns a dict with average recall ("ar"), per-threshold recalls, the
    thresholds, the sorted gt overlaps, and the number of positives.
    """
    # Record max overlap value for each gt box
    # Return vector of overlap values
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0 ** 2, 1e5 ** 2],  # all
        [0 ** 2, 32 ** 2],  # small
        [32 ** 2, 96 ** 2],  # medium
        [96 ** 2, 1e5 ** 2],  # large
        [96 ** 2, 128 ** 2],  # 96-128
        [128 ** 2, 256 ** 2],  # 128-256
        [256 ** 2, 512 ** 2],  # 256-512
        [512 ** 2, 1e5 ** 2],  # 512-inf
    ]
    assert area in areas, "Unknown area range: {}".format(area)
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0
    for image_id, prediction in enumerate(predictions):
        original_id = dataset.id_to_img_map[image_id]
        # TODO replace with get_img_info?
        image_width = dataset.coco.imgs[original_id]["width"]
        image_height = dataset.coco.imgs[original_id]["height"]
        prediction = prediction.resize((image_width, image_height))
        # sort predictions in descending order
        # TODO maybe remove this and make it explicit in the documentation
        inds = prediction.get_field("objectness").sort(descending=True)[1]
        prediction = prediction[inds]
        ann_ids = dataset.coco.getAnnIds(imgIds=original_id)
        anno = dataset.coco.loadAnns(ann_ids)
        gt_boxes = [obj["bbox"] for obj in anno if obj["iscrowd"] == 0]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
        gt_boxes = BoxList(gt_boxes, (image_width, image_height), mode="xywh").convert(
            "xyxy"
        )
        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])
        if len(gt_boxes) == 0:
            continue
        # keep only gt boxes whose annotated area falls in the requested range
        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]
        num_pos += len(gt_boxes)
        if len(gt_boxes) == 0:
            continue
        if len(prediction) == 0:
            continue
        if limit is not None and len(prediction) > limit:
            prediction = prediction[:limit]
        overlaps = boxlist_iou(prediction, gt_boxes)
        _gt_overlaps = torch.zeros(len(gt_boxes))
        # greedy one-to-one matching: each round picks the best remaining
        # (proposal, gt) pair and retires both
        for j in range(min(len(prediction), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            # record the iou coverage of this gt box
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1
        # append recorded iou coverage level
        gt_overlaps.append(_gt_overlaps)
    gt_overlaps = torch.cat(gt_overlaps, dim=0)
    gt_overlaps, _ = torch.sort(gt_overlaps)
    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }
def calc_detection_voc_prec_rec(gt_boxlists, pred_boxlists, iou_thresh=0.5):
    """Calculate precision and recall based on evaluation code of PASCAL VOC.

    This function calculates precision and recall of predicted bounding boxes
    obtained from a dataset which has :math:`N` images. The code is based on
    the evaluation code used in PASCAL VOC Challenge.

    Arguments:
        gt_boxlists: per-image ground-truth BoxLists with "labels" and
            "difficult" fields
        pred_boxlists: per-image predicted BoxLists with "labels" and
            "scores" fields
        iou_thresh: IoU above which a prediction matches a ground truth

    Returns (prec, rec): lists indexed by class label; entries are cumulative
    precision/recall arrays, or None for classes with no data.
    """
    n_pos = defaultdict(int)
    score = defaultdict(list)
    match = defaultdict(list)
    for gt_boxlist, pred_boxlist in zip(gt_boxlists, pred_boxlists):
        pred_bbox = pred_boxlist.bbox.numpy()
        pred_label = pred_boxlist.get_field("labels").numpy()
        pred_score = pred_boxlist.get_field("scores").numpy()
        gt_bbox = gt_boxlist.bbox.numpy()
        gt_label = gt_boxlist.get_field("labels").numpy()
        gt_difficult = gt_boxlist.get_field("difficult").numpy()
        # evaluate every class that appears in either gt or predictions
        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # sort by score
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]
            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]
            # "difficult" gt boxes do not count as positives
            n_pos[l] += np.logical_not(gt_difficult_l).sum()
            score[l].extend(pred_score_l)
            if len(pred_bbox_l) == 0:
                continue
            if len(gt_bbox_l) == 0:
                # every prediction is a false positive for this class
                match[l].extend((0, ) * pred_bbox_l.shape[0])
                continue
            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1
            iou = boxlist_iou(
                BoxList(pred_bbox_l, gt_boxlist.size),
                BoxList(gt_bbox_l, gt_boxlist.size),
            ).numpy()
            gt_index = iou.argmax(axis=1)
            # set -1 if there is no matching ground truth
            gt_index[iou.max(axis=1) < iou_thresh] = -1
            del iou
            # match codes: 1 = true positive, 0 = false positive,
            # -1 = matched a "difficult" gt (ignored in prec/rec)
            selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
            for gt_idx in gt_index:
                if gt_idx >= 0:
                    if gt_difficult_l[gt_idx]:
                        match[l].append(-1)
                    else:
                        if not selec[gt_idx]:
                            match[l].append(1)
                        else:
                            # gt already claimed by a higher-scored prediction
                            match[l].append(0)
                    selec[gt_idx] = True
                else:
                    match[l].append(0)
    n_fg_class = max(n_pos.keys()) + 1
    prec = [None] * n_fg_class
    rec = [None] * n_fg_class
    for l in n_pos.keys():
        score_l = np.array(score[l])
        match_l = np.array(match[l], dtype=np.int8)
        order = score_l.argsort()[::-1]
        match_l = match_l[order]
        tp = np.cumsum(match_l == 1)
        fp = np.cumsum(match_l == 0)
        # If an element of fp + tp is 0,
        # the corresponding element of prec[l] is nan.
        prec[l] = tp / (fp + tp)
        # If n_pos[l] is 0, rec[l] is None.
        if n_pos[l] > 0:
            rec[l] = tp / n_pos[l]
    return prec, rec
def prepare_boxlist(self, boxes, scores, image_shape):
    """Wrap raw xyxy boxes into a BoxList carrying `scores` as a field."""
    result = BoxList(boxes, image_shape, mode="xyxy")
    result.add_field("scores", scores)
    return result
def _compute_prediction(self):
    '''The main loop of action prediction worker

    The main task of this separate process is compute the action score.
    However it behaves differently depends on whether it is in realtime mode.
    In realtime mode, it will compute the action scores right after the
    feature update.
    In video mode, the prediction won't be done until an explicit call of
    compute_prediction()
    '''
    # becomes True once the producer sends the "Done" sentinel
    empty_flag = False
    for i in count():
        if self.stopped:
            print("Avaworker stopped")
            return
        # if all video data have been processed and compute_prediction() has been called
        # compute predictions
        if self.task_done == True and empty_flag:
            print("The input queue is empty. Start working on prediction")
            for center_timestamp, video_size, ids in tqdm(self.timestamps):
                predictions = self.ava_predictor.compute_prediction(
                    center_timestamp // self.interval, video_size)
                self.output_queue.put((predictions, center_timestamp, ids))
            print("Prediction is done.")
            self.output_queue.put("done")
            self._task_done.value = False
        try:
            # short timeout so the loop keeps polling stop/done flags
            extra, video_size = self.input_queue.get(timeout=1)
        except queue.Empty:
            continue
        except FileNotFoundError:
            # NOTE(review): presumably raised by the queue's underlying pipe
            # when the producer side goes away — treated as transient here
            continue
        if extra == "Done":
            empty_flag = True
            continue
        frame, cur_millis, boxes, scores, ids = extra
        # maintain a sliding window of the most recent frames and metadata
        self.frame_stack.append(frame)
        self.extra_stack.append((cur_millis, boxes, scores, ids))
        self.frame_stack = self.frame_stack[-self.frame_buffer_numbers:]
        self.extra_stack = self.extra_stack[-self.frame_buffer_numbers:]
        # Predict action once per interval
        if len(
                self.frame_stack
        ) >= self.frame_buffer_numbers and cur_millis > self.last_milli + self.interval:
            self.last_milli = cur_millis
            # [..., ::-1] flips the channel order (BGR <-> RGB)
            frame_arr = np.stack(self.frame_stack)[..., ::-1]
            # the clip is keyed on its center frame
            center_index = self.frame_buffer_numbers // 2
            center_timestamp, person_boxes, person_scores, person_ids = self.extra_stack[
                center_index]
            if person_boxes is None or len(person_boxes) == 0:
                continue
            kframe = self.frame_stack[center_index]
            center_timestamp = int(center_timestamp)
            video_data, _, transform_randoms = self.vid_transforms(
                frame_arr, None)
            # detect objects on the key frame only
            kframe_data = self.coco_det.image_preprocess(kframe)
            im_dim_list_k = kframe.shape[1], kframe.shape[0]
            im_dim_list_k = torch.FloatTensor(im_dim_list_k).repeat(1, 2)
            dets = self.coco_det.images_detection(kframe_data, im_dim_list_k)
            # an int (error code) or empty tensor means no detections
            if isinstance(dets, int) or dets.shape[0] == 0:
                obj_boxes = torch.zeros((0, 4))
            else:
                obj_boxes = dets[:, 1:5].cpu()
            obj_boxes = BoxList(obj_boxes, video_size, "xyxy").clip_to_image()
            person_box = BoxList(person_boxes, video_size, "xyxy").clip_to_image()
            self.ava_predictor.update_feature(
                video_data, person_box, obj_boxes,
                center_timestamp // self.interval, transform_randoms)
            if self.realtime:
                predictions = self.ava_predictor.compute_prediction(
                    center_timestamp // self.interval, video_size)
                #print(len(predictions.get_field("scores")), person_ids)
                self.output_queue.put(
                    (predictions, center_timestamp, person_ids[:, 0]))
            else:
                # if not realtime, timestamps will be saved and the predictions will be computed later.
                self.timestamps.append(
                    (center_timestamp, video_size, person_ids[:, 0]))
def return_null_box(self, im_w, im_h):
    """Return an empty (zero-box) xyxy BoxList sized to the image."""
    empty_coords = torch.zeros((0, 4))
    return BoxList(empty_coords, (im_w, im_h), mode="xyxy")