def do_inference(cfg, model, sample: DataSample, transforms=None, given_detection: DataSample = None) -> DataSample:
    """
    Run inference on a specific video (sample)
    :param cfg: configuration of the model
    :param model: a pytorch model
    :param sample: a testing video
    :param transforms: image-wise transform that prepares video frames for processing
    :param given_detection: cached detections from another model; when provided,
        the detection branch is disabled in the model forward pass
    :return: the detection results in the format of DataSample
    """
    logger = logging.getLogger(__name__)
    model.eval()
    gpu_device = torch.device('cuda')

    video_loader = build_video_loader(cfg, sample, transforms)
    sample_result = DataSample(sample.id, raw_info=None, metadata=sample.metadata)

    network_time = 0
    for (video_clip, frame_id, timestamps) in tqdm(video_loader):
        frame_id = frame_id.item()
        timestamps = torch.squeeze(timestamps, dim=0).tolist()
        video_clip = torch.squeeze(video_clip, dim=0)

        frame_detection = None
        # Use the publicly provided detections (e.g. MOT17, HiEve). These need to
        # be ingested into a DataSample first; see readme/DATA.md for details.
        if given_detection:
            frame_detection = given_detection.get_entities_for_frame_num(frame_id)
            frame_detection = convert_given_detections_to_boxlist(frame_detection,
                                                                  sample.width,
                                                                  sample.height)
            frame_height, frame_width = video_clip.shape[-2:]
            frame_detection = frame_detection.resize((frame_width, frame_height))
            frame_detection = [frame_detection.to(gpu_device)]

        with torch.no_grad():
            video_clip = video_clip.to(gpu_device)
            torch.cuda.synchronize()
            network_start_time = time.time()
            output_boxlists = model(video_clip, given_detection=frame_detection)
            torch.cuda.synchronize()
            network_time += time.time() - network_start_time

        # Resize to the original image size and convert to xywh mode
        output_boxlists = [o.resize([sample.width, sample.height]).convert('xywh')
                           for o in output_boxlists]
        output_boxlists = [o.to(torch.device("cpu")) for o in output_boxlists]
        output_entities = boxlists_to_entities(output_boxlists, frame_id, timestamps)
        for entity in output_entities:
            sample_result.add_entity(entity)

    logger.info('Sample_id {} / Speed {} fps'.format(sample.id, len(sample) / network_time))
    return sample_result
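# The resize-and-convert step above delegates to BoxList; purely for orientation,
# here is a minimal, self-contained sketch of the same coordinate math (a
# hypothetical helper, not part of this repo), assuming (x1, y1, x2, y2) input:
def _rescale_to_xywh(box_xyxy, scale_x, scale_y):
    """Rescale an (x1, y1, x2, y2) box and convert it to (x, y, w, h)."""
    x1, y1, x2, y2 = box_xyxy
    x1, x2 = x1 * scale_x, x2 * scale_x
    y1, y2 = y1 * scale_y, y2 * scale_y
    return [x1, y1, x2 - x1, y2 - y1]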
def sample_from_mot_csv(csv_path, fps, sample=None, mot17=True, has_gt=False):
    if sample is None:
        id_ = Path(csv_path).stem
        sample = DataSample(id_)
    else:
        sample = sample.get_copy_without_entities()
    with open(csv_path, newline='') as f:
        reader = csv.reader(f, delimiter=',')

        def coord(x):
            return round(float(x))

        for row in reader:
            frame_num = int(row[0])
            obj_id = row[1]
            x = coord(row[2])
            y = coord(row[3])
            w = coord(row[4])
            h = coord(row[5])
            conf = float(row[6])
            # If not mot17, the last 3 columns are 3D coordinates, which are usually -1
            # (see pg. 9 of https://arxiv.org/pdf/1504.01942.pdf)
            if has_gt and mot17:
                label = int(row[7])
                visibility = float(row[8])
            else:
                label = 1
                visibility = 1
            label_text = MOT_LABEL_MAP[label]

            # NOTE: all classes that aren't Pedestrian have confidence 0, so they
            # are ingested but ignored at evaluation time,
            # i.e. (label != 1 and conf) is never true
            assert not (label != 1 and conf)
            has_person_label = label_text == "Pedestrian"

            time_ms = int((frame_num - 1) / fps * 1000)
            entity = AnnoEntity(time=time_ms, id=obj_id)
            entity.bbox = [x, y, w, h]
            blob = {
                "frame_csv": frame_num,
                "frame_idx": frame_num - 1,
                "visibility": visibility
            }
            entity.labels = {}
            if has_person_label:
                entity.labels["person"] = 1
            else:
                entity.labels[str(label)] = 1
            entity.labels["vis"] = visibility
            entity.confidence = conf
            entity.blob = blob

            sample.add_entity(entity)
    return sample
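# For reference, the MOT CSV columns parsed above follow the MOT16/17 gt.txt
# layout (in MOT15, the trailing class/visibility columns are 3D coordinates):
#   frame, id, bb_left, bb_top, bb_width, bb_height, conf, class, visibility
# A hypothetical call, with fps taken from the sequence's metadata:
#   sample = sample_from_mot_csv('MOT17-02-DPM/gt/gt.txt', fps=30, has_gt=True)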
def eval_det_ap(gt: list, pred: dict, class_table=None, data_filter_fn=None, iou_threshold=[0.5]):
    """
    Evaluate the detection performance (COCO-style AP) on the PoseTrack dataset
    :param gt: ground truth annotations for all videos,
        iterated as (vid_id, DataSample) pairs
    :param pred: predictions for all videos
    :type pred: dict(vid_id: DataSample)
    :param data_filter_fn: a callable that filters out detections that are not
        considered during evaluation
    :param class_table: class table specifying the class order
    :param iou_threshold: list of IoU thresholds at which AP is computed
    :return: Average Precision (AP) over the different thresholds
    """
    if class_table is None:
        class_table = ["person"]
    num_classes = len(class_table)

    all_scores = [[[] for _ in range(len(iou_threshold))] for _ in range(num_classes)]
    all_pr_ious = [[[] for _ in range(len(iou_threshold))] for _ in range(num_classes)]
    all_gt_ious = [[[] for _ in range(len(iou_threshold))] for _ in range(num_classes)]

    for (vid_id, vid_gt) in tqdm(gt):
        vid_pred = pred[vid_id]
        eval_frame_idxs = vid_gt.get_non_empty_frames()

        # Loop over all classes
        for class_id in range(0, num_classes):
            gt_class_entities = vid_gt.entities
            # gt_class_entities = vid_gt.get_entities_with_label(class_table[class_id])
            pred_class_entities = vid_pred.get_entities_with_label(class_table[class_id])

            # Wrap entities in a DataSample
            vid_class_gt = DataSample(vid_id, metadata=vid_gt.metadata)
            vid_class_pred = DataSample(vid_id, metadata=vid_pred.metadata)
            for _entity in gt_class_entities:
                vid_class_gt.add_entity(_entity)
            for _entity in pred_class_entities:
                vid_class_pred.add_entity(_entity)

            # Get AP for this class and video
            vid_class_scores, vid_class_pr_ious, vid_class_gt_ious = \
                get_ap(vid_class_gt, vid_class_pred, data_filter_fn, eval_frame_idxs, iou_threshold)

            for iou_id in range(len(iou_threshold)):
                all_scores[class_id][iou_id] += vid_class_scores[iou_id]
                all_pr_ious[class_id][iou_id] += vid_class_pr_ious[iou_id]
                all_gt_ious[class_id][iou_id] += vid_class_gt_ious[iou_id]

    class_ap_matrix = np.zeros((num_classes, len(iou_threshold)))
    for class_id in range(num_classes):
        class_ap_matrix[class_id, :] = compute_AP(all_scores[class_id],
                                                  all_pr_ious[class_id],
                                                  all_gt_ious[class_id])

    return class_ap_matrix
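# Hypothetical usage sketch: `gt` is iterated as (vid_id, DataSample) pairs while
# `pred` is indexed by vid_id, so a natural call looks like:
#   ap_matrix = eval_det_ap(list(gt_dict.items()), pred_dict,
#                           iou_threshold=[0.5, 0.75])
#   mean_ap = ap_matrix.mean()  # (num_classes, num_thresholds) -> scalar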
def _postprocess_tracks(self, tracks: DataSample):
    """
    Post-process the tracks to filter out short and low-confidence tracks
    :param tracks: un-filtered tracks
    :return: filtered tracks to be used for evaluation
    """
    track_ids = set()
    for _entity in tracks.entities:
        if _entity.id not in track_ids and _entity.id >= 0:
            track_ids.add(_entity.id)

    filter_tracks = tracks.get_copy_without_entities()
    for _id in track_ids:
        _id_entities = tracks.get_entities_with_id(_id)
        _track_conf = np.mean([_e.confidence for _e in _id_entities])
        # Keep a track only if it is long enough and confident enough on average
        if len(_id_entities) >= self._track_len \
                and _track_conf >= self._track_conf:
            for _entity in _id_entities:
                filter_tracks.add_entity(_entity)
    return filter_tracks
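# The filtering rule above in isolation (a hypothetical standalone restatement,
# convenient for unit-testing the two thresholds):
def _keep_track(confidences, min_len, min_conf):
    """A track survives only if it is long enough and confident enough on average."""
    return len(confidences) >= min_len and float(np.mean(confidences)) >= min_conf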
def get_ap(vid_class_gt: DataSample, vid_class_pred: DataSample, filter_fn, eval_frame_idxs, iou_thresh=[0.5]):
    """
    :param vid_class_gt: the ground truths for a specific class, in DataSample format
    :param vid_class_pred: the predictions for a specific class, in DataSample format
    :param filter_fn: a callable function to filter out detections
    :param eval_frame_idxs: the frame indices where evaluation happens
    :param iou_thresh: the list of IoU thresholds that determine whether a detection is a TP
    :returns vid_scores: the confidence of every predicted entity (a Python list)
             vid_pr_ious: the IoU between each predicted entity and its matching gt entity (a Python list)
             vid_gt_ious: the IoU between each gt entity and its matching predicted entity (a Python list)
    """
    if not isinstance(iou_thresh, list):
        iou_thresh = [iou_thresh]
    vid_scores = [[] for _ in iou_thresh]
    vid_pr_ious = [[] for _ in iou_thresh]
    vid_gt_ious = [[] for _ in iou_thresh]

    for frame_idx in eval_frame_idxs:
        gt_entities = vid_class_gt.get_entities_for_frame_num(frame_idx)
        pred_entities = vid_class_pred.get_entities_for_frame_num(frame_idx)

        # Remove detections within ignore regions from evaluation
        if filter_fn is not None:
            # Filter out ignored gt entities
            gt_entities, ignore_gt_entities = filter_fn(gt_entities, meta_data=vid_class_gt.metadata)
            # Filter out predicted entities that overlap with ignored gt entities
            pred_entities, ignore_pred_entities = filter_fn(pred_entities, ignore_gt_entities)

        # Sort the entities by confidence score
        pred_entities = sorted(pred_entities, key=lambda x: x.confidence, reverse=True)
        iou_matrix = bbs_iou(pred_entities, gt_entities)
        scores = [entity.confidence for entity in pred_entities]

        for i, _iou in enumerate(iou_thresh):
            pred_ious, gt_ious = greedy_matching(copy.deepcopy(iou_matrix), _iou)

            vid_scores[i] += scores
            vid_pr_ious[i] += pred_ious
            vid_gt_ious[i] += gt_ious

    return vid_scores, vid_pr_ious, vid_gt_ious
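# `greedy_matching` is defined elsewhere in the repo; as orientation only, here
# is a minimal sketch of a standard greedy IoU assignment (not necessarily the
# exact implementation used here). Rows are predictions already sorted by
# confidence, columns are ground truths; each gt is matched at most once:
def _greedy_matching_sketch(iou_matrix, iou_thresh):
    num_pred, num_gt = iou_matrix.shape
    pred_ious, gt_ious = [0.0] * num_pred, [0.0] * num_gt
    gt_taken = [False] * num_gt
    for p in range(num_pred):
        # Pick the unmatched gt with the highest IoU at or above the threshold
        best_g, best_iou = -1, iou_thresh
        for g in range(num_gt):
            if not gt_taken[g] and iou_matrix[p, g] >= best_iou:
                best_g, best_iou = g, iou_matrix[p, g]
        if best_g >= 0:
            gt_taken[best_g] = True
            pred_ious[p] = gt_ious[best_g] = best_iou
    return pred_ious, gt_ious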
def __init__(self, video: DataSample, clip_len=1, transforms=None):
    """
    Construct a data loader for inference
    :param video: a video stream in DataSample format
    :param clip_len: the length of video clips
    :param transforms: transform function for video pre-processing
    """
    self.video = video
    self.video_reader = video.get_data_reader()
    self.clip_len = clip_len
    self.transforms = transforms
    # Start index of every clip, stepping through the video clip_len frames at a time
    self.clip_idxs = list(range(0, len(self.video), self.clip_len))
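# For example (illustrative): a 10-frame video with clip_len=4 yields the clip
# start indices [0, 4, 8], the last clip being shorter than clip_len:
#   >>> list(range(0, 10, 4))
#   [0, 4, 8]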
def _inference_on_video(self, sample):
    cache_path = os.path.join(self._output_dir, '{}.json'.format(sample.id))
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)

    if os.path.exists(cache_path):
        sample_result = DataSample.load(cache_path)
    else:
        given_detection = None
        if self._pub_detection:
            given_detection = self._pub_detection[sample.id]
        sample_result = do_inference(self._cfg,
                                     self._model,
                                     sample,
                                     transforms=self._transform,
                                     given_detection=given_detection)
        sample_result.dump(cache_path)
    return sample_result
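# The per-video caching pattern above, factored out as a hypothetical generic
# helper (DataSample.load / DataSample.dump do the (de)serialization in this repo):
def _load_or_compute(cache_path, compute_fn):
    """Return the cached result if it exists; otherwise compute and cache it."""
    if os.path.exists(cache_path):
        return DataSample.load(cache_path)
    result = compute_fn()
    result.dump(cache_path)
    return result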
def main(args, description="Initial ingestion", det_options=None, mot17=True):
    if mot17:
        if det_options is not None and not all(x in DET_OPTIONS for x in det_options):
            raise ValueError("Det options were {} but must be only: {}".format(
                det_options, DET_OPTIONS))
        if det_options is None:
            det_options = DET_OPTIONS
    else:
        print("Ingesting MOT15, ignoring det options {}".format(det_options))
        det_options = [""]

    dataset_path = args.dataset_path
    out_filename = args.anno_name

    out_dataset = GluonCVMotionDataset(out_filename, dataset_path, load_anno=False)
    metadata = {
        FieldNames.DESCRIPTION: description,
        FieldNames.DATE_MODIFIED: str(datetime.datetime.now()),
    }
    out_dataset.metadata = metadata

    splits = {
        "train": os.path.join(out_dataset.data_root_path, "train"),
        "test": os.path.join(out_dataset.data_root_path, "test"),  # No gt for MOT test
    }

    for det_option in det_options:
        for split_name, split_path in splits.items():
            subdirs = glob.glob(os.path.join(split_path, "*" + det_option))
            for i, subdir in enumerate(subdirs):
                vid_id = os.path.basename(subdir)
                vid_path = subdir  # glob already returns the path joined with split_path

                sample = DataSample(vid_id)

                if mot17:
                    info_path = os.path.join(vid_path, "seqinfo.ini")
                    config = configparser.ConfigParser()
                    config.read(info_path)
                    seq_conf = config["Sequence"]
                    fps = float(seq_conf['frameRate'])
                    num_frames = int(seq_conf['seqLength'])
                    width = int(seq_conf['imWidth'])
                    height = int(seq_conf['imHeight'])
                else:
                    # Assume 30 fps
                    fps = 30
                    im_paths = glob.glob(os.path.join(vid_path, "img1", "*.jpg"))
                    num_frames = len(im_paths)
                    im_example = Image.open(im_paths[0])
                    width = im_example.width
                    height = im_example.height

                rel_base_dir = vid_path.replace(out_dataset.data_root_path, "").lstrip(os.path.sep)
                rel_base_dir = os.path.join(rel_base_dir, "img1")
                metadata = {
                    FieldNames.DATA_PATH: rel_base_dir,
                    FieldNames.FPS: fps,
                    FieldNames.NUM_FRAMES: num_frames,
                    FieldNames.RESOLUTION: {"width": width, "height": height},
                }
                sample.metadata = metadata

                gt_path = os.path.join(vid_path, "gt/gt.txt")
                det_path = os.path.join(vid_path, "det/det.txt")
                has_gt = os.path.exists(gt_path)
                anno_path = gt_path if has_gt else det_path

                sample = sample_from_mot_csv(anno_path, fps, sample, mot17, has_gt)

                out_dataset.add_sample(sample)

                print("Done {} sample {}/{}, {}".format(
                    split_name, i + 1, len(subdirs), vid_id))

    out_dataset.dump()
    return out_dataset
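# For reference, the seqinfo.ini parsed above follows the standard MOT17 layout
# (illustrative values; main() only reads frameRate, seqLength, imWidth, imHeight):
#   [Sequence]
#   name=MOT17-02-DPM
#   imDir=img1
#   frameRate=30
#   seqLength=600
#   imWidth=1920
#   imHeight=1080
#   imExt=.jpg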