Example #1
def do_inference(cfg, model, sample: DataSample, transforms=None,
                 given_detection: DataSample = None) -> DataSample:
    """
    Do inference on a specific video (sample)
    :param cfg: configuration file of the model
    :param model: a pytorch model
    :param sample: a testing video
    :param transforms: image-wise transform that prepares
           video frames for processing
    :param given_detection: cached detections from another model;
           when provided, the detection branch is disabled in the
           model forward pass
    :return: the detection results in the format of DataSample
    """
    logger = logging.getLogger(__name__)
    model.eval()
    gpu_device = torch.device('cuda')

    video_loader = build_video_loader(cfg, sample, transforms)

    sample_result = DataSample(sample.id, raw_info=None, metadata=sample.metadata)
    network_time = 0
    for (video_clip, frame_id, timestamps) in tqdm(video_loader):
        frame_id = frame_id.item()
        timestamps = torch.squeeze(timestamps, dim=0).tolist()
        video_clip = torch.squeeze(video_clip, dim=0)

        frame_detection = None
        # use the publicly provided detections (e.g. MOT17, HiEve);
        # they must first be ingested into a DataSample
        # (the ingested detections are provided; see readme/DATA.md for details)
        if given_detection:
            frame_detection = given_detection.get_entities_for_frame_num(frame_id)
            frame_detection = convert_given_detections_to_boxlist(frame_detection,
                                                                  sample.width,
                                                                  sample.height)
            frame_height, frame_width = video_clip.shape[-2:]
            frame_detection = frame_detection.resize((frame_width, frame_height))
            frame_detection = [frame_detection.to(gpu_device)]

        with torch.no_grad():
            video_clip = video_clip.to(gpu_device)
            torch.cuda.synchronize()
            network_start_time = time.time()
            output_boxlists = model(video_clip, given_detection=frame_detection)
            torch.cuda.synchronize()
            network_time += time.time() - network_start_time

        # Resize to original image size and to xywh mode
        output_boxlists = [o.resize([sample.width, sample.height]).convert('xywh')
                           for o in output_boxlists]
        output_boxlists = [o.to(torch.device("cpu")) for o in output_boxlists]
        output_entities = boxlists_to_entities(output_boxlists, frame_id, timestamps)
        for entity in output_entities:
            sample_result.add_entity(entity)

    logger.info('Sample_id {} / Speed {} fps'.format(sample.id, len(sample) / network_time))

    return sample_result
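
A minimal usage sketch for do_inference follows. The config loader, model factory, transform builder, and dataset lookup (load_config, build_model, build_transforms, dataset) are hypothetical placeholders for whatever the surrounding codebase provides; only the do_inference call itself is taken from the example above.

# Hypothetical driver code; the helper names are assumptions, not the project's real API.
import torch

cfg = load_config("configs/tracker.yaml")            # assumed config loader
model = build_model(cfg).to(torch.device("cuda"))    # assumed model factory
transforms = build_transforms(cfg, is_train=False)   # assumed transform builder
sample = dataset.get_sample("MOT17-02")              # assumed DataSample lookup

# Run detection/tracking over the whole video; results come back as a DataSample
result = do_inference(cfg, model, sample, transforms=transforms)
print("collected {} entities".format(len(result.entities)))
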
Example #2
def sample_from_mot_csv(csv_path, fps, sample=None, mot17=True, has_gt=False):
    if sample is None:
        id_ = Path(csv_path).stem
        sample = DataSample(id_)
    else:
        sample = sample.get_copy_without_entities()
    with open(csv_path, newline='') as f:
        reader = csv.reader(f, delimiter=',')

        def coord(x):
            return round(float(x))

        for row in reader:
            frame_num = int(row[0])
            obj_id = row[1]
            x = coord(row[2])
            y = coord(row[3])
            w = coord(row[4])
            h = coord(row[5])
            conf = float(row[6])
            # If not MOT17, the last 3 columns are 3D coordinates, which are usually -1
            # (see p. 9 of https://arxiv.org/pdf/1504.01942.pdf)
            if has_gt and mot17:
                label = int(row[7])
                visibility = float(row[8])
            else:
                label = 1
                visibility = 1

            label_text = MOT_LABEL_MAP[label]

            # NOTE: all classes other than Pedestrian have confidence 0, so they are
            # ingested here but ignored at evaluation time,
            # i.e. (label != 1 and conf) is never true
            assert not (label != 1 and conf)
            # ("Pedestrian") is just a string, so `in` would do a substring test;
            # compare for equality instead
            has_person_label = label_text == "Pedestrian"

            time_ms = int((frame_num - 1) / fps * 1000)
            entity = AnnoEntity(time=time_ms, id=obj_id)
            entity.bbox = [x, y, w, h]
            blob = {
                "frame_csv": frame_num,
                "frame_idx": frame_num - 1,
                "visibility": visibility
            }
            entity.labels = {}
            # entity.labels["person"] = 1
            if has_person_label:
                entity.labels["person"] = 1
            else:
                entity.labels[str(label)] = 1
            entity.labels["vis"] = visibility

            entity.confidence = conf
            entity.blob = blob

            sample.add_entity(entity)
    return sample
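
A short usage sketch for sample_from_mot_csv, assuming a local MOT17 checkout; the path and frame rate below are illustrative, not taken from the example. MOT ground truth ships as comma-separated gt.txt files, which csv.reader handles directly.

# Illustrative call; the path and fps are assumptions about the local data layout.
gt_sample = sample_from_mot_csv(
    "MOT17/train/MOT17-02-FRCNN/gt/gt.txt",  # comma-separated MOT ground truth
    fps=30,
    mot17=True,
    has_gt=True,
)
print("ingested {} entities".format(len(gt_sample.entities)))
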
Example #3
def eval_det_ap(gt: list, pred: dict, class_table=None, data_filter_fn=None, iou_threshold=[0.5]):
    """
    Evaluate the detection performance (COCO-style ap) on PoseTrack dataset
    :param gt: ground truth annotations for all videos
    :type gt: dict(vid_id: DataSample)
    :param pred: predictions for all videos
    :type pred: dict(vid_id: DataSample)
    :param data_filter_fn: a callable function that filters out detections that are not considered during evaluation
    :param class_table: class table specify the class order
    :param iou_threshold:
    :return: Average Precision (AP) over different thresholds
    """
    if class_table is None:
        class_table = ["person"]
    num_classes = len(class_table)

    all_scores = [[[] for _ in range(len(iou_threshold))] for _ in range(num_classes)]
    all_pr_ious = [[[] for _ in range(len(iou_threshold))] for _ in range(num_classes)]
    all_gt_ious = [[[] for _ in range(len(iou_threshold))] for _ in range(num_classes)]

    for (vid_id, vid_gt) in tqdm(gt):
        vid_pred = pred[vid_id]

        eval_frame_idxs = vid_gt.get_non_empty_frames()

        # Loop over all classes
        for class_id in range(0, num_classes):
            gt_class_entities = vid_gt.entities
            # gt_class_entities = vid_gt.get_entities_with_label(class_table[class_id])
            pred_class_entities = vid_pred.get_entities_with_label(class_table[class_id])

            # Wrap entities to a DataSample
            vid_class_gt = DataSample(vid_id, metadata=vid_gt.metadata)
            vid_class_pred = DataSample(vid_id, metadata=vid_pred.metadata)
            for _entity in gt_class_entities:
                vid_class_gt.add_entity(_entity)
            for _entity in pred_class_entities:
                vid_class_pred.add_entity(_entity)

            # Get AP for this class and video
            vid_class_scores, vid_class_pr_ious, vid_class_gt_ious = \
                get_ap(vid_class_gt, vid_class_pred, data_filter_fn, eval_frame_idxs, iou_threshold)

            for iou_id in range(len(iou_threshold)):
                all_scores[class_id][iou_id] += vid_class_scores[iou_id]
                all_pr_ious[class_id][iou_id] += vid_class_pr_ious[iou_id]
                all_gt_ious[class_id][iou_id] += vid_class_gt_ious[iou_id]

    class_ap_matrix = np.zeros((num_classes, len(iou_threshold)))
    for class_id in range(num_classes):
        class_ap_matrix[class_id, :] = compute_AP(all_scores[class_id],
                                                  all_pr_ious[class_id],
                                                  all_gt_ious[class_id])

    return class_ap_matrix
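
A hypothetical evaluation driver for eval_det_ap is sketched below. gt_samples and pred_samples stand in for dicts mapping video ids to DataSample objects built elsewhere; note that gt is passed as (vid_id, DataSample) pairs while pred is indexed by vid_id, matching how the function iterates them.

# Hypothetical driver; gt_samples / pred_samples are placeholder dicts of vid_id -> DataSample.
gt = [(vid_id, gt_samples[vid_id]) for vid_id in sorted(gt_samples)]
pred = {vid_id: pred_samples[vid_id] for vid_id in sorted(gt_samples)}

ap_matrix = eval_det_ap(gt, pred,
                        class_table=["person"],
                        iou_threshold=[0.5, 0.75])
# Rows index classes, columns index IoU thresholds
print("person AP@0.5  :", ap_matrix[0, 0])
print("person AP@0.75 :", ap_matrix[0, 1])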