Example #1
def frame_extraction(video_path, short_side):
    """Extract frames given video_path.

    Args:
        video_path (str): The video_path.
    """
    # Load the video, extract frames into ./tmp/video_name
    target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
    os.makedirs(target_dir, exist_ok=True)
    # Should be able to handle videos up to several hours
    frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
    vid = cv2.VideoCapture(video_path)
    frames = []
    frame_paths = []
    flag, frame = vid.read()
    cnt = 0
    new_h, new_w = None, None
    while flag:
        if new_h is None:
            h, w, _ = frame.shape
            new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf))

        frame = mmcv.imresize(frame, (new_w, new_h))

        frames.append(frame)
        frame_path = frame_tmpl.format(cnt + 1)
        frame_paths.append(frame_path)

        cv2.imwrite(frame_path, frame)
        cnt += 1
        flag, frame = vid.read()

    return frame_paths, frames
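A minimal usage sketch of the helper above (the video path is a hypothetical placeholder, and the example's own imports are assumed): it writes JPEG frames, rescaled so their short side equals `short_side`, under `./tmp/<video name>/` and returns both the file paths and the in-memory frames.

# Hypothetical usage of frame_extraction defined above; 'demo.mp4' is a placeholder path.
frame_paths, frames = frame_extraction('demo.mp4', short_side=256)
print(len(frame_paths), frames[0].shape)  # number of extracted frames and (new_h, new_w, 3)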
Example #2
 def rescale(self, scale, interpolation=None):
     """see :func:`BaseInstanceMasks.rescale`"""
     new_w, new_h = mmcv.rescale_size((self.width, self.height), scale)
     if len(self.masks) == 0:
         rescaled_masks = PolygonMasks([], new_h, new_w)
     else:
         rescaled_masks = self.resize((new_h, new_w))
     return rescaled_masks
Example #3
 def rescale(self, scale, interpolation='nearest'):
     """See :func:`BaseInstanceMasks.rescale`."""
     if len(self.masks) == 0:
         new_w, new_h = mmcv.rescale_size((self.width, self.height), scale)
         rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8)
     else:
         rescaled_masks = np.stack([
             mmcv.imrescale(mask, scale, interpolation=interpolation)
             for mask in self.masks
         ])
     height, width = rescaled_masks.shape[1:]
     return BitmapMasks(rescaled_masks, height, width)
Example #4
def _resize_frames(frame_list,
                   scale,
                   keep_ratio=True,
                   interpolation='bilinear'):
    """resize frames according to given scale.

    Codes are modified from `mmaction2/datasets/pipelines/augmentation.py`,
    `Resize` class.

    Args:
        frame_list (list[np.ndarray]): frames to be resized.
        scale (tuple[int]): If keep_ratio is True, it serves as scaling
            factor or maximum size: the image will be rescaled as large
            as possible within the scale. Otherwise, it serves as (w, h)
            of output size.
        keep_ratio (bool): If set to True, Images will be resized without
            changing the aspect ratio. Otherwise, it will resize images to a
            given size. Default: True.
        interpolation (str): Algorithm used for interpolation:
            "nearest" | "bilinear". Default: "bilinear".
    Returns:
        list[np.ndarray]: The resized frames.
    """
    if scale is None or (scale[0] == -1 and scale[1] == -1):
        return frame_list
    scale = tuple(scale)
    max_long_edge = max(scale)
    max_short_edge = min(scale)
    if max_short_edge == -1:
        scale = (np.inf, max_long_edge)

    img_h, img_w, _ = frame_list[0].shape

    if keep_ratio:
        new_w, new_h = mmcv.rescale_size((img_w, img_h), scale)
    else:
        new_w, new_h = scale

    frame_list = [
        mmcv.imresize(img, (new_w, new_h), interpolation=interpolation)
        for img in frame_list
    ]

    return frame_list
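A small hypothetical call of `_resize_frames` above (dummy frames, assuming the example's own mmcv and numpy imports): with `keep_ratio=True` and `scale=(-1, 256)` the frames are rescaled so the short side becomes 256.

import numpy as np

# Three dummy 480x640 frames; resized so the short side becomes 256 (values are illustrative).
dummy_frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(3)]
resized = _resize_frames(dummy_frames, scale=(-1, 256), keep_ratio=True)
print(resized[0].shape)  # (256, 341, 3): 640 * 256 / 480 rounds to 341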
Example #5
 def _resize_img(self, results):
     for key in results.get('img_fields', ['img']):
         h, w = results[key].shape[:2]
         dw = w * self.jitter
         dh = h * self.jitter
         new_ar = (w + np.random.uniform(-dw, dw)) / (h + np.random.uniform(-dh, dh))
         w = h * new_ar
         if self.keep_ratio:
             scale = mmcv.rescale_size((w, h), results['scale'])
         else:
             scale = results['scale']
         img, w_scale, h_scale = mmcv.imresize(results[key], scale, return_scale=True)
         scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
         results[key] = img
         results['img_shape'] = img.shape
         results['pad_shape'] = img.shape  # in case that there is no padding
         results['scale_factor'] = scale_factor
         results['keep_ratio'] = self.keep_ratio
Example #6
    def __call__(self, results):
        w, h = results['img_info']['width'], results['img_info']['height']
        if self.keep_ratio:
            (new_w, new_h) = rescale_size((w, h),
                                          self.img_scale,
                                          return_scale=False)
        else:
            (new_w, new_h) = self.img_scale

        w_scale = new_w / w
        h_scale = new_h / h
        scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
                                dtype=np.float32)
        results['img_shape'] = (new_h, new_w, 1)
        results['scale_factor'] = scale_factor
        results['keep_ratio'] = self.keep_ratio

        return results
Example #7
    def __call__(self, results):
        """Performs the ResizeWithBox augmentation.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        if 'scale_factor' not in results:
            results['scale_factor'] = np.array([1, 1], dtype=np.float32)
        img_h, img_w = results['img_shape']

        if self.keep_ratio:
            new_w, new_h = mmcv.rescale_size((img_w, img_h), self.scale)
        else:
            new_w, new_h = self.scale

        self.scale_factor = np.array([new_w / img_w, new_h / img_h],
                                     dtype=np.float32)

        results['img_shape'] = (new_h, new_w)
        results['keep_ratio'] = self.keep_ratio
        results['scale_factor'] = results['scale_factor'] * self.scale_factor

        if not self.lazy:
            results['imgs'] = [
                mmcv.imresize(img, (new_w, new_h),
                              interpolation=self.interpolation)
                for img in results['imgs']
            ]

            for idx in range(len(results['detections'])):
                cur_detections = results['detections'][idx]
                cur_detections[:, 0::2] = np.clip(
                    cur_detections[:, 0::2] * self.scale_factor[0], 0, new_w)
                cur_detections[:, 1::2] = np.clip(
                    cur_detections[:, 1::2] * self.scale_factor[1], 0, new_h)
                results['detections'][idx] = cur_detections
        else:
            raise NotImplementedError

        return results
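The detection update above multiplies x coordinates by `new_w / img_w` and y coordinates by `new_h / img_h`, then clips them to the resized frame. A tiny numeric sketch with a hypothetical box (not part of the original code):

import numpy as np

# One hypothetical (x1, y1, x2, y2) box in a 640x480 (w x h) frame resized to 341x256.
det = np.array([[100., 50., 600., 400.]])
w_ratio, h_ratio = 341 / 640, 256 / 480
det[:, 0::2] = np.clip(det[:, 0::2] * w_ratio, 0, 341)
det[:, 1::2] = np.clip(det[:, 1::2] * h_ratio, 0, 256)
print(det)  # roughly [[53.3, 26.7, 319.7, 213.3]]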
Example #8
    def rescale(self, scale, interpolation='nearest'):
        """Rescale masks as large as possible while keeping the aspect ratio.
        For details, refer to `mmcv.imrescale`.

        Args:
            scale (tuple[int]): the maximum size (h, w) of rescaled mask
            interpolation (str): same as :func:`mmcv.imrescale`

        Returns:
            BitmapMasks: the rescaled masks
        """
        if len(self.masks) == 0:
            new_w, new_h = mmcv.rescale_size((self.width, self.height), scale)
            rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8)
        else:
            rescaled_masks = np.stack([
                mmcv.imrescale(mask, scale, interpolation=interpolation)
                for mask in self.masks
            ])
        height, width = rescaled_masks.shape[1:]
        return BitmapMasks(rescaled_masks, height, width)
Example #9
    def __call__(self, results):
        """Performs the Resize augmentation.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """

        _init_lazy_if_proper(results, self.lazy)

        if 'scale_factor' not in results:
            results['scale_factor'] = np.array([1, 1], dtype=np.float32)
        img_h, img_w = results['img_shape']

        if self.keep_ratio:
            new_w, new_h = mmcv.rescale_size((img_w, img_h), self.scale)
        else:
            new_w, new_h = self.scale

        self.scale_factor = np.array([new_w / img_w, new_h / img_h],
                                     dtype=np.float32)

        results['img_shape'] = (new_h, new_w)
        results['keep_ratio'] = self.keep_ratio
        results['scale_factor'] = results['scale_factor'] * self.scale_factor

        if not self.lazy:
            results['imgs'] = [
                mmcv.imresize(img, (new_w, new_h),
                              interpolation=self.interpolation)
                for img in results['imgs']
            ]
        else:
            lazyop = results['lazy']
            if lazyop['flip']:
                raise NotImplementedError('Put Flip at last for now')
            lazyop['interpolation'] = self.interpolation

        return results
Example #10
    def __call__(self, frames, proposals):
        frame_w, frame_h = frames[0].shape[1], frames[0].shape[0]
        new_w, new_h = mmcv.rescale_size((frame_w, frame_h), (256, np.Inf))
        w_ratio, h_ratio = new_w / frame_w, new_h / frame_h
        frames = [mmcv.imresize(img, (new_w, new_h)) for img in frames]
        _ = [mmcv.imnormalize_(frame, **self.img_norm_cfg) for frame in frames]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(frames).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(self.device)

        proposal = proposals[len(proposals) // 2]
        proposal = torch.from_numpy(proposal[:, :4]).to(self.device)
        if proposal.shape[0] == 0:
            return None

        proposal[:, 0:4:2] *= w_ratio
        proposal[:, 1:4:2] *= h_ratio
        with torch.no_grad():
            result = self.model(return_loss=False,
                                img=[input_tensor],
                                img_metas=[[dict(img_shape=(new_h, new_w))]],
                                proposals=[[proposal]])
        return self.post_proce(result, proposal)
Example #11
    def test_rescale_size(self):
        new_size, scale_factor = mmcv.rescale_size((400, 300), 1.5, True)
        assert new_size == (600, 450) and scale_factor == 1.5
        new_size, scale_factor = mmcv.rescale_size((400, 300), 0.934, True)
        assert new_size == (374, 280) and scale_factor == 0.934

        new_size = mmcv.rescale_size((400, 300), 1.5)
        assert new_size == (600, 450)
        new_size = mmcv.rescale_size((400, 300), 0.934)
        assert new_size == (374, 280)

        new_size, scale_factor = mmcv.rescale_size((400, 300), (1000, 600),
                                                   True)
        assert new_size == (800, 600) and scale_factor == 2.0
        new_size, scale_factor = mmcv.rescale_size((400, 300), (180, 200),
                                                   True)
        assert new_size == (200, 150) and scale_factor == 0.5

        new_size = mmcv.rescale_size((400, 300), (1000, 600))
        assert new_size == (800, 600)
        new_size = mmcv.rescale_size((400, 300), (180, 200))
        assert new_size == (200, 150)

        with pytest.raises(ValueError):
            mmcv.rescale_size((400, 300), -0.5)
        with pytest.raises(TypeError):
            mmcv.rescale_size((400, 300), [100, 100])
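A minimal sketch of the tuple-scale rule that the assertions above exercise (an illustrative approximation, not mmcv's own implementation): the scale factor is the smaller of the long-edge and short-edge bounds.

def approx_rescale_size(old_size, scale):
    # Sketch of the tuple-scale behaviour checked above; not mmcv's own code.
    w, h = old_size
    max_long_edge, max_short_edge = max(scale), min(scale)
    scale_factor = min(max_long_edge / max(w, h), max_short_edge / min(w, h))
    return int(w * scale_factor + 0.5), int(h * scale_factor + 0.5)

assert approx_rescale_size((400, 300), (1000, 600)) == (800, 600)  # factor 2.0
assert approx_rescale_size((400, 300), (180, 200)) == (200, 150)   # factor 0.5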
Example #12
def main():
    args = parse_args()

    # frame_paths, original_frames = frame_extraction(args.video)

    video_pathes = os.listdir(args.video)
    # frame_paths = sorted([osp.join(osp.join(args.video, video_base_path), x) for video_base_path in video_pathes for x in os.listdir(osp.join(args.video, video_base_path)) ])

    # single folder
    # video_path=args.video
    # frame_paths = sorted([osp.join(video_path, x) for x in os.listdir(video_path)])

    for video_base_path in video_pathes:
        video_path = osp.join(args.video, video_base_path)
        frame_paths = sorted(
            [osp.join(video_path, x) for x in os.listdir(video_path)])

        # original_frames = []
        # for x in os.listdir(video_path):
        #     frame=cv2.imread(osp.join(video_path, x))
        #     original_frames.append(frame)

        # num_frame = len(frame_paths)
        frame = cv2.imread(frame_paths[0])
        h, w, _ = frame.shape

        # Load label_map
        # label_map = load_label_map(args.label_map)

        # resize frames so the short side is 1800
        new_w, new_h = mmcv.rescale_size((w, h), (1800, np.Inf))
        # frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
        w_ratio, h_ratio = new_w / w, new_h / h

        human_detections = detection_inference(args, frame_paths)
        for i in range(len(human_detections)):
            det = human_detections[i]
            det[:, 0:4:2] *= w_ratio
            det[:, 1:4:2] *= h_ratio
            human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

        results_total = []
        for human_detection in human_detections:
            human_detection[:, 0::2] /= new_w
            human_detection[:, 1::2] /= new_h
            results = []
            for prop in human_detection:
                results.append((prop.data.cpu().numpy()))
            results_total.append(results)

        # xml
        target_dir = osp.join('./tmp',
                              osp.basename(osp.splitext(video_path)[0]))
        os.makedirs(target_dir, exist_ok=True)
        for frame_path, anno in zip(frame_paths, results_total):
            output_name = os.path.join(target_dir,
                                       os.path.basename(frame_path))
            annotation = create_tree(frame_path)  # assuming create_tree returns the root <annotation> element
            scale_ratio = np.array([w, h, w, h])
            if anno is None:
                continue
            for ann in anno:
                box = ann
                box = (box * scale_ratio).astype(np.int64)
                label = "person"
                left, top, right, bottom = box.astype(float)
                create_object(annotation, label, left, top, right, bottom)

            tree = ET.ElementTree(annotation)
            root = tree.getroot()  # get the root Element
            pretty_xml(root, '\t', '\n')  # prettify the XML in place
            tree.write('%s.xml' % osp.splitext(output_name)[0], encoding="utf-8")
Example #13
def main():
    args = parse_args()

    frame_paths, original_frames = frame_extraction(args.video)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
    frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    config.merge_from_dict(args.cfg_options)
    val_pipeline = config.data.val.pipeline

    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Load label_map
    label_map = load_label_map(args.label_map)
    try:
        if config['data']['train']['custom_classes'] is not None:
            label_map = {
                id + 1: label_map[cls]
                for id, cls in enumerate(config['data']['train']
                                         ['custom_classes'])
            }
    except KeyError:
        pass

    # Get Human detection results
    center_frames = [frame_paths[ind - 1] for ind in timestamps]
    human_detections = detection_inference(args, center_frames)
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    # Get img_norm_cfg
    img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    try:
        # In our spatiotemporal detection demo, different actions should have
        # the same number of bboxes.
        config['model']['test_cfg']['rcnn']['action_thr'] = .0
    except KeyError:
        pass

    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))

    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    assert len(timestamps) == len(human_detections)
    prog_bar = mmcv.ProgressBar(len(timestamps))
    for timestamp, proposal in zip(timestamps, human_detections):
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = model(return_loss=False,
                           img=[input_tensor],
                           img_metas=[[dict(img_shape=(new_h, new_w))]],
                           proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(len(result)):
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1], result[i][j,
                                                                          4]))
            predictions.append(prediction)
        prog_bar.update()

    results = []
    for human_detection, prediction in zip(human_detections, predictions):
        results.append(pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        return new_frame_inds.astype(np.int64)

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    frames = [
        cv2.imread(frame_paths[i - 1])
        for i in dense_timestamps(timestamps, dense_n)
    ]
    print('Performing visualization')
    vis_frames = visualize(frames, results)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)

    tmp_frame_dir = osp.dirname(frame_paths[0])
    shutil.rmtree(tmp_frame_dir)
Example #14
    def __init__(self,
                 config,
                 display_height=0,
                 display_width=0,
                 input_video=0,
                 predict_stepsize=40,
                 output_fps=25,
                 clip_vis_length=8,
                 out_filename=None,
                 show=True,
                 stdet_input_shortside=256):
        # stdet sampling strategy
        val_pipeline = config['val_pipeline']
        sampler = [x for x in val_pipeline
                   if x['type'] == 'SampleAVAFrames'][0]
        clip_len, frame_interval = sampler['clip_len'], sampler[
            'frame_interval']
        self.window_size = clip_len * frame_interval

        # asserts
        assert (out_filename or show), \
            'out_filename and show cannot both be None'
        assert clip_len % 2 == 0, 'We would like to have an even clip_len'
        assert clip_vis_length <= predict_stepsize
        assert 0 < predict_stepsize <= self.window_size

        # source params
        try:
            self.cap = cv2.VideoCapture(int(input_video))
            self.webcam = True
        except ValueError:
            self.cap = cv2.VideoCapture(input_video)
            self.webcam = False
        assert self.cap.isOpened()

        # stdet input preprocessing params
        h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.stdet_input_size = mmcv.rescale_size(
            (w, h), (stdet_input_shortside, np.Inf))
        img_norm_cfg = config['img_norm_cfg']
        if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
            to_bgr = img_norm_cfg.pop('to_bgr')
            img_norm_cfg['to_rgb'] = to_bgr
        img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
        img_norm_cfg['std'] = np.array(img_norm_cfg['std'])
        self.img_norm_cfg = img_norm_cfg

        # task init params
        self.clip_vis_length = clip_vis_length
        self.predict_stepsize = predict_stepsize
        self.buffer_size = self.window_size - self.predict_stepsize
        frame_start = self.window_size // 2 - (clip_len // 2) * frame_interval
        self.frames_inds = [
            frame_start + frame_interval * i for i in range(clip_len)
        ]
        self.buffer = []
        self.processed_buffer = []

        # output/display params
        if display_height > 0 and display_width > 0:
            self.display_size = (display_width, display_height)
        elif display_height > 0 or display_width > 0:
            self.display_size = mmcv.rescale_size(
                (w, h), (np.Inf, max(display_height, display_width)))
        else:
            self.display_size = (w, h)
        self.ratio = tuple(
            n / o for n, o in zip(self.stdet_input_size, self.display_size))
        if output_fps <= 0:
            self.output_fps = int(self.cap.get(cv2.CAP_PROP_FPS))
        else:
            self.output_fps = output_fps
        self.show = show
        self.video_writer = None
        if out_filename is not None:
            self.video_writer = self.get_output_video_writer(out_filename)
        display_start_idx = self.window_size // 2 - self.predict_stepsize // 2
        self.display_inds = [
            display_start_idx + i for i in range(self.predict_stepsize)
        ]

        # display multi-threading params
        self.display_id = -1  # task.id for display queue
        self.display_queue = {}
        self.display_lock = threading.Lock()
        self.output_lock = threading.Lock()

        # read multi-threading params
        self.read_id = -1  # task.id for read queue
        self.read_id_lock = threading.Lock()
        self.read_queue = queue.Queue()
        self.read_lock = threading.Lock()
        self.not_end = True  # cap.read() flag

        # program state
        self.stopped = False

        atexit.register(self.clean)
Example #15
def main():
    args = parse_args()

    # frame_paths, original_frames = frame_extraction(args.video)
    #folder path
    video_path = args.video
    frame_paths = sorted([osp.join(video_path, x) for x in os.listdir(video_path)])

    num_frame = len(frame_paths)
    # h, w, _ = original_frames[0].shape
    frame = cv2.imread(frame_paths[0])
    h, w, _ = frame.shape

    # Load label_map
    label_map = load_label_map(args.label_map)

    # resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
    # frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    val_pipeline = config['val_pipeline']
    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    # if num_frame < clip_len * frame_interval:
    #         frame_interval=max(int(num_frame/clip_len)-1,0)
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Get Human detection results
    center_frames = [frame_paths[ind - 1] for ind in timestamps]
    human_detections = detection_inference(args, center_frames)
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    # Get img_norm_cfg
    img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))

    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    for timestamp, proposal in tqdm(zip(timestamps, human_detections)):
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        imgs = [mmcv.imresize(cv2.imread(frame_paths[ind]), (new_w, new_h)).astype(np.float32) for ind in frame_inds]
        # imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = model(
                return_loss=False,
                img=[input_tensor],
                img_metas=[[dict(img_shape=(new_h, new_w))]],
                proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(len(result)):
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1], result[i][j,
                                                                          4]))
            predictions.append(prediction)

    results = []
    for human_detection, prediction in zip(human_detections, predictions):
        results.append(pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        return new_frame_inds.astype(np.int64)

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    frames = [
        cv2.imread(frame_paths[i - 1])
        for i in dense_timestamps(timestamps, dense_n)
    ]
    print('Performing visualization')
    vis_frames = visualize(frames, results)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)
    #save image
    target_dir = osp.join('./tmp/test')
    os.makedirs(target_dir, exist_ok=True)
    frame_tmpl = osp.join(target_dir, 'img_%06d.jpg')
    vid.write_images_sequence(frame_tmpl, fps=args.output_fps)
Example #16
def main():
    args = parse_args()

    frame_paths, original_frames = frame_extraction(args.video)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # Get Human detection results and pose results
    human_detections = detection_inference(args, frame_paths)
    pose_results = None
    if args.use_skeleton_recog or args.use_skeleton_stdet:
        pose_results = pose_inference(args, frame_paths, human_detections)

    # resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
    frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Load spatio-temporal detection label_map
    stdet_label_map = load_label_map(args.label_map_stdet)
    rgb_stdet_config = mmcv.Config.fromfile(args.rgb_stdet_config)
    rgb_stdet_config.merge_from_dict(args.cfg_options)
    try:
        if rgb_stdet_config['data']['train']['custom_classes'] is not None:
            stdet_label_map = {
                id + 1: stdet_label_map[cls]
                for id, cls in enumerate(rgb_stdet_config['data']['train']
                                         ['custom_classes'])
            }
    except KeyError:
        pass

    action_result = None
    if args.use_skeleton_recog:
        print('Use skeleton-based recognition')
        action_result = skeleton_based_action_recognition(
            args, pose_results, num_frame, h, w)
    else:
        print('Use rgb-based recognition')
        action_result = rgb_based_action_recognition(args)

    stdet_preds = None
    if args.use_skeleton_stdet:
        print('Use skeleton-based SpatioTemporal Action Detection')
        clip_len, frame_interval = 30, 1
        timestamps, stdet_preds = skeleton_based_stdet(args, stdet_label_map,
                                                       human_detections,
                                                       pose_results, num_frame,
                                                       clip_len,
                                                       frame_interval, h, w)
        for i in range(len(human_detections)):
            det = human_detections[i]
            det[:, 0:4:2] *= w_ratio
            det[:, 1:4:2] *= h_ratio
            human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    else:
        print('Use rgb-based SpatioTemporal Action Detection')
        for i in range(len(human_detections)):
            det = human_detections[i]
            det[:, 0:4:2] *= w_ratio
            det[:, 1:4:2] *= h_ratio
            human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
        timestamps, stdet_preds = rgb_based_stdet(args, frames,
                                                  stdet_label_map,
                                                  human_detections, w, h,
                                                  new_w, new_h, w_ratio,
                                                  h_ratio)

    stdet_results = []
    for timestamp, prediction in zip(timestamps, stdet_preds):
        human_detection = human_detections[timestamp - 1]
        stdet_results.append(
            pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        return new_frame_inds.astype(np.int64)

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    output_timestamps = dense_timestamps(timestamps, dense_n)
    frames = [
        cv2.imread(frame_paths[timestamp - 1])
        for timestamp in output_timestamps
    ]

    print('Performing visualization')
    pose_model = init_pose_model(args.pose_config, args.pose_checkpoint,
                                 args.device)

    if args.use_skeleton_recog or args.use_skeleton_stdet:
        pose_results = [
            pose_results[timestamp - 1] for timestamp in output_timestamps
        ]

    vis_frames = visualize(frames, stdet_results, pose_results, action_result,
                           pose_model)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)

    tmp_frame_dir = osp.dirname(frame_paths[0])
    shutil.rmtree(tmp_frame_dir)