Example #1
import numpy as np
from mxnet import nd
from gluoncv.utils.filesystem import try_import_decord

def read_data(opt, video_name, transform, video_utils):

    decord = try_import_decord()
    decord_vr = decord.VideoReader(video_name, width=opt.new_width, height=opt.new_height)
    duration = len(decord_vr)

    opt.skip_length = opt.new_length * opt.new_step
    segment_indices, skip_offsets = video_utils._sample_test_indices(duration)

    if opt.video_loader:
        if opt.slowfast:
            clip_input = video_utils._video_TSN_decord_slowfast_loader(video_name, decord_vr, duration, segment_indices, skip_offsets)
        else:
            clip_input = video_utils._video_TSN_decord_batch_loader(video_name, decord_vr, duration, segment_indices, skip_offsets)
    else:
        raise RuntimeError('We only support video-based inference.')

    clip_input = transform(clip_input)

    if opt.slowfast:
        sparse_samples = len(clip_input) // (opt.num_segments * opt.num_crop)
        clip_input = np.stack(clip_input, axis=0)
        clip_input = clip_input.reshape((-1,) + (sparse_samples, 3, opt.input_size, opt.input_size))
        clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
    else:
        clip_input = np.stack(clip_input, axis=0)
        clip_input = clip_input.reshape((-1,) + (opt.new_length, 3, opt.input_size, opt.input_size))
        clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

    if opt.new_length == 1:
        clip_input = np.squeeze(clip_input, axis=2)    # squeeze the temporal axis for the 2D input case

    return nd.array(clip_input)
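
# A usage sketch (assumed; not part of the original excerpt). `opt` can be any
# namespace carrying the fields read_data() touches, and video_utils a
# VideoClsCustom instance; lazy_init=True skips the dataset scan so no setting
# file is required. The .mp4 path is a placeholder.
from types import SimpleNamespace

from gluoncv.data import VideoClsCustom
from gluoncv.data.transforms import video

opt = SimpleNamespace(new_width=340, new_height=256, new_length=32, new_step=1,
                      video_loader=True, slowfast=False, num_segments=1,
                      num_crop=1, input_size=224)
transform = video.VideoGroupValTransform(size=opt.input_size,
                                         mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
video_utils = VideoClsCustom(root='', setting='', num_segments=opt.num_segments,
                             new_length=opt.new_length, new_step=opt.new_step,
                             video_loader=True, use_decord=True, lazy_init=True)
clip = read_data(opt, 'abseiling_k400.mp4', transform, video_utils)
print(clip.shape)  # (1, 3, 32, 224, 224)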
Example #2
import os
import numpy as np
import mxnet as mx
import onnxruntime as rt
from mxnet import nd
from numpy.testing import assert_almost_equal
from gluoncv.data.transforms import video

def test_action_recognition(model):
    onnx_session = rt.InferenceSession(model.onnx_model, None)
    input_name = onnx_session.get_inputs()[0].name

    if not model.is_3D():
        test_img_file = os.path.join('test_imgs', 'ThrowDiscus.png')
        img = mx.image.imread(test_img_file)
        img, _ = mx.image.center_crop(img,
                                      size=(model.input_shape[1],
                                            model.input_shape[2]))
        img = img.expand_dims(0).astype('float32')

        gluon_result = model.predict(img)
        onnx_result = onnx_session.run([], {input_name: img.asnumpy()})[0]
        assert_almost_equal(gluon_result.asnumpy(), onnx_result, decimal=3)
    else:
        from gluoncv.utils.filesystem import try_import_decord
        decord = try_import_decord()
        video_fname = os.path.join('test_videos', 'abseiling_k400.mp4')
        vr = decord.VideoReader(video_fname)
        frame_id_list = None
        if 'slowfast' in model.model_name:
            fast_frame_id_list = range(0, 64, 2)
            slow_frame_id_list = range(
                0, 64, 16) if '4x16' in model.model_name else range(0, 64, 8)
            frame_id_list = list(fast_frame_id_list) + list(slow_frame_id_list)
        else:
            frame_id_list = list(range(0, 64, 2))
        num_frames = len(frame_id_list)
        video_data = vr.get_batch(frame_id_list).asnumpy()
        clip_input = [
            video_data[vid, :, :, :] for vid, _ in enumerate(frame_id_list)
        ]
        transform_fn = video.VideoGroupValTransform(size=model.input_shape[3],
                                                    mean=[0.485, 0.456, 0.406],
                                                    std=[0.229, 0.224, 0.225])
        clip_input = transform_fn(clip_input)
        clip_input = np.stack(clip_input, axis=0)
        clip_input = clip_input.reshape((-1, ) +
                                        (num_frames, 3, model.input_shape[3],
                                         model.input_shape[4]))
        clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

        gluon_result = model.predict(nd.array(clip_input)).asnumpy()
        onnx_result = onnx_session.run(
            [], {input_name: clip_input.astype('float32')})[0]
        assert_almost_equal(gluon_result, onnx_result, decimal=3)
Example #3
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model

################################################################
# Then, we download the video and extract a 64-frame clip from it.
# Note that SlowFast has two branches, which require different inputs.
# The fast branch needs more frames, so we sample every other frame (stride=2).
# The slow branch needs fewer frames, so we sample every 16th frame (stride=16).
# In the end, we have 32 frames as the input to the fast branch and 4 frames to the slow branch.
# Hence, the final input to the whole network is a clip of 36 frames.

from gluoncv.utils.filesystem import try_import_decord
decord = try_import_decord()

url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4'
video_fname = utils.download(url)
vr = decord.VideoReader(video_fname)
fast_frame_id_list = range(0, 64, 2)
slow_frame_id_list = range(0, 64, 16)
frame_id_list = list(fast_frame_id_list) + list(slow_frame_id_list)
video_data = vr.get_batch(frame_id_list).asnumpy()
clip_input = [video_data[vid, :, :, :] for vid, _ in enumerate(frame_id_list)]

################################################################
# Now we define transformations for the video clip.
# This transformation function does three things:
# center crop each image to 224x224 in size,
# transpose it to ``num_channels*num_frames*height*width``,
# and normalize with mean and standard deviation calculated across all ImageNet images.
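
################################################################
# A sketch of the step the excerpt truncates here, reconstructed from the
# matching examples above (VideoGroupValTransform with ImageNet statistics,
# then a reshape to batch x channels x frames x height x width for the
# 36-frame SlowFast clip):

import numpy as np

transform_fn = video.VideoGroupValTransform(size=224,
                                            mean=[0.485, 0.456, 0.406],
                                            std=[0.229, 0.224, 0.225])
clip_input = transform_fn(clip_input)
clip_input = np.stack(clip_input, axis=0)
clip_input = clip_input.reshape((-1,) + (36, 3, 224, 224))
clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))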
Example #4
    def __init__(
            self,
            root_bgs=os.path.expanduser(
                '/media/hp/data/BGSDecom/FrameDifference/bgs'),
            root_fgs=os.path.expanduser(
                '/media/hp/data/BGSDecom/FrameDifference/fgs'),
            setting=os.path.expanduser(
                '/home/hp/.mxnet/datasets/ucf101/ucfTrainTestlist/ucf101_train_split_2_rawframes.txt'),
            train=True,
            test_mode=False,
            name_pattern='img_%05d.jpg',
            video_ext='mp4',
            is_color=True,
            modality='rgb',
            num_segments_bgs=1,
            num_segments_fgs=1,
            new_length_bgs=1,
            new_length_fgs=5,
            new_step_bgs=1,
            new_step_fgs=1,
            new_width=340,
            new_height=256,
            target_width=224,
            target_height=224,
            temporal_jitter=False,
            video_loader=False,
            use_decord=False,
            transform=None):

        super(UCF101_2stream, self).__init__()

        from gluoncv.utils.filesystem import try_import_cv2, try_import_decord, try_import_mmcv
        self.cv2 = try_import_cv2()
        self.root_bgs = root_bgs
        self.root_fgs = root_fgs
        self.setting = setting
        self.train = train
        self.test_mode = test_mode
        self.is_color = is_color
        self.modality = modality
        self.num_segments_bgs = num_segments_bgs
        self.num_segments_fgs = num_segments_fgs
        self.new_height = new_height
        self.new_width = new_width
        self.new_length_fgs = new_length_fgs
        self.new_length_bgs = new_length_bgs
        self.new_step_bgs = new_step_bgs
        self.new_step_fgs = new_step_fgs
        self.skip_length_bgs = self.new_length_bgs * self.new_step_bgs
        self.skip_length_fgs = self.new_length_fgs * self.new_step_fgs
        self.target_height = target_height
        self.target_width = target_width
        self.transform = transform
        self.temporal_jitter = temporal_jitter
        self.video_loader = video_loader
        self.video_ext = video_ext
        self.use_decord = use_decord

        if self.video_loader:
            if self.use_decord:
                self.decord = try_import_decord()
            else:
                self.mmcv = try_import_mmcv()

#        self.classes, self.class_to_idx = self._find_classes(root)
        self.clips = self._make_dataset(root_bgs, root_fgs, setting)
        if len(self.clips) == 0:
            raise (RuntimeError("Found 0 video clips in subfolders of: " +
                                root_bgs + "\n"
                                "Check your data directory (opt.data-dir)."))

        if name_pattern:
            self.name_pattern = name_pattern
        else:
            if self.modality == "rgb":
                self.name_pattern = "img_%05d.jpg"
            elif self.modality == "flow":
                self.name_pattern = "flow_%s_%05d.jpg"
Example #5
import os
import uuid

import boto3
import numpy as np
from mxnet import nd
from gluoncv.data import VideoClsCustom
from gluoncv.data.transforms import video
from gluoncv.utils.filesystem import try_import_decord

# Note: get_bucket_and_key() is a small helper defined elsewhere in the same script.
def read_video_data(s3_video_path, num_frames=32):
    """Read and preprocess video data from the S3 bucket."""
    print('Reading and preprocessing video data')
    s3_client = boto3.client('s3')
    # Flatten the S3 URI into a single local filename under /tmp
    fname = s3_video_path.replace('s3://', '')
    fname = fname.replace('S3://', '')
    fname = fname.replace('/', '')
    download_path = '/tmp/' + fname
    video_list_path = '/tmp/video_list' + str(uuid.uuid4()) + '.txt'
    bucket, key = get_bucket_and_key(s3_video_path)
    s3_client.download_file(bucket, key, download_path)

    # Rename the download so the local path is unique per invocation
    filename, ext = os.path.splitext(download_path)  # save the file extension
    filename = filename + str(uuid.uuid4())
    os.rename(download_path, filename + ext)
    download_path = filename + ext

    # Write a one-line video list: path, dummy duration, dummy label
    video_list = '{} {} {}'.format(download_path, 10, 1)
    with open(video_list_path, 'w') as fopen:
        fopen.write(video_list)

    # Constants
    data_dir = '/tmp/'
    num_segments = 1
    num_crop = 1  # needed by the slowfast branch below
    new_length = num_frames
    new_step = 1
    use_decord = True
    video_loader = True
    slowfast = False
    # Preprocessing params.
    # The transformation function does three things: center crop the image to
    # 224x224 in size, transpose it to (num_channels, num_frames, height, width),
    # and normalize with mean and standard deviation calculated across all
    # ImageNet images.
    # Use the general GluonCV dataloader VideoClsCustom to load the data with
    # num_frames = 32 as the clip length.
    input_size = 224
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    transform = video.VideoGroupValTransform(size=input_size,
                                             mean=mean,
                                             std=std)
    video_utils = VideoClsCustom(root=data_dir,
                                 setting=video_list_path,
                                 num_segments=num_segments,
                                 new_length=new_length,
                                 new_step=new_step,
                                 video_loader=video_loader,
                                 use_decord=use_decord,
                                 slowfast=slowfast)

    # Read the video path back from the one-line video list
    video_name = video_list.split()[0]

    decord = try_import_decord()
    decord_vr = decord.VideoReader(video_name)
    duration = len(decord_vr)

    skip_length = new_length * new_step
    segment_indices, skip_offsets = video_utils._sample_test_indices(duration)

    if video_loader:
        if slowfast:
            clip_input = video_utils._video_TSN_decord_slowfast_loader(
                video_name, decord_vr, duration, segment_indices, skip_offsets)
        else:
            clip_input = video_utils._video_TSN_decord_batch_loader(
                video_name, decord_vr, duration, segment_indices, skip_offsets)
    else:
        raise RuntimeError('We only support video-based inference.')

    clip_input = transform(clip_input)

    if slowfast:
        sparse_samples = len(clip_input) // (num_segments * num_crop)
        clip_input = np.stack(clip_input, axis=0)
        clip_input = clip_input.reshape((-1, ) + (sparse_samples, 3,
                                                  input_size, input_size))
        clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
    else:
        clip_input = np.stack(clip_input, axis=0)
        clip_input = clip_input.reshape((-1, ) + (new_length, 3, input_size,
                                                  input_size))
        clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

    if new_length == 1:
        clip_input = np.squeeze(clip_input,
                                axis=2)  # squeeze the temporal axis for the 2D input case

    clip_input = nd.array(clip_input)

    # Clean up temp files
    os.remove(download_path)
    os.remove(video_list_path)

    return clip_input
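
# Usage sketch (assumed, not from the original): the bucket and key are
# placeholders and the call requires valid AWS credentials. The returned
# NDArray has shape (1, 3, 32, 224, 224), ready for a 32-frame 3D model.
clip = read_video_data('s3://my-bucket/abseiling_k400.mp4', num_frames=32)
print(clip.shape)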
Example #6
    def __init__(self,
                 root,
                 setting,
                 train=True,
                 test_mode=False,
                 name_pattern='img_%05d.jpg',
                 video_ext='mp4',
                 is_color=True,
                 modality='rgb',
                 num_segments=1,
                 num_crop=1,
                 new_length=1,
                 new_step=1,
                 new_width=340,
                 new_height=256,
                 target_width=224,
                 target_height=224,
                 temporal_jitter=False,
                 video_loader=False,
                 use_decord=False,
                 slowfast=False,
                 slow_temporal_stride=16,
                 fast_temporal_stride=2,
                 data_aug='v1',
                 lazy_init=False,
                 transform=None):

        super(VideoClsCustom, self).__init__()

        from gluoncv.utils.filesystem import try_import_cv2, try_import_decord, try_import_mmcv
        self.cv2 = try_import_cv2()
        self.root = root
        self.setting = setting
        self.train = train
        self.test_mode = test_mode
        self.is_color = is_color
        self.modality = modality
        self.num_segments = num_segments
        self.num_crop = num_crop
        self.new_height = new_height
        self.new_width = new_width
        self.new_length = new_length
        self.new_step = new_step
        self.skip_length = self.new_length * self.new_step
        self.target_height = target_height
        self.target_width = target_width
        self.transform = transform
        self.temporal_jitter = temporal_jitter
        self.name_pattern = name_pattern
        self.video_loader = video_loader
        self.video_ext = video_ext
        self.use_decord = use_decord
        self.slowfast = slowfast
        self.slow_temporal_stride = slow_temporal_stride
        self.fast_temporal_stride = fast_temporal_stride
        self.data_aug = data_aug
        self.lazy_init = lazy_init

        if self.slowfast:
            assert slow_temporal_stride % fast_temporal_stride == 0, 'slow_temporal_stride needs to be a multiple of fast_temporal_stride, please set it accordingly.'
            assert not temporal_jitter, 'Slowfast dataloader does not support temporal jitter. Please set temporal_jitter=False.'
            assert new_step == 1, 'Slowfast dataloader only supports consecutive frame reading, please set new_step=1.'

        if self.video_loader:
            if self.use_decord:
                self.decord = try_import_decord()
            else:
                self.mmcv = try_import_mmcv()

        if not self.lazy_init:
            self.clips = self._make_dataset(root, setting)
            if len(self.clips) == 0:
                raise (
                    RuntimeError("Found 0 video clips in subfolders of: " +
                                 root + "\n"
                                 "Check your data directory (opt.data-dir)."))
Example #7
import numpy as np
import pandas as pd
from mxnet import nd
from gluoncv import utils
from gluoncv.data.transforms import video
from gluoncv.model_zoo import get_model
from gluoncv.utils.filesystem import try_import_decord

def get_action_recognition(video_obj,
                           model_arch="slowfast_4x16_resnet50_kinetics400"):
    '''Run action recognition on a video (local path or URL) and return the
    top-5 classes and probabilities as a pandas DataFrame.'''
    decord = try_import_decord()

    net = get_model(model_arch, pretrained=True)

    try:
        # utils.download raises a ValueError subclass for non-URL inputs,
        # in which case video_obj is treated as a local path
        video_obj = utils.download(video_obj)
    except ValueError:
        pass

    vr = decord.VideoReader(video_obj)

    if "slowfast" in model_arch:
        fast_frame_id_list = range(0, 64, 2)
        slow_frame_id_list = range(0, 64, 16)
        frame_id_list = list(fast_frame_id_list) + list(slow_frame_id_list)
    else:
        frame_id_list = range(0, 64, 2)

    print("=========Reached here============")

    video_data = vr.get_batch(frame_id_list).asnumpy()
    clip_input = [
        video_data[vid, :, :, :] for vid, _ in enumerate(frame_id_list)
    ]

    if "inceptionv3" in model_arch:
        transform_fn = video.VideoGroupValTransform(size=299,
                                                    mean=[0.485, 0.456, 0.406],
                                                    std=[0.229, 0.224, 0.225])
        clip_input = transform_fn(clip_input)
        clip_input = np.stack(clip_input, axis=0)
        if "slowfast" in model_arch:
            clip_input = clip_input.reshape((-1, ) + (36, 3, 340, 450))
        else:
            clip_input = clip_input.reshape((-1, ) + (32, 3, 340, 450))
        clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
    else:
        transform_fn = video.VideoGroupValTransform(size=224,
                                                    mean=[0.485, 0.456, 0.406],
                                                    std=[0.229, 0.224, 0.225])
        clip_input = transform_fn(clip_input)
        clip_input = np.stack(clip_input, axis=0)
        if "slowfast" in model_arch:
            clip_input = clip_input.reshape((-1, ) + (36, 3, 224, 224))
        else:
            clip_input = clip_input.reshape((-1, ) + (32, 3, 224, 224))
        clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

    pred = net(nd.array(clip_input))

    classes = net.classes
    topK = 5
    ind = nd.topk(pred, k=topK)[0].astype('int')
    resList = []

    for i in range(topK):
        resList.append([
            classes[ind[i].asscalar()],
            nd.softmax(pred)[0][ind[i]].asscalar()
        ])

    resDF = pd.DataFrame(resList, columns=["class", "prob"])
    return resDF
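
# Usage sketch: the URL is the same clip used in Example #3; any local path or
# downloadable URL should work, though the pretrained weights are fetched on
# first use.
df = get_action_recognition(
    'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4',
    model_arch='slowfast_4x16_resnet50_kinetics400')
print(df)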