def read_data(opt, video_name, transform, video_utils):
    """Decode one video with decord and pack it into an MXNet NDArray.

    Parameters
    ----------
    opt : namespace-like object
        Supplies ``new_width``, ``new_height``, ``new_length``, ``new_step``,
        ``video_loader``, ``slowfast``, ``num_segments``, ``num_crop`` and
        ``input_size``. ``opt.skip_length`` is overwritten as a side effect.
    video_name : str
        Path handed to ``decord.VideoReader``.
    transform : callable
        Applied to the list of frames returned by the loader.
    video_utils : object
        Provides ``_sample_test_indices`` and the two decord loader methods.

    Returns
    -------
    NDArray built from the clip, laid out as
    ``(clips, 3, frames, input_size, input_size)``; the frame axis is squeezed
    away when ``opt.new_length == 1``.

    Raises
    ------
    RuntimeError
        If ``opt.video_loader`` is falsy.
    """
    reader = try_import_decord().VideoReader(
        video_name, width=opt.new_width, height=opt.new_height)
    total_frames = len(reader)

    opt.skip_length = opt.new_length * opt.new_step
    segment_indices, skip_offsets = video_utils._sample_test_indices(total_frames)

    if not opt.video_loader:
        raise RuntimeError('We only support video-based inference.')

    if opt.slowfast:
        load_clip = video_utils._video_TSN_decord_slowfast_loader
    else:
        load_clip = video_utils._video_TSN_decord_batch_loader
    frames = transform(
        load_clip(video_name, reader, total_frames, segment_indices, skip_offsets))

    # Number of frames that make up one clip along the temporal axis.
    if opt.slowfast:
        frames_per_clip = len(frames) // (opt.num_segments * opt.num_crop)
    else:
        frames_per_clip = opt.new_length

    batch = np.stack(frames, axis=0)
    batch = batch.reshape(
        (-1, frames_per_clip, 3, opt.input_size, opt.input_size))
    # Move channels in front of the frame axis.
    batch = np.transpose(batch, (0, 2, 1, 3, 4))

    if opt.new_length == 1:
        # 2D-input case: drop the singleton frame axis.
        batch = np.squeeze(batch, axis=2)

    return nd.array(batch)
def test_action_recognition(model):
    """Assert that the ONNX export of ``model`` matches its Gluon predictions.

    2D models are fed a single center-cropped test image; 3D models are fed a
    clip sampled from the first 64 frames of a test video. In both cases the
    Gluon output and the ONNX Runtime output must agree to 3 decimals.
    """
    onnx_session = rt.InferenceSession(model.onnx_model, None)
    input_name = onnx_session.get_inputs()[0].name

    if not model.is_3D():
        image_path = os.path.join('test_imgs', 'ThrowDiscus.png')
        img = mx.image.imread(image_path)
        crop_size = (model.input_shape[1], model.input_shape[2])
        img, _ = mx.image.center_crop(img, size=crop_size)
        img = img.expand_dims(0).astype('float32')

        gluon_result = model.predict(img)
        onnx_result = onnx_session.run([], {input_name: img.asnumpy()})[0]
        assert_almost_equal(gluon_result.asnumpy(), onnx_result, decimal=3)
        return

    from gluoncv.utils.filesystem import try_import_decord
    decord = try_import_decord()
    video_path = os.path.join('test_videos', 'abseiling_k400.mp4')
    vr = decord.VideoReader(video_path)

    # Every other frame for all models; SlowFast additionally appends the
    # sparsely sampled slow-pathway frames.
    frame_ids = list(range(0, 64, 2))
    if 'slowfast' in model.model_name:
        slow_stride = 16 if '4x16' in model.model_name else 8
        frame_ids = frame_ids + list(range(0, 64, slow_stride))
    num_frames = len(frame_ids)

    video_data = vr.get_batch(frame_ids).asnumpy()
    frames = [video_data[pos, :, :, :] for pos in range(num_frames)]

    transform_fn = video.VideoGroupValTransform(size=model.input_shape[3],
                                                mean=[0.485, 0.456, 0.406],
                                                std=[0.229, 0.224, 0.225])
    clip_input = np.stack(transform_fn(frames), axis=0)
    clip_input = clip_input.reshape(
        (-1, num_frames, 3, model.input_shape[3], model.input_shape[4]))
    # Move channels in front of the frame axis.
    clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

    gluon_result = model.predict(nd.array(clip_input)).asnumpy()
    onnx_result = onnx_session.run(
        [], {input_name: clip_input.astype('float32')})[0]
    assert_almost_equal(gluon_result, onnx_result, decimal=3)
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model

################################################################
# Next, we download the video and extract a 64-frame clip from it.
# Note that SlowFast has two branches, and each one needs its own input.
# The fast branch needs more frames, so we sample every other frame (stride=2).
# The slow branch needs fewer frames, so we sample every 16th frame (stride=16).
# This gives 32 frames for the fast branch and 4 frames for the slow branch.
# Hence, the final input to the whole network is a clip of 36 frames.

from gluoncv.utils.filesystem import try_import_decord
decord = try_import_decord()

url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4'
video_fname = utils.download(url)
vr = decord.VideoReader(video_fname)

fast_frame_id_list = range(0, 64, 2)
slow_frame_id_list = range(0, 64, 16)
# Fast-branch frames come first, followed by the slow-branch frames.
frame_id_list = [*fast_frame_id_list, *slow_frame_id_list]

video_data = vr.get_batch(frame_id_list).asnumpy()
clip_input = [video_data[vid, :, :, :] for vid in range(len(frame_id_list))]

################################################################
# Now we define transformations for the video clip.
# This transformation function does three things:
# center crop each image to 224x224 in size,
# transpose it to ``num_channels*num_frames*height*width``,
def __init__(
        self,
        root_bgs=os.path.expanduser(
            '/media/hp/data/BGSDecom/FrameDifference/bgs'),
        root_fgs=os.path.expanduser(
            '/media/hp/data/BGSDecom/FrameDifference/fgs'),
        setting=os.path.expanduser(
            '/home/hp/.mxnet/datasets/ucf101/ucfTrainTestlist/ucf101_train_split_2_rawframes.txt'
        ),
        train=True,
        test_mode=False,
        name_pattern='img_%05d.jpg',
        video_ext='mp4',
        is_color=True,
        modality='rgb',
        num_segments_bgs=1,
        num_segments_fgs=1,
        new_length_bgs=1,
        new_length_fgs=5,
        new_step_bgs=1,
        new_step_fgs=1,
        new_width=340,
        new_height=256,
        target_width=224,
        target_height=224,
        temporal_jitter=False,
        video_loader=False,
        use_decord=False,
        transform=None):
    """Configure the two-stream UCF101 dataset and index its clips.

    Most arguments are stored unchanged as same-named attributes. Sampling
    parameters exist once per stream, with ``_bgs`` / ``_fgs`` suffixes
    (presumably background / foreground, judging by the default paths).

    Notes
    -----
    * The default ``root_bgs``, ``root_fgs`` and ``setting`` values are
      absolute paths specific to one machine; pass your own.
    * ``skip_length_bgs`` / ``skip_length_fgs`` are derived as
      ``new_length * new_step`` for the respective stream.
    * When ``video_loader`` is set, decord or mmcv is imported depending on
      ``use_decord``.
    * If ``name_pattern`` is falsy, a pattern is chosen from ``modality``
      (``"rgb"`` or ``"flow"``); for any other modality no pattern is set.

    Raises
    ------
    RuntimeError
        If ``_make_dataset`` yields no clips.
    """
    super(UCF101_2stream, self).__init__()

    from gluoncv.utils.filesystem import try_import_cv2, try_import_decord, try_import_mmcv
    self.cv2 = try_import_cv2()

    # Data locations and mode flags.
    self.root_bgs, self.root_fgs = root_bgs, root_fgs
    self.setting = setting
    self.train = train
    self.test_mode = test_mode
    self.is_color = is_color
    self.modality = modality

    # Per-stream temporal sampling.
    self.num_segments_bgs = num_segments_bgs
    self.num_segments_fgs = num_segments_fgs
    self.new_length_bgs = new_length_bgs
    self.new_length_fgs = new_length_fgs
    self.new_step_bgs = new_step_bgs
    self.new_step_fgs = new_step_fgs
    self.skip_length_bgs = self.new_length_bgs * self.new_step_bgs
    self.skip_length_fgs = self.new_length_fgs * self.new_step_fgs
    self.temporal_jitter = temporal_jitter

    # Spatial sizes and the transform.
    self.new_height, self.new_width = new_height, new_width
    self.target_height, self.target_width = target_height, target_width
    self.transform = transform

    # Video decoding backend; only imported when videos are read directly.
    self.video_loader = video_loader
    self.video_ext = video_ext
    self.use_decord = use_decord
    if self.video_loader and self.use_decord:
        self.decord = try_import_decord()
    elif self.video_loader:
        self.mmcv = try_import_mmcv()

    self.clips = self._make_dataset(root_bgs, root_fgs, setting)
    if len(self.clips) == 0:
        raise RuntimeError("Found 0 video clips in subfolders of: " + root_bgs + "\n"
                           "Check your data directory (opt.data-dir).")

    if name_pattern:
        self.name_pattern = name_pattern
    elif self.modality == "rgb":
        self.name_pattern = "img_%05d.jpg"
    elif self.modality == "flow":
        self.name_pattern = "flow_%s_%05d.jpg"
def read_video_data(s3_video_path, num_frames=32):
    """Download a video from S3 and preprocess it into a model-ready clip.

    The object is downloaded to a uniquely named file under ``/tmp/`` and
    described in a one-line video list file. ``VideoClsCustom`` is built on
    that list with ``num_frames`` as the clip length; its test-time index
    sampler and decord loader then read the frames. Each frame is center-cropped to 224x224
    and normalised with the ImageNet mean and standard deviation, then stacked
    into ``(clips, 3, num_frames, 224, 224)``.

    Parameters
    ----------
    s3_video_path : str
        ``s3://bucket/key`` style location; parsed by ``get_bucket_and_key``.
    num_frames : int, optional
        Clip length passed to ``VideoClsCustom`` as ``new_length``.

    Returns
    -------
    NDArray holding the preprocessed clip. When ``num_frames == 1`` the frame
    axis is squeezed away (2D-input case).

    Raises
    ------
    RuntimeError
        If video-based loading is disabled via the ``video_loader`` constant.

    Notes
    -----
    Both temporary files are removed in a ``finally`` block, so they do not
    pile up in ``/tmp/`` when downloading, decoding or preprocessing fails.
    """
    print('read and preprocess video data here ')

    # Constants
    data_dir = '/tmp/'
    num_segments = 1
    num_crop = 1  # only read on the slowfast path; was undefined there before
    new_length = num_frames
    new_step = 1
    use_decord = True
    video_loader = True
    slowfast = False

    # Preprocessing params (ImageNet statistics).
    input_size = 224
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    s3_client = boto3.client('s3')

    # Flatten the S3 path into a single file name. Whitespace is dropped as
    # well because the video list file below is whitespace-delimited.
    flat_name = s3_video_path.replace('s3://', '').replace('S3://', '')
    flat_name = ''.join(flat_name.replace('/', '').split())

    # Download straight to a unique path so that concurrent calls for the same
    # object cannot overwrite each other's file.
    stem, ext = os.path.splitext('/tmp/' + flat_name)
    download_path = stem + str(uuid.uuid4()) + ext
    video_list_path = '/tmp/video_list' + str(uuid.uuid4()) + '.txt'

    bucket, key = get_bucket_and_key(s3_video_path)

    try:
        s3_client.download_file(bucket, key, download_path)

        # Dummy duration and label accompany the video path.
        video_list = '{} {} {}'.format(download_path, 10, 1)
        with open(video_list_path, 'w') as fopen:
            fopen.write(video_list)

        transform = video.VideoGroupValTransform(size=input_size,
                                                 mean=mean,
                                                 std=std)
        video_utils = VideoClsCustom(root=data_dir,
                                     setting=video_list_path,
                                     num_segments=num_segments,
                                     new_length=new_length,
                                     new_step=new_step,
                                     video_loader=video_loader,
                                     use_decord=use_decord,
                                     slowfast=slowfast)

        video_name = download_path
        decord = try_import_decord()
        decord_vr = decord.VideoReader(video_name)
        duration = len(decord_vr)

        segment_indices, skip_offsets = video_utils._sample_test_indices(duration)

        if not video_loader:
            raise RuntimeError('We only support video-based inference.')
        if slowfast:
            clip_input = video_utils._video_TSN_decord_slowfast_loader(
                video_name, decord_vr, duration, segment_indices, skip_offsets)
        else:
            clip_input = video_utils._video_TSN_decord_batch_loader(
                video_name, decord_vr, duration, segment_indices, skip_offsets)

        clip_input = transform(clip_input)

        # Number of frames that make up one clip along the temporal axis.
        if slowfast:
            frames_per_clip = len(clip_input) // (num_segments * num_crop)
        else:
            frames_per_clip = new_length

        clip_input = np.stack(clip_input, axis=0)
        clip_input = clip_input.reshape(
            (-1, frames_per_clip, 3, input_size, input_size))
        # Move channels in front of the frame axis.
        clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

        if new_length == 1:
            # 2D-input case: drop the singleton frame axis.
            clip_input = np.squeeze(clip_input, axis=2)

        return nd.array(clip_input)
    finally:
        # Cleanup temp files. Existence is checked so that a failure before a
        # file was created does not mask the original exception.
        for tmp_path in (download_path, video_list_path):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def __init__(self,
             root,
             setting,
             train=True,
             test_mode=False,
             name_pattern='img_%05d.jpg',
             video_ext='mp4',
             is_color=True,
             modality='rgb',
             num_segments=1,
             num_crop=1,
             new_length=1,
             new_step=1,
             new_width=340,
             new_height=256,
             target_width=224,
             target_height=224,
             temporal_jitter=False,
             video_loader=False,
             use_decord=False,
             slowfast=False,
             slow_temporal_stride=16,
             fast_temporal_stride=2,
             data_aug='v1',
             lazy_init=False,
             transform=None):
    """Store the dataset configuration and, unless lazy, index the clips.

    Every argument is kept as a same-named attribute. In addition,
    ``skip_length`` is derived as ``new_length * new_step``.

    Parameters
    ----------
    root, setting
        Forwarded to ``_make_dataset`` to build ``self.clips``.
    slowfast : bool
        Enables the SlowFast configuration checks below.
    slow_temporal_stride, fast_temporal_stride : int
        With ``slowfast`` set, the slow stride must be a multiple of the fast
        stride.
    video_loader, use_decord : bool
        When ``video_loader`` is set, decord is imported if ``use_decord`` is
        true, otherwise mmcv.
    lazy_init : bool
        When true, ``_make_dataset`` is not called and ``self.clips`` is not
        set here.

    Raises
    ------
    AssertionError
        With ``slowfast`` set: if the strides are incompatible, if
        ``temporal_jitter`` is enabled, or if ``new_step != 1``.
    RuntimeError
        If ``_make_dataset`` yields no clips.
    """
    super(VideoClsCustom, self).__init__()

    from gluoncv.utils.filesystem import try_import_cv2, try_import_decord, try_import_mmcv
    self.cv2 = try_import_cv2()
    self.root = root
    self.setting = setting
    self.train = train
    self.test_mode = test_mode
    self.is_color = is_color
    self.modality = modality
    self.num_segments = num_segments
    self.num_crop = num_crop
    self.new_height = new_height
    self.new_width = new_width
    self.new_length = new_length
    self.new_step = new_step
    self.skip_length = self.new_length * self.new_step
    self.target_height = target_height
    self.target_width = target_width
    self.transform = transform
    self.temporal_jitter = temporal_jitter
    self.name_pattern = name_pattern
    self.video_loader = video_loader
    self.video_ext = video_ext
    self.use_decord = use_decord
    self.slowfast = slowfast
    self.slow_temporal_stride = slow_temporal_stride
    self.fast_temporal_stride = fast_temporal_stride
    self.data_aug = data_aug
    self.lazy_init = lazy_init

    if self.slowfast:
        # The message previously named slow_temporal_stride twice; the
        # constraint is relative to fast_temporal_stride.
        assert slow_temporal_stride % fast_temporal_stride == 0, 'slow_temporal_stride needs to be multiples of fast_temporal_stride, please set it accordingly.'
        assert not temporal_jitter, 'Slowfast dataloader does not support temporal jitter. Please set temporal_jitter=False.'
        assert new_step == 1, 'Slowfast dataloader only support consecutive frames reading, please set new_step=1.'

    # The decoding backend is imported only when videos are read directly.
    if self.video_loader:
        if self.use_decord:
            self.decord = try_import_decord()
        else:
            self.mmcv = try_import_mmcv()

    if not self.lazy_init:
        self.clips = self._make_dataset(root, setting)
        if len(self.clips) == 0:
            raise RuntimeError("Found 0 video clips in subfolders of: " + root + "\n"
                               "Check your data directory (opt.data-dir).")
def get_action_recognition(video_obj, model_arch="slowfast_4x16_resnet50_kinetics400"):
    """Classify the action in a video and return the top-5 predictions.

    Frames are sampled from the first 64 frames of the video, center-cropped
    and normalised with ``VideoGroupValTransform``, and fed to a pretrained
    model from the model zoo.

    Parameters
    ----------
    video_obj : str
        URL or local path of the video. It is first passed to
        ``utils.download``; if that raises ``ValueError`` the value is used
        as given (presumably a local path).
    model_arch : str, optional
        Model zoo name. Names containing ``"slowfast"`` get the additional
        slow-pathway frames; names containing ``"inceptionv3"`` use a 299
        crop instead of 224.

    Returns
    -------
    pandas.DataFrame
        Five rows with columns ``"class"`` and ``"prob"`` (softmax
        probability), in the order returned by ``nd.topk``.
    """
    decord = try_import_decord()
    net = get_model(model_arch, pretrained=True)

    try:
        video_obj = utils.download(video_obj)
    except ValueError:
        # Deliberate best effort: fall through and open video_obj as given.
        pass
    vr = decord.VideoReader(video_obj)

    # Every other frame for all models. SlowFast additionally appends the
    # sparsely sampled slow-pathway frames; the slow stride follows the model
    # name (16 for "4x16" variants, otherwise 8) rather than always being 16.
    frame_id_list = list(range(0, 64, 2))
    if "slowfast" in model_arch:
        slow_stride = 16 if "4x16" in model_arch else 8
        frame_id_list += list(range(0, 64, slow_stride))
    num_frames = len(frame_id_list)

    video_data = vr.get_batch(frame_id_list).asnumpy()
    clip_input = [video_data[vid, :, :, :] for vid in range(num_frames)]

    crop_size = 299 if "inceptionv3" in model_arch else 224
    transform_fn = video.VideoGroupValTransform(size=crop_size,
                                                mean=[0.485, 0.456, 0.406],
                                                std=[0.229, 0.224, 0.225])
    clip_input = np.stack(transform_fn(clip_input), axis=0)

    # Add the leading clip axis using the frame count and per-frame shape of
    # the data itself instead of hard-coded sizes, so the reshape cannot
    # disagree with what the transform produced.
    clip_input = clip_input.reshape((-1, num_frames) + clip_input.shape[1:])
    # Move channels in front of the frame axis.
    clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

    pred = net(nd.array(clip_input))
    classes = net.classes

    top_k = 5
    ind = nd.topk(pred, k=top_k)[0].astype('int')
    # Softmax is computed once instead of once per reported class.
    probs = nd.softmax(pred)[0]
    rows = [[classes[ind[i].asscalar()], probs[ind[i]].asscalar()]
            for i in range(top_k)]
    return pd.DataFrame(rows, columns=["class", "prob"])