def _preprocess(self, frames: np.ndarray, clip_len: int) -> np.ndarray:
    """Apply all preprocess operations and transformations to raw frames.

    Parameters
    ----------
    frames: np.ndarray
        Numpy array of shape (N, L, H, W, C), where N is the number of
        clips, L is the number of frames each clip has, H and W are the
        height and width of each frame and C are image channels
    clip_len: int
        Number of frames a model input must have

    Returns
    -------
    clips_input: np.ndarray
        Preprocessed numpy array with all input frames, laid out as
        (num_clips, C, clip_len, side, side) where side is
        ``self.FRAME_SIDE_SIZE``.
    """
    transform_fn = video_tranforms.VideoGroupValTransform(
        size=self.FRAME_SIDE_SIZE,
        mean=self.IMAGENET_MEAN,
        std=self.IMAGENET_SD)
    clips_input = transform_fn(frames)
    clips_input = np.stack(clips_input, axis=0)
    # BUGFIX: the reshape previously hard-coded 224x224 even though the
    # transform above crops to self.FRAME_SIDE_SIZE; use the configured
    # side length so the two stay consistent for any crop size.
    side = self.FRAME_SIDE_SIZE
    clips_input = clips_input.reshape((-1, clip_len, 3, side, side))
    # (N, L, C, H, W) -> (N, C, L, H, W): channels-first over time, the
    # layout the downstream model consumes.
    clips_input = np.transpose(clips_input, (0, 2, 1, 3, 4))
    return clips_input
def getModelForApp():
    """Build the pretrained feature-extraction network configured by the
    module-level ``opt`` dict.

    Returns
    -------
    tuple
        (net, transform_test, context, model_name): the network, its
        validation transform, the MXNet context it lives on, and the
        model name.
    """
    print(opt)
    # Collect garbage aggressively to free CPU memory during model setup.
    gc.set_threshold(100, 5, 5)
    if not os.path.exists(opt['save_dir']):
        os.makedirs(opt['save_dir'])

    # Device selection.
    context = mx.gpu(opt['gpu_id'])

    # ImageNet normalization statistics for the validation-time transform.
    norm_mean = [0.485, 0.456, 0.406]
    norm_std = [0.229, 0.224, 0.225]
    transform_test = video.VideoGroupValTransform(
        size=opt['input_size'], mean=norm_mean, std=norm_std)
    opt['num_crop'] = 1

    # Instantiate the model in feature-extraction mode (feat_ext=True).
    model_name = opt['model']
    net = get_model(name=model_name,
                    nclass=opt['num_classes'],
                    pretrained=opt['use_pretrained'],
                    feat_ext=True,
                    num_segments=opt['num_segments'],
                    num_crop=opt['num_crop'])
    net.cast(opt['dtype'])
    net.collect_params().reset_ctx(context)
    print('Pre-trained model is successfully loaded from the model zoo.')
    print("Successfully built model {}".format(model_name))
    return net, transform_test, context, model_name
def get_data_loader(opt, batch_size, num_workers, logger, kvstore=None):
    """Build train/val DataLoaders (TSN-style) for Kinetics400 or UCF101.

    Parameters
    ----------
    opt : argparse.Namespace
        Parsed options; supplies paths, crop/scale settings and dataset
        choice read below.
    batch_size : int
        Total batch size used by both loaders.
    num_workers : int
        Number of data-loading worker processes.
    logger : logging.Logger
        Destination for progress and error messages.
    kvstore : optional
        Distributed kvstore; when given each worker reads only its own
        shard via ``SplitSampler``.

    Returns
    -------
    tuple
        (train_data, val_data, batch_fn) — the two DataLoaders plus a
        helper that splits a batch across devices.
    """
    data_dir = opt.data_dir
    val_data_dir = opt.val_data_dir
    scale_ratios = [float(i) for i in opt.scale_ratios.split(',')]
    input_size = opt.input_size

    def batch_fn(batch, ctx):
        # Split (data, label) across the given contexts. The multiplier
        # keeps all TSN segments of one sample on the same device.
        if opt.num_segments > 1:
            data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0,
                                  even_split=False, multiplier=opt.num_segments)
        else:
            data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0,
                                  even_split=False)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0,
                               even_split=False)
        return data, label

    # ImageNet mean/std normalization: multi-scale crop for training,
    # center crop for validation.
    transform_train = video.VideoGroupTrainTransform(
        size=(input_size, input_size), scale_ratios=scale_ratios,
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transform_test = video.VideoGroupValTransform(
        size=input_size,
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    if opt.dataset == 'kinetics400':
        train_dataset = kinetics400.classification.Kinetics400(
            setting=opt.train_list, root=data_dir, train=True,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            num_segments=opt.num_segments, transform=transform_train)
        val_dataset = kinetics400.classification.Kinetics400(
            setting=opt.val_list, root=val_data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            num_segments=opt.num_segments, transform=transform_test)
    elif opt.dataset == 'ucf101':
        train_dataset = ucf101.classification.UCF101(
            setting=opt.train_list, root=data_dir, train=True,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length,
            target_width=input_size, target_height=input_size,
            num_segments=opt.num_segments, transform=transform_train)
        val_dataset = ucf101.classification.UCF101(
            setting=opt.val_list, root=data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length,
            target_width=input_size, target_height=input_size,
            num_segments=opt.num_segments, transform=transform_test)
    else:
        # NOTE(review): falls through with the datasets undefined; the
        # len() calls below would then raise NameError.
        logger.info('Dataset %s is not supported yet.' % (opt.dataset))

    logger.info('Load %d training samples and %d validation samples.' %
                (len(train_dataset), len(val_dataset)))

    if kvstore is not None:
        # Distributed training: each worker gets a disjoint shard.
        train_data = gluon.data.DataLoader(
            train_dataset, batch_size=batch_size, num_workers=num_workers,
            sampler=SplitSampler(len(train_dataset),
                                 num_parts=kvstore.num_workers,
                                 part_index=kvstore.rank),
            batchify_fn=tsn_mp_batchify_fn,
            prefetch=int(opt.prefetch_ratio * num_workers),
            last_batch='rollover')
        val_data = gluon.data.DataLoader(
            val_dataset, batch_size=batch_size, num_workers=num_workers,
            sampler=SplitSampler(len(val_dataset),
                                 num_parts=kvstore.num_workers,
                                 part_index=kvstore.rank),
            batchify_fn=tsn_mp_batchify_fn,
            prefetch=int(opt.prefetch_ratio * num_workers),
            last_batch='discard')
    else:
        # Single-machine training: shuffle train, keep val order.
        train_data = gluon.data.DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True,
            num_workers=num_workers, batchify_fn=tsn_mp_batchify_fn,
            prefetch=int(opt.prefetch_ratio * num_workers),
            last_batch='rollover')
        val_data = gluon.data.DataLoader(
            val_dataset, batch_size=batch_size, shuffle=False,
            num_workers=num_workers, batchify_fn=tsn_mp_batchify_fn,
            prefetch=int(opt.prefetch_ratio * num_workers),
            last_batch='discard')

    return train_data, val_data, batch_fn
def get_action(self, net):
    """Classify the buffered frames with ``net`` and return the top-1 label.

    Parameters
    ----------
    net : gluon network
        Pretrained action-recognition model exposing a ``classes`` list.

    Returns
    -------
    str or None
        Predicted class name, or None when fewer than
        ``self.SAMPLE_DURATION`` frames have been buffered.
    """
    if len(self.frames) < self.SAMPLE_DURATION:
        return None
    clip_input = self.frames
    # Center-crop to 224x224 and normalize with ImageNet statistics.
    transform_fn = video.VideoGroupValTransform(size=224,
                                                mean=[0.485, 0.456, 0.406],
                                                std=[0.229, 0.224, 0.225])
    clip_input = transform_fn(clip_input)
    print("INFO: action input shape:")
    print([clip.shape for clip in clip_input])
    clip_input = np.stack(clip_input, axis=0)
    # BUGFIX: the clip length was hard-coded to 32 even though the guard
    # above uses self.SAMPLE_DURATION; keep the two consistent so any
    # configured duration reshapes correctly.
    clip_input = clip_input.reshape((-1, self.SAMPLE_DURATION, 3, 224, 224))
    # (N, L, C, H, W) -> (N, C, L, H, W), the layout video models expect.
    clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
    pred = net(nd.array(clip_input))
    classes = net.classes
    topK = 1
    ind = nd.topk(pred, k=topK)[0].astype('int')
    return classes[ind[0].asscalar()]
if opt.resume_params is not '': net.load_parameters(opt.resume_params, ctx=ctx) if opt.use_pretrained: net.features_3d.load_parameters(opt.pretrained_ECOfeature3d,ctx=ctx,allow_missing=True) net.output.load_parameters(opt.pretrained_ECOoutput,ctx=ctx,allow_missing=True) logger.info('use pretrained model : %s , %s',opt.pretrained_ECOfeature3d,opt.pretrained_ECOoutput) if opt.use_mult: net.collect_params(opt.freeze_patterns).setattr('lr_mult',opt.freeze_lr_mult) logger.info(net) net.collect_params().reset_ctx(ctx) transform_train = video.VideoGroupTrainTransform(size=(opt.input_size, opt.input_size), scale_ratios=[1.0, 0.875, 0.75, 0.66], mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) transform_test = video.VideoGroupValTransform(size=opt.input_size, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Batch Size for Each GPU per_device_batch_size = opt.per_device_batch_size # Number of data loader workers num_workers = opt.num_workers # Calculate effective total batch size batch_size = per_device_batch_size * num_gpus # Set train=True for training data. Here we only use a subset of UCF101 for demonstration purpose. # The subset has 101 training samples, one sample per class. train_dataset = UCF101(setting=opt.train_setting, root=opt.train_dir, train=True, new_width=opt.new_width, new_height=opt.new_height, new_length=opt.new_length,new_step=opt.new_step, target_width=opt.input_size, target_height=opt.input_size,
def main():
    """Evaluate a (pretrained) video action-recognition model on a benchmark
    dataset and print top-1/top-5 accuracy."""
    opt = parse_args()
    print(opt)

    # Garbage collection, default threshold is (700, 10, 10).
    # Set threshold lower to collect garbage more frequently and release
    # more CPU memory for heavy data loading.
    gc.set_threshold(100, 5, 5)

    # set env
    num_gpus = opt.num_gpus
    batch_size = opt.batch_size
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers
    print('Total batch size is set to %d on %d GPUs' % (batch_size, num_gpus))

    # get data: evaluation transform and the matching spatial-crop count
    if opt.ten_crop:
        transform_test = transforms.Compose([
            video.VideoTenCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        opt.num_crop = 10
    elif opt.three_crop:
        transform_test = transforms.Compose([
            video.VideoThreeCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        opt.num_crop = 3
    else:
        transform_test = video.VideoGroupValTransform(
            size=opt.input_size,
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225])
        opt.num_crop = 1

    # get model
    if opt.use_pretrained and len(opt.hashtag) > 0:
        opt.use_pretrained = opt.hashtag
    classes = opt.num_classes
    model_name = opt.model
    net = get_model(name=model_name, nclass=classes,
                    pretrained=opt.use_pretrained,
                    num_segments=opt.num_segments,
                    num_crop=opt.num_crop)
    net.cast(opt.dtype)
    net.collect_params().reset_ctx(context)
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    # BUGFIX: was `opt.resume_params is not ''` — identity comparison with a
    # string literal (SyntaxWarning on Python >= 3.8, works only by accident
    # of interning). Use value inequality.
    if opt.resume_params != '' and not opt.use_pretrained:
        net.load_parameters(opt.resume_params, ctx=context)
        print('Pre-trained model %s is successfully loaded.'
              % (opt.resume_params))
    else:
        print('Pre-trained model is successfully loaded from the model zoo.')

    # build the validation dataset for the requested benchmark
    if opt.dataset == 'ucf101':
        val_dataset = UCF101(
            setting=opt.val_list, root=opt.data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length,
            target_width=opt.input_size, target_height=opt.input_size,
            test_mode=True, num_segments=opt.num_segments,
            transform=transform_test)
    elif opt.dataset == 'kinetics400':
        val_dataset = Kinetics400(
            setting=opt.val_list, root=opt.data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=opt.input_size, target_height=opt.input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            test_mode=True, num_segments=opt.num_segments,
            transform=transform_test)
    elif opt.dataset == 'somethingsomethingv2':
        val_dataset = SomethingSomethingV2(
            setting=opt.val_list, root=opt.data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=opt.input_size, target_height=opt.input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            num_segments=opt.num_segments, transform=transform_test)
    elif opt.dataset == 'hmdb51':
        val_dataset = HMDB51(
            setting=opt.val_list, root=opt.data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=opt.input_size, target_height=opt.input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            num_segments=opt.num_segments, transform=transform_test)
    else:
        # NOTE(review): falls through with val_dataset undefined; the
        # DataLoader below will then raise NameError.
        logger.info('Dataset %s is not supported yet.' % (opt.dataset))

    val_data = gluon.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers,
        prefetch=int(opt.prefetch_ratio * num_workers),
        last_batch='discard')
    print('Load %d test samples in %d iterations.'
          % (len(val_dataset), len(val_data)))

    start_time = time.time()
    acc_top1_val, acc_top5_val = test(context, val_data, opt, net)
    end_time = time.time()

    print('Test accuracy: acc-top1=%f acc-top5=%f'
          % (acc_top1_val * 100, acc_top5_val * 100))
    print('Total evaluation time is %4.2f minutes'
          % ((end_time - start_time) / 60))
def get_data_loader(opt, batch_size, num_workers, logger, kvstore=None):
    """Build train/val DataLoaders for the configured dataset and data
    augmentation scheme (v1-v4).

    Parameters
    ----------
    opt : argparse.Namespace
        Parsed options; supplies paths, augmentation version, dataset
        choice and sampling settings read below.
    batch_size : int
        Total batch size used by both loaders.
    num_workers : int
        Number of data-loading worker processes.
    logger : logging.Logger
        Destination for progress and error messages.
    kvstore : optional
        Distributed kvstore; when given each worker reads only its own
        shard via ``ShuffleSplitSampler``.

    Returns
    -------
    tuple
        (train_data, val_data, batch_fn) — the two DataLoaders plus a
        helper that splits a batch across devices.
    """
    data_dir = opt.data_dir
    val_data_dir = opt.val_data_dir
    scale_ratios = [float(i) for i in opt.scale_ratios.split(',')]
    input_size = opt.input_size
    # ImageNet normalization statistics shared by all transforms below.
    default_mean = [0.485, 0.456, 0.406]
    default_std = [0.229, 0.224, 0.225]

    def batch_fn(batch, ctx):
        # Split (data, label) across the given contexts.
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0,
                              even_split=False)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0,
                               even_split=False)
        return data, label

    if opt.data_aug == 'v1':
        # GluonCV style, not keeping aspect ratio, multi-scale crop
        transform_train = video.VideoGroupTrainTransform(
            size=(input_size, input_size), scale_ratios=scale_ratios,
            mean=default_mean, std=default_std)
        transform_test = video.VideoGroupValTransform(size=input_size,
                                                      mean=default_mean,
                                                      std=default_std)
    elif opt.data_aug == 'v2':
        # GluonCV style, keeping aspect ratio, multi-scale crop, same as mmaction style
        transform_train = video.VideoGroupTrainTransformV2(
            size=(input_size, input_size), short_side=opt.new_height,
            scale_ratios=scale_ratios, mean=default_mean, std=default_std)
        transform_test = video.VideoGroupValTransformV2(
            crop_size=(input_size, input_size), short_side=opt.new_height,
            mean=default_mean, std=default_std)
    elif opt.data_aug == 'v3':
        # PySlowFast style, keeping aspect ratio, random short side scale jittering
        transform_train = video.VideoGroupTrainTransformV3(
            crop_size=(input_size, input_size), min_size=opt.new_height,
            max_size=opt.new_width, mean=default_mean, std=default_std)
        transform_test = video.VideoGroupValTransformV2(
            crop_size=(input_size, input_size), short_side=opt.new_height,
            mean=default_mean, std=default_std)
    elif opt.data_aug == 'v4':
        # mmaction style, keeping aspect ratio, random crop and resize,
        # only for SlowFast family models, similar to 'v3'
        transform_train = video.VideoGroupTrainTransformV4(
            size=(input_size, input_size),
            mean=default_mean, std=default_std)
        transform_test = video.VideoGroupValTransformV2(
            crop_size=(input_size, input_size), short_side=opt.new_height,
            mean=default_mean, std=default_std)
    else:
        # NOTE(review): falls through with the transforms undefined; the
        # dataset constructors below would then raise NameError.
        logger.info('Data augmentation %s is not supported yet.' % (opt.data_aug))

    if opt.dataset == 'kinetics400':
        train_dataset = Kinetics400(
            setting=opt.train_list, root=data_dir, train=True,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            slowfast=opt.slowfast,
            slow_temporal_stride=opt.slow_temporal_stride,
            fast_temporal_stride=opt.fast_temporal_stride,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_train)
        val_dataset = Kinetics400(
            setting=opt.val_list, root=val_data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            slowfast=opt.slowfast,
            slow_temporal_stride=opt.slow_temporal_stride,
            fast_temporal_stride=opt.fast_temporal_stride,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_test)
    elif opt.dataset == 'ucf101':
        train_dataset = UCF101(
            setting=opt.train_list, root=data_dir, train=True,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length,
            target_width=input_size, target_height=input_size,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_train)
        val_dataset = UCF101(
            setting=opt.val_list, root=data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length,
            target_width=input_size, target_height=input_size,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_test)
    elif opt.dataset == 'somethingsomethingv2':
        train_dataset = SomethingSomethingV2(
            setting=opt.train_list, root=data_dir, train=True,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_train)
        val_dataset = SomethingSomethingV2(
            setting=opt.val_list, root=data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_test)
    elif opt.dataset == 'hmdb51':
        train_dataset = HMDB51(
            setting=opt.train_list, root=data_dir, train=True,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_train)
        val_dataset = HMDB51(
            setting=opt.val_list, root=data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_test)
    elif opt.dataset == 'custom':
        train_dataset = VideoClsCustom(
            setting=opt.train_list, root=data_dir, train=True,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            slowfast=opt.slowfast,
            slow_temporal_stride=opt.slow_temporal_stride,
            fast_temporal_stride=opt.fast_temporal_stride,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_train)
        val_dataset = VideoClsCustom(
            setting=opt.val_list, root=val_data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=input_size, target_height=input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            slowfast=opt.slowfast,
            slow_temporal_stride=opt.slow_temporal_stride,
            fast_temporal_stride=opt.fast_temporal_stride,
            data_aug=opt.data_aug, num_segments=opt.num_segments,
            transform=transform_test)
    else:
        # NOTE(review): falls through with the datasets undefined; the
        # len() calls below would then raise NameError.
        logger.info('Dataset %s is not supported yet.' % (opt.dataset))

    logger.info('Load %d training samples and %d validation samples.' %
                (len(train_dataset), len(val_dataset)))

    if kvstore is not None:
        # Distributed training: each worker reads a disjoint, shuffled shard.
        train_data = gluon.data.DataLoader(
            train_dataset, batch_size=batch_size, num_workers=num_workers,
            sampler=ShuffleSplitSampler(len(train_dataset),
                                        num_parts=kvstore.num_workers,
                                        part_index=kvstore.rank),
            prefetch=int(opt.prefetch_ratio * num_workers),
            last_batch='rollover')
        val_data = gluon.data.DataLoader(
            val_dataset, batch_size=batch_size, num_workers=num_workers,
            sampler=ShuffleSplitSampler(len(val_dataset),
                                        num_parts=kvstore.num_workers,
                                        part_index=kvstore.rank),
            prefetch=int(opt.prefetch_ratio * num_workers),
            last_batch='discard')
    else:
        # Single-machine training: shuffle train, keep val order.
        train_data = gluon.data.DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True,
            num_workers=num_workers,
            prefetch=int(opt.prefetch_ratio * num_workers),
            last_batch='rollover')
        val_data = gluon.data.DataLoader(
            val_dataset, batch_size=batch_size, shuffle=False,
            num_workers=num_workers,
            prefetch=int(opt.prefetch_ratio * num_workers),
            last_batch='discard')

    return train_data, val_data, batch_fn
def main(logger):
    """Extract per-video features with a pretrained model and save each as a
    ``<model>_<video>_feat.npy`` file under ``opt.save_dir``.

    Parameters
    ----------
    logger : logging.Logger
        Logger used for all progress messages.
    """
    opt = parse_args()
    logger.info(opt)
    gc.set_threshold(100, 5, 5)

    if not os.path.exists(opt.save_dir):
        os.makedirs(opt.save_dir)

    # set env: -1 selects CPU, otherwise the given GPU id
    if opt.gpu_id == -1:
        context = mx.cpu()
    else:
        gpu_id = opt.gpu_id
        context = mx.gpu(gpu_id)

    # get data preprocess (ImageNet normalization statistics)
    image_norm_mean = [0.485, 0.456, 0.406]
    image_norm_std = [0.229, 0.224, 0.225]
    if opt.ten_crop:
        transform_test = transforms.Compose([
            video.VideoTenCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 10
    elif opt.three_crop:
        transform_test = transforms.Compose([
            video.VideoThreeCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 3
    else:
        transform_test = video.VideoGroupValTransform(size=opt.input_size,
                                                      mean=image_norm_mean,
                                                      std=image_norm_std)
        opt.num_crop = 1

    # get model (feat_ext=True returns features instead of class scores)
    if opt.use_pretrained and len(opt.hashtag) > 0:
        opt.use_pretrained = opt.hashtag
    classes = opt.num_classes
    model_name = opt.model
    net = get_model(name=model_name, nclass=classes,
                    pretrained=opt.use_pretrained,
                    feat_ext=True, num_segments=opt.num_segments,
                    num_crop=opt.num_crop)
    net.cast(opt.dtype)
    net.collect_params().reset_ctx(context)
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    if opt.resume_params != '' and not opt.use_pretrained:
        net.load_parameters(opt.resume_params, ctx=context)
        logger.info('Pre-trained model %s is successfully loaded.' %
                    (opt.resume_params))
    else:
        logger.info(
            'Pre-trained model is successfully loaded from the model zoo.')
    logger.info("Successfully built model {}".format(model_name))

    # get data
    # BUGFIX: the annotation file was opened and never closed; the context
    # manager guarantees the handle is released.
    anno_file = opt.data_list
    with open(anno_file, 'r') as f:
        data_list = f.readlines()
    logger.info('Load %d video samples.' % len(data_list))

    # build a pseudo dataset instance to use its children class methods
    video_utils = VideoClsCustom(root=opt.data_dir,
                                 setting=opt.data_list,
                                 num_segments=opt.num_segments,
                                 num_crop=opt.num_crop,
                                 new_length=opt.new_length,
                                 new_step=opt.new_step,
                                 new_width=opt.new_width,
                                 new_height=opt.new_height,
                                 video_loader=opt.video_loader,
                                 use_decord=opt.use_decord,
                                 slowfast=opt.slowfast,
                                 slow_temporal_stride=opt.slow_temporal_stride,
                                 fast_temporal_stride=opt.fast_temporal_stride,
                                 data_aug=opt.data_aug,
                                 lazy_init=True)

    start_time = time.time()
    for vid, vline in enumerate(data_list):
        video_path = vline.split()[0]
        video_name = video_path.split('/')[-1]
        if opt.need_root:
            video_path = os.path.join(opt.data_dir, video_path)
        video_data = read_data(opt, video_path, transform_test, video_utils)
        video_input = video_data.as_in_context(context)
        video_feat = net(video_input.astype(opt.dtype, copy=False))

        feat_file = '%s_%s_feat.npy' % (model_name, video_name)
        np.save(os.path.join(opt.save_dir, feat_file), video_feat.asnumpy())

        if vid > 0 and vid % opt.log_interval == 0:
            logger.info('%04d/%04d is done' % (vid, len(data_list)))
    end_time = time.time()
    logger.info('Total feature extraction time is %4.2f minutes' %
                ((end_time - start_time) / 60))
# Download the demo video and decode only the frames SlowFast needs.
video_fname = utils.download(url)
vr = decord.VideoReader(video_fname)
# Both pathways sample from the same 64-frame window: 32 frames at stride 2
# for the fast pathway and 4 frames at stride 16 for the slow pathway.
fast_frame_id_list = range(0, 64, 2)
slow_frame_id_list = range(0, 64, 16)
frame_id_list = list(fast_frame_id_list) + list(slow_frame_id_list)
video_data = vr.get_batch(frame_id_list).asnumpy()
# Split the batch into a list of (H, W, C) frames for the transform below.
clip_input = [video_data[vid, :, :, :] for vid, _ in enumerate(frame_id_list)]

################################################################
# Now we define transformations for the video clip.
# This transformation function does three things:
# center crop each image to 224x224 in size,
# transpose it to ``num_channels*num_frames*height*width``,
# and normalize with mean and standard deviation calculated across all ImageNet images.

transform_fn = video.VideoGroupValTransform(size=224,
                                            mean=[0.485, 0.456, 0.406],
                                            std=[0.229, 0.224, 0.225])
clip_input = transform_fn(clip_input)
clip_input = np.stack(clip_input, axis=0)
# 36 = 32 fast-pathway frames + 4 slow-pathway frames.
clip_input = clip_input.reshape((-1,) + (36, 3, 224, 224))
# (N, L, C, H, W) -> (N, C, L, H, W), the layout the model consumes.
clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
print('Video data is downloaded and preprocessed.')

################################################################
# Next, we load a pre-trained SlowFast model with ResNet50 as backbone.

model_name = 'slowfast_4x16_resnet50_kinetics400'
net = get_model(model_name, nclass=400, pretrained=True)
print('%s model is successfully loaded.' % model_name)

################################################################
# Finally, we prepare the video clip and feed it to the model.
def main(logger):
    """Run per-video inference over a list of videos, optionally saving
    logits/predictions, and log the predicted class for each video.

    Parameters
    ----------
    logger : logging.Logger
        Incoming logger; replaced below by the root logger configured with
        file + stream handlers (matching the original behavior).
    """
    opt = parse_args()

    makedirs(opt.save_dir)

    filehandler = logging.FileHandler(
        os.path.join(opt.save_dir, opt.logging_file))
    streamhandler = logging.StreamHandler()
    # NOTE(review): the `logger` parameter is deliberately shadowed by the
    # freshly configured root logger, as in the original code.
    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)
    logger.info(opt)

    gc.set_threshold(100, 5, 5)

    # set env
    gpu_id = opt.gpu_id
    context = mx.gpu(gpu_id)

    # get data preprocess (ImageNet normalization statistics)
    image_norm_mean = [0.485, 0.456, 0.406]
    image_norm_std = [0.229, 0.224, 0.225]
    if opt.ten_crop:
        transform_test = transforms.Compose([
            video.VideoTenCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 10
    elif opt.three_crop:
        transform_test = transforms.Compose([
            video.VideoThreeCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 3
    else:
        transform_test = video.VideoGroupValTransform(size=opt.input_size,
                                                      mean=image_norm_mean,
                                                      std=image_norm_std)
        opt.num_crop = 1

    # get model
    if opt.use_pretrained and len(opt.hashtag) > 0:
        opt.use_pretrained = opt.hashtag
    classes = opt.num_classes
    model_name = opt.model
    net = get_model(name=model_name, nclass=classes,
                    pretrained=opt.use_pretrained,
                    num_segments=opt.num_segments,
                    num_crop=opt.num_crop)
    net.cast(opt.dtype)
    net.collect_params().reset_ctx(context)
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    # BUGFIX: was `opt.resume_params is not ''` — identity comparison with a
    # string literal (SyntaxWarning on Python >= 3.8). Use value inequality.
    if opt.resume_params != '' and not opt.use_pretrained:
        net.load_parameters(opt.resume_params, ctx=context)
        logger.info('Pre-trained model %s is successfully loaded.' %
                    (opt.resume_params))
    else:
        logger.info(
            'Pre-trained model is successfully loaded from the model zoo.')
    logger.info("Successfully built model {}".format(model_name))

    # get data
    # BUGFIX: the annotation file was opened and never closed; the context
    # manager guarantees the handle is released.
    anno_file = opt.data_list
    with open(anno_file, 'r') as f:
        data_list = f.readlines()
    logger.info('Load %d video samples.' % len(data_list))

    start_time = time.time()
    for vid, vline in enumerate(data_list):
        video_path = vline.split()[0]
        video_name = video_path.split('/')[-1]
        if opt.need_root:
            video_path = os.path.join(opt.data_dir, video_path)
        video_data = read_data(opt, video_path, transform_test)
        video_input = video_data.as_in_context(context)
        pred = net(video_input.astype(opt.dtype, copy=False))

        if opt.save_logits:
            logits_file = '%s_%s_logits.npy' % (model_name, video_name)
            np.save(os.path.join(opt.save_dir, logits_file), pred.asnumpy())
        pred_label = np.argmax(pred.asnumpy())
        if opt.save_preds:
            preds_file = '%s_%s_preds.npy' % (model_name, video_name)
            np.save(os.path.join(opt.save_dir, preds_file), pred_label)

        logger.info('%04d/%04d: %s is predicted to class %d' %
                    (vid, len(data_list), video_name, pred_label))
    end_time = time.time()
    logger.info('Total inference time is %4.2f minutes' %
                ((end_time - start_time) / 60))
def main(logger):
    """Evaluate an FP32/INT8 or deployed (symbol-file) video model, with
    optional benchmarking and INT8 calibration side-paths.

    Parameters
    ----------
    logger : logging.Logger
        Logger used for error messages and calibration output.
    """
    opt = parse_args()
    print(opt)

    # Garbage collection, default threshold is (700, 10, 10).
    # Set threshold lower to collect garbage more frequently and release
    # more CPU memory for heavy data loading.
    gc.set_threshold(100, 5, 5)

    # set env
    num_gpus = opt.num_gpus
    batch_size = opt.batch_size
    context = [mx.cpu()]
    if num_gpus > 0:
        batch_size *= max(1, num_gpus)
        context = [mx.gpu(i) for i in range(num_gpus)]
    num_workers = opt.num_workers
    print('Total batch size is set to %d on %d GPUs' % (batch_size, num_gpus))

    # get data: evaluation transform and the matching spatial-crop count
    image_norm_mean = [0.485, 0.456, 0.406]
    image_norm_std = [0.229, 0.224, 0.225]
    if opt.ten_crop:
        transform_test = transforms.Compose([
            video.VideoTenCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 10
    elif opt.three_crop:
        transform_test = transforms.Compose([
            video.VideoThreeCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 3
    else:
        transform_test = video.VideoGroupValTransform(size=opt.input_size,
                                                      mean=image_norm_mean,
                                                      std=image_norm_std)
        opt.num_crop = 1

    if not opt.deploy:
        # get model
        if opt.use_pretrained and len(opt.hashtag) > 0:
            opt.use_pretrained = opt.hashtag
        classes = opt.num_classes
        model_name = opt.model
        # Currently, there is no hashtag for int8 models.
        if opt.quantized:
            model_name += '_int8'
            opt.use_pretrained = True

        net = get_model(name=model_name, nclass=classes,
                        pretrained=opt.use_pretrained,
                        num_segments=opt.num_segments,
                        num_crop=opt.num_crop)
        net.cast(opt.dtype)
        net.collect_params().reset_ctx(context)
        if opt.mode == 'hybrid':
            net.hybridize(static_alloc=True, static_shape=True)
        # BUGFIX: was `opt.resume_params is not ''` — identity comparison
        # with a string literal (SyntaxWarning on Python >= 3.8). Use value
        # inequality.
        if opt.resume_params != '' and not opt.use_pretrained:
            net.load_parameters(opt.resume_params, ctx=context)
            print('Pre-trained model %s is successfully loaded.'
                  % (opt.resume_params))
        else:
            print(
                'Pre-trained model is successfully loaded from the model zoo.')
    else:
        # Deployed mode: load an exported symbol + params pair.
        model_name = 'deploy'
        net = mx.gluon.SymbolBlock.imports(
            '{}-symbol.json'.format(opt.model_prefix), ['data'],
            '{}-0000.params'.format(opt.model_prefix))
        net.hybridize(static_alloc=True, static_shape=True)
    print("Successfully loaded model {}".format(model_name))

    # dummy data for benchmarking performance
    if opt.benchmark:
        benchmarking(opt, net, context)
        sys.exit()

    # build the validation dataset for the requested benchmark
    if opt.dataset == 'ucf101':
        val_dataset = UCF101(
            setting=opt.val_list, root=opt.data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length,
            target_width=opt.input_size, target_height=opt.input_size,
            test_mode=True, num_segments=opt.num_segments,
            transform=transform_test)
    elif opt.dataset == 'kinetics400':
        val_dataset = Kinetics400(
            setting=opt.val_list, root=opt.data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=opt.input_size, target_height=opt.input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            slowfast=opt.slowfast,
            slow_temporal_stride=opt.slow_temporal_stride,
            fast_temporal_stride=opt.fast_temporal_stride,
            test_mode=True, num_segments=opt.num_segments,
            num_crop=opt.num_crop, transform=transform_test)
    elif opt.dataset == 'somethingsomethingv2':
        val_dataset = SomethingSomethingV2(
            setting=opt.val_list, root=opt.data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=opt.input_size, target_height=opt.input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            num_segments=opt.num_segments, transform=transform_test)
    elif opt.dataset == 'hmdb51':
        val_dataset = HMDB51(
            setting=opt.val_list, root=opt.data_dir, train=False,
            new_width=opt.new_width, new_height=opt.new_height,
            new_length=opt.new_length, new_step=opt.new_step,
            target_width=opt.input_size, target_height=opt.input_size,
            video_loader=opt.video_loader, use_decord=opt.use_decord,
            num_segments=opt.num_segments, transform=transform_test)
    else:
        # NOTE(review): falls through with val_dataset undefined; the
        # DataLoader below will then raise NameError.
        logger.info('Dataset %s is not supported yet.' % (opt.dataset))

    val_data = gluon.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers,
        prefetch=int(opt.prefetch_ratio * num_workers),
        last_batch='discard')
    print('Load %d test samples in %d iterations.'
          % (len(val_dataset), len(val_data)))

    # calibrate FP32 model into INT8 model
    if opt.calibration:
        calibration(net, val_data, opt, context, logger)
        sys.exit()

    start_time = time.time()
    acc_top1_val, acc_top5_val = test(context, val_data, opt, net)
    end_time = time.time()

    print('Test accuracy: acc-top1=%f acc-top5=%f'
          % (acc_top1_val * 100, acc_top5_val * 100))
    print('Total evaluation time is %4.2f minutes'
          % ((end_time - start_time) / 60))
def main():
    """Run single-video inference over a list of videos.

    Parses CLI options, builds the test-time transform and the network
    (pretrained from the model zoo or from ``opt.resume_params``), then
    iterates over ``opt.data_list``, running each video through the net and
    logging (and optionally saving) logits / predicted labels.
    """
    opt = parse_args()

    makedirs(opt.save_dir)

    # Log both to a file under save_dir and to the console.
    filehandler = logging.FileHandler(
        os.path.join(opt.save_dir, opt.logging_file))
    streamhandler = logging.StreamHandler()
    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)
    logger.info(opt)

    gc.set_threshold(100, 5, 5)

    # set env: -1 selects CPU, otherwise the given GPU id
    if opt.gpu_id == -1:
        context = mx.cpu()
    else:
        gpu_id = opt.gpu_id
        context = mx.gpu(gpu_id)

    # get data preprocess (ImageNet normalization statistics)
    image_norm_mean = [0.485, 0.456, 0.406]
    image_norm_std = [0.229, 0.224, 0.225]
    if opt.ten_crop:
        transform_test = transforms.Compose([
            video.VideoTenCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 10
    elif opt.three_crop:
        transform_test = transforms.Compose([
            video.VideoThreeCrop(opt.input_size),
            video.VideoToTensor(),
            video.VideoNormalize(image_norm_mean, image_norm_std)
        ])
        opt.num_crop = 3
    else:
        transform_test = video.VideoGroupValTransform(size=opt.input_size,
                                                      mean=image_norm_mean,
                                                      std=image_norm_std)
        opt.num_crop = 1

    # get model; a non-empty hashtag selects a specific pretrained snapshot
    if opt.use_pretrained and len(opt.hashtag) > 0:
        opt.use_pretrained = opt.hashtag
    classes = opt.num_classes
    model_name = opt.model
    net = get_model(name=model_name, nclass=classes,
                    pretrained=opt.use_pretrained,
                    num_segments=opt.num_segments,
                    num_crop=opt.num_crop)
    net.cast(opt.dtype)
    net.collect_params().reset_ctx(context)
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    if opt.resume_params != '' and not opt.use_pretrained:
        net.load_parameters(opt.resume_params, ctx=context)
        logger.info('Pre-trained model %s is successfully loaded.'
                    % (opt.resume_params))
    else:
        logger.info(
            'Pre-trained model is successfully loaded from the model zoo.')
    logger.info("Successfully built model {}".format(model_name))

    # get classes list, if we are using a pretrained network from the model_zoo
    classes = None
    if opt.use_pretrained:
        if "kinetics400" in model_name:
            classes = Kinetics400Attr().classes
        elif "ucf101" in model_name:
            classes = UCF101Attr().classes
        elif "hmdb51" in model_name:
            classes = HMDB51Attr().classes
        elif "sthsth" in model_name:
            classes = SomethingSomethingV2Attr().classes

    # get data
    anno_file = opt.data_list
    # FIX: use a context manager so the annotation file is closed even on error
    # (original opened the file and never closed it).
    with open(anno_file, 'r') as f:
        data_list = f.readlines()
    logger.info('Load %d video samples.' % len(data_list))

    # build a pseudo dataset instance to use its children class methods
    video_utils = VideoClsCustom(root=opt.data_dir,
                                 setting=opt.data_list,
                                 num_segments=opt.num_segments,
                                 num_crop=opt.num_crop,
                                 new_length=opt.new_length,
                                 new_step=opt.new_step,
                                 new_width=opt.new_width,
                                 new_height=opt.new_height,
                                 video_loader=opt.video_loader,
                                 use_decord=opt.use_decord,
                                 slowfast=opt.slowfast,
                                 slow_temporal_stride=opt.slow_temporal_stride,
                                 fast_temporal_stride=opt.fast_temporal_stride,
                                 data_aug=opt.data_aug,
                                 lazy_init=True)

    start_time = time.time()
    for vid, vline in enumerate(data_list):
        # Each line is "<path> [extra fields]"; only the path is used here.
        video_path = vline.split()[0]
        video_name = video_path.split('/')[-1]
        if opt.need_root:
            video_path = os.path.join(opt.data_dir, video_path)
        video_data = read_data(opt, video_path, transform_test, video_utils)
        video_input = video_data.as_in_context(context)
        pred = net(video_input.astype(opt.dtype, copy=False))
        if opt.save_logits:
            logits_file = '%s_%s_logits.npy' % (model_name, video_name)
            np.save(os.path.join(opt.save_dir, logits_file), pred.asnumpy())
        pred_label = np.argmax(pred.asnumpy())
        if opt.save_preds:
            preds_file = '%s_%s_preds.npy' % (model_name, video_name)
            np.save(os.path.join(opt.save_dir, preds_file), pred_label)
        # Try to report a text label instead of the number.
        if classes:
            pred_label = classes[pred_label]
        logger.info('%04d/%04d: %s is predicted to class %s'
                    % (vid, len(data_list), video_name, pred_label))

    end_time = time.time()
    logger.info('Total inference time is %4.2f minutes'
                % ((end_time - start_time) / 60))
def read_video_data(s3_video_path, num_frames=32):
    """Read and preprocess video data from the S3 bucket.

    Downloads the video to /tmp, decodes ``num_frames`` frames with decord via
    a pseudo ``VideoClsCustom`` dataset, applies the standard ImageNet
    center-crop/normalize transform and returns the clip as an MXNet NDArray
    of shape (1, 3, num_frames, 224, 224).

    Parameters
    ----------
    s3_video_path : str
        ``s3://bucket/key`` path of the video.
    num_frames : int
        Number of frames sampled as model input (default 32).
    """
    print('read and preprocess video data here ')
    s3_client = boto3.client('s3')
    fname = s3_video_path.replace('s3://', '')
    fname = fname.replace('S3://', '')
    fname = fname.replace('/', '')
    download_path = '/tmp/' + fname
    video_list_path = '/tmp/video_list' + str(uuid.uuid4()) + '.txt'
    bucket, key = get_bucket_and_key(s3_video_path)
    s3_client.download_file(bucket, key, download_path)

    # Make the download filename unique (concurrent Lambda-style invocations
    # could otherwise collide on the same /tmp path).
    filename, ext = os.path.splitext(download_path)  # save the file extension
    filename = filename + str(uuid.uuid4())
    os.rename(download_path, filename + ext)
    download_path = filename + ext

    # Dummy duration and label with each video path
    video_list = '{} {} {}'.format(download_path, 10, 1)
    with open(video_list_path, 'w') as fopen:
        fopen.write(video_list)

    # FIX: always clean up the temp files, even when decoding/transform fails
    # (original leaked them on any exception).
    try:
        # Constants
        data_dir = '/tmp/'
        num_segments = 1
        new_length = num_frames
        new_step = 1
        use_decord = True
        video_loader = True
        slowfast = False
        # FIX: num_crop was previously undefined, so enabling the slowfast
        # branch below raised NameError.
        num_crop = 1

        # Preprocessing: center-crop to 224x224, transpose to
        # (num_channels, num_frames, H, W) and normalize with ImageNet stats.
        input_size = 224
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
        transform = video.VideoGroupValTransform(size=input_size, mean=mean, std=std)

        # Use the general gluoncv dataloader VideoClsCustom to load the data.
        video_utils = VideoClsCustom(root=data_dir,
                                     setting=video_list_path,
                                     num_segments=num_segments,
                                     new_length=new_length,
                                     new_step=new_step,
                                     video_loader=video_loader,
                                     use_decord=use_decord,
                                     slowfast=slowfast)

        # Read the (single-entry) video list.
        video_name = video_list.split()[0]
        decord = try_import_decord()
        decord_vr = decord.VideoReader(video_name)
        duration = len(decord_vr)

        segment_indices, skip_offsets = video_utils._sample_test_indices(duration)

        if video_loader:
            if slowfast:
                clip_input = video_utils._video_TSN_decord_slowfast_loader(
                    video_name, decord_vr, duration, segment_indices,
                    skip_offsets)
            else:
                clip_input = video_utils._video_TSN_decord_batch_loader(
                    video_name, decord_vr, duration, segment_indices,
                    skip_offsets)
        else:
            raise RuntimeError('We only support video-based inference.')

        clip_input = transform(clip_input)

        if slowfast:
            sparse_samples = len(clip_input) // (num_segments * num_crop)
            clip_input = np.stack(clip_input, axis=0)
            clip_input = clip_input.reshape(
                (-1, ) + (sparse_samples, 3, input_size, input_size))
            clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))
        else:
            clip_input = np.stack(clip_input, axis=0)
            clip_input = clip_input.reshape(
                (-1, ) + (new_length, 3, input_size, input_size))
            clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

        if new_length == 1:
            # this is for 2D input case
            clip_input = np.squeeze(clip_input, axis=2)

        clip_input = nd.array(clip_input)
    finally:
        # Cleanup temp files
        os.remove(download_path)
        os.remove(video_list_path)

    return clip_input
def get_data_loader(opt, batch_size, num_workers, logger):
    """Build train/val DataLoaders for UCF101.

    Parameters
    ----------
    opt : argparse.Namespace
        Options; must provide train_dir, input_size, dataset, train/val
        settings, new_* frame options, num_segments and prefetch_ratio.
    batch_size : int
        Global batch size for both loaders.
    num_workers : int
        Worker processes for data loading.
    logger : logging.Logger
        Kept for interface compatibility (unused here).

    Returns
    -------
    (train_data, val_data, batch_fn) :
        Train loader (shuffled, rollover), val loader (ordered, discard) and
        a function that splits a batch across contexts.
    """
    data_dir = opt.train_dir
    scale_ratios = [1.0, 0.875, 0.75, 0.66]
    input_size = opt.input_size

    def batch_fn(batch, ctx):
        # Scatter data/labels across the provided contexts.
        data = split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label = split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        return data, label

    transform_train = video.VideoGroupTrainTransform(
        size=(input_size, input_size), scale_ratios=scale_ratios,
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transform_test = video.VideoGroupValTransform(
        size=input_size,
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    if opt.dataset == 'ucf101':
        train_dataset = UCF101(setting=opt.train_setting, root=data_dir, train=True,
                               new_width=opt.new_width, new_height=opt.new_height,
                               new_length=opt.new_length_diff,
                               target_width=input_size, target_height=input_size,
                               num_segments=opt.num_segments, transform=transform_train)
        val_dataset = UCF101(setting=opt.val_setting, root=data_dir, train=False,
                             new_width=opt.new_width, new_height=opt.new_height,
                             new_length=opt.new_length_diff,
                             target_width=input_size, target_height=input_size,
                             num_segments=opt.num_segments, transform=transform_test)
    else:
        # FIX: fail explicitly; the original printed a message and then
        # crashed with NameError on the undefined datasets below.
        raise RuntimeError('Dataset %s is not supported yet.' % (opt.dataset))

    print('Load %d training samples and %d validation samples.' %
          (len(train_dataset), len(val_dataset)))

    train_data = gluon.data.DataLoader(train_dataset, batch_size=batch_size,
                                       shuffle=True, num_workers=num_workers,
                                       prefetch=int(opt.prefetch_ratio * num_workers),
                                       last_batch='rollover')
    val_data = gluon.data.DataLoader(val_dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=num_workers,
                                     prefetch=int(opt.prefetch_ratio * num_workers),
                                     last_batch='discard')

    return train_data, val_data, batch_fn
def get_action_recognition(video_obj,
                           model_arch="slowfast_4x16_resnet50_kinetics400"):
    """Classify the action in a video with a pretrained model-zoo network.

    Parameters
    ----------
    video_obj : str
        URL or local path of the video; a download is attempted first and a
        local path falls through on ValueError.
    model_arch : str
        Model-zoo architecture name; "slowfast" and "inceptionv3" substrings
        select the frame-sampling and preprocessing variants.

    Returns
    -------
    pandas.DataFrame
        Top-5 predictions with columns ["class", "prob"].
    """
    # starting decord
    decord = try_import_decord()
    net = get_model(model_arch, pretrained=True)
    try:
        video_obj = utils.download(video_obj)
    except ValueError:
        # Already a local path.
        pass
    vr = decord.VideoReader(video_obj)

    # SlowFast consumes a fast pathway (every 2nd frame) plus a slow pathway
    # (every 16th frame); other models consume 32 evenly-strided frames.
    if "slowfast" in model_arch:
        fast_frame_id_list = range(0, 64, 2)
        slow_frame_id_list = range(0, 64, 16)
        frame_id_list = list(fast_frame_id_list) + list(slow_frame_id_list)
    else:
        frame_id_list = range(0, 64, 2)
    print("=========Reached here============")
    video_data = vr.get_batch(frame_id_list).asnumpy()
    clip_input = [
        video_data[vid, :, :, :] for vid, _ in enumerate(frame_id_list)
    ]

    # The two former copy-pasted branches differed only in these constants.
    # NOTE(review): the inceptionv3 branch crops to 299 but reshapes to
    # (340, 450), same values as the original code — looks inconsistent with
    # VideoGroupValTransform's crop size; confirm against the transform's
    # actual output shape before changing.
    if "inceptionv3" in model_arch:
        crop_size, frame_h, frame_w = 299, 340, 450
    else:
        crop_size, frame_h, frame_w = 224, 224, 224
    temporal_len = 36 if "slowfast" in model_arch else 32

    transform_fn = video.VideoGroupValTransform(size=crop_size,
                                                mean=[0.485, 0.456, 0.406],
                                                std=[0.229, 0.224, 0.225])
    clip_input = transform_fn(clip_input)
    clip_input = np.stack(clip_input, axis=0)
    clip_input = clip_input.reshape((-1, ) + (temporal_len, 3, frame_h, frame_w))
    # (N, T, C, H, W) -> (N, C, T, H, W) as the 3D networks expect.
    clip_input = np.transpose(clip_input, (0, 2, 1, 3, 4))

    pred = net(nd.array(clip_input))
    classes = net.classes
    topK = 5
    ind = nd.topk(pred, k=topK)[0].astype('int')
    # Hoisted out of the loop: the original recomputed softmax per iteration.
    probs = nd.softmax(pred)[0]
    resList = []
    for i in range(topK):
        resList.append([
            classes[ind[i].asscalar()],
            probs[ind[i]].asscalar()
        ])
    resDF = pd.DataFrame(resList, columns=["class", "prob"])
    return resDF
def main():
    """Load a pretrained/resumed model and evaluate it on SomethingSomethingV2.

    Reads configuration from the module-level ``args``; sets the module-level
    ``transform_post`` used by the dataset pipeline.
    """
    global args, best_loss

    # create model, load existing models from gluoncv
    print(" > Creating model ... !")
    num_gpus = args.num_gpus
    batch_size = args.batch_size
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i)
               for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = args.num_workers
    print('Total batch size is set to %d on %d GPUs' % (batch_size, num_gpus))

    # =================== load model and parameters =======================
    classes = args.num_classes
    model_name = args.model
    model = get_model(name=model_name, nclass=classes,
                      pretrained=args.use_pretrained,
                      num_segments=args.num_segments)
    model.cast(args.dtype)
    model.collect_params().reset_ctx(context)
    if args.mode == 'hybrid':
        model.hybridize(static_alloc=True, static_shape=True)
    # FIX: was `args.resume_params is not ''` — identity comparison with a
    # string literal is wrong (and a SyntaxWarning on CPython >= 3.8).
    if args.resume_params != '' and not args.use_pretrained:
        model.load_parameters(args.resume_params, ctx=context)
        print('Pre-trained model %s is successfully loaded.' %
              (args.resume_params))
    else:
        print('Pre-trained model is successfully loaded from the model zoo.')

    # ===================== load dataset =====================
    # Identity normalization: mean 0 / std 1 leaves pixel values untouched.
    global transform_post
    transform_post = video.VideoGroupValTransform(size=args.input_size,
                                                  mean=[0, 0, 0],
                                                  std=[1, 1, 1])
    val_dataset = SomethingSomethingV2_revise(setting=args.val_list,
                                              root=args.data_dir,
                                              train=False,
                                              new_width=args.new_width,
                                              new_height=args.new_height,
                                              new_length=args.new_length,
                                              new_step=args.new_step,
                                              target_width=args.input_size,
                                              target_height=args.input_size,
                                              video_loader=args.video_loader,
                                              use_decord=args.use_decord,
                                              num_segments=args.num_segments,
                                              transform=transform_post)
    val_loader = gluon.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        #num_workers=num_workers,
        prefetch=int(args.prefetch_ratio * num_workers),
        batchify_fn=tsn_mp_batchify_fn,
        last_batch='discard')
    print('Load %d test samples in %d iterations.'
          % (len(val_dataset), len(val_loader)))

    # ========================== attack ===========================
    if args.eval_only:
        validate(val_loader, model, context)
        print(" > Evaluation DONE !")
        return