def test_compose():
    with pytest.raises(TypeError):
        # transform must be callable or a dict
        Compose('LoadImage')

    target_keys = ['img', 'img_metas']

    # test Compose given a data pipeline
    img = np.random.randn(256, 256, 3)
    results = dict(img=img, abandoned_key=None, img_name='test_image.png')
    test_pipeline = [
        dict(type='Collect', keys=['img'], meta_keys=['img_name']),
        dict(type='ImageToTensor', keys=['img'])
    ]
    compose = Compose(test_pipeline)
    compose_results = compose(results)
    assert check_keys_equal(compose_results.keys(), target_keys)
    assert check_keys_equal(compose_results['img_metas'].data.keys(),
                            ['img_name'])

    # test Compose when forward data is None
    results = None
    image_to_tensor = ImageToTensor(keys=[])
    test_pipeline = [image_to_tensor]
    compose = Compose(test_pipeline)
    compose_results = compose(results)
    assert compose_results is None
    assert repr(compose) == compose.__class__.__name__ + \
        f'(\n    {image_to_tensor}\n)'
def main():
    args = parse_args()
    args.device = torch.device(args.device)

    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    model = init_recognizer(cfg, args.checkpoint, device=args.device)
    data = dict(img_shape=None, modality='RGB', label=-1)
    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.data.test.pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0
    args.sample_length = sample_length
    args.test_pipeline = test_pipeline

    show_results(model, data, label, args)
def main():
    global frame_queue, threshold, sample_length, data, test_pipeline, model, \
        out_file, video_path, device, input_step, label, result_queue

    args = parse_args()
    input_step = args.input_step
    threshold = args.threshold
    video_path = args.video
    out_file = args.out_file

    device = torch.device(args.device)
    model = init_recognizer(args.config, args.checkpoint, device=device)
    data = dict(img_shape=None, modality='RGB', label=-1)
    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0

    frame_queue = deque(maxlen=sample_length)
    result_queue = deque(maxlen=1)
    show_results()
def main():
    global label, device, model, test_pipeline, \
        camera, sample_length, average_size, threshold

    args = parse_args()
    device = torch.device(args.device)
    model = init_recognizer(args.config, args.checkpoint, device=device)
    camera = cv2.VideoCapture(args.camera_id)
    sample_length = args.sample_length
    average_size = args.average_size
    threshold = args.threshold

    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            # Remove step to sample frames
            if sample_length == 0:
                sample_length = step['clip_len'] * step['num_clips']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0
    print('Press "Esc", "q" or "Q" to exit')
    predict_webcam_video()
def build_inputs(model, video_path, use_frames=False):
    """Build inputs for GradCAM.

    Note that building inputs for GradCAM is exactly the same as building
    inputs for the Recognizer test stage. Codes from `inference_recognizer`.

    Args:
        model (nn.Module): Recognizer model.
        video_path (str): Video file/url or rawframes directory.
        use_frames (bool): Whether to use rawframes as input.

    Returns:
        dict: Both GradCAM inputs and Recognizer test stage inputs,
            including two keys, ``imgs`` and ``label``.
    """
    if not (osp.exists(video_path) or video_path.startswith('http')):
        raise RuntimeError(f"'{video_path}' is missing")

    if osp.isfile(video_path) and use_frames:
        raise RuntimeError(
            f"'{video_path}' is a video file, not a rawframe directory")
    elif osp.isdir(video_path) and not use_frames:
        raise RuntimeError(
            f"'{video_path}' is a rawframe directory, not a video file")

    cfg = model.cfg
    device = next(model.parameters()).device  # model device
    # build the data pipeline
    test_pipeline = cfg.data.test.pipeline
    test_pipeline = Compose(test_pipeline)
    # prepare data
    if use_frames:
        filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
        modality = cfg.data.test.get('modality', 'RGB')
        start_index = cfg.data.test.get('start_index', 1)
        data = dict(
            frame_dir=video_path,
            total_frames=len(os.listdir(video_path)),
            # assuming files in ``video_path`` are all named with ``filename_tmpl``  # noqa: E501
            label=-1,
            start_index=start_index,
            filename_tmpl=filename_tmpl,
            modality=modality)
    else:
        start_index = cfg.data.test.get('start_index', 0)
        data = dict(
            filename=video_path,
            label=-1,
            start_index=start_index,
            modality='RGB')
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]
    return data
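# Usage sketch for ``build_inputs`` above (illustrative only; the config,
# checkpoint and video paths are hypothetical placeholders).
from mmaction.apis import init_recognizer

gradcam_model = init_recognizer('configs/tsn_config.py',     # hypothetical
                                'checkpoints/tsn_r50.pth',   # hypothetical
                                device='cuda:0')
gradcam_inputs = build_inputs(gradcam_model, 'demo/demo.mp4', use_frames=False)
# ``imgs`` feeds the GradCAM forward pass; ``label`` is the dummy -1 placeholder
imgs, label = gradcam_inputs['imgs'], gradcam_inputs['label']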
def main():
    global frame_queue, camera, frame, results, threshold, sample_length, \
        data, test_pipeline, model, device, average_size, label, \
        result_queue, drawing_fps, inference_fps

    args = parse_args()
    average_size = args.average_size
    threshold = args.threshold
    drawing_fps = args.drawing_fps
    inference_fps = args.inference_fps

    device = torch.device(args.device)

    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    model = init_recognizer(cfg, args.checkpoint, device=device)
    camera = cv2.VideoCapture(args.camera_id)
    data = dict(img_shape=None, modality='RGB', label=-1)

    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.data.test.pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0

    try:
        frame_queue = deque(maxlen=sample_length)
        result_queue = deque(maxlen=1)
        pw = Thread(target=show_results, args=(), daemon=True)
        pr = Thread(target=inference, args=(), daemon=True)
        pw.start()
        pr.start()
        pw.join()
    except KeyboardInterrupt:
        pass
def main():
    global frame_queue, camera, frame, results, threshold, sample_length, \
        data, test_pipeline, model, device, average_size, label, result_queue

    args = parse_args()
    average_size = args.average_size
    threshold = args.threshold

    device = torch.device(args.device)
    model = init_recognizer(args.config, args.checkpoint, device=device)
    camera = cv2.VideoCapture(args.camera_id)
    # camera = cv2.VideoCapture(
    #     '/home/ww/tools/image/office/2020-12-10_14-54-03.mp4')
    camera.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    camera.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    data = dict(img_shape=None, modality='RGB', label=-1)

    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0

    try:
        frame_queue = deque(maxlen=sample_length)
        result_queue = deque(maxlen=1)
        pw = Thread(target=show_results, args=(), daemon=True)
        pr = Thread(target=inference, args=(), daemon=True)
        pw.start()
        pr.start()
        pw.join()
    except KeyboardInterrupt:
        pass
def skeleton_based_action_recognition(args, pose_results, num_frame, h, w):
    fake_anno = dict(
        frame_dir='',
        label=-1,
        img_shape=(h, w),
        original_shape=(h, w),
        start_index=0,
        modality='Pose',
        total_frames=num_frame)
    num_person = max([len(x) for x in pose_results])

    num_keypoint = 17
    keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
                        dtype=np.float16)
    keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
                              dtype=np.float16)
    for i, poses in enumerate(pose_results):
        for j, pose in enumerate(poses):
            pose = pose['keypoints']
            keypoint[j, i] = pose[:, :2]
            keypoint_score[j, i] = pose[:, 2]
    fake_anno['keypoint'] = keypoint
    fake_anno['keypoint_score'] = keypoint_score

    label_map = [x.strip() for x in open(args.label_map).readlines()]
    num_class = len(label_map)

    skeleton_config = mmcv.Config.fromfile(args.skeleton_config)
    skeleton_config.model.cls_head.num_classes = num_class  # for K400 dataset
    skeleton_pipeline = Compose(skeleton_config.test_pipeline)
    skeleton_imgs = skeleton_pipeline(fake_anno)['imgs'][None]
    skeleton_imgs = skeleton_imgs.to(args.device)

    # Build skeleton-based recognition model
    skeleton_model = build_model(skeleton_config.model)
    load_checkpoint(skeleton_model, args.skeleton_checkpoint,
                    map_location='cpu')
    skeleton_model.to(args.device)
    skeleton_model.eval()

    with torch.no_grad():
        output = skeleton_model(return_loss=False, imgs=skeleton_imgs)

    action_idx = np.argmax(output)
    skeleton_action_result = label_map[
        action_idx]  # skeleton-based action result for the whole video
    return skeleton_action_result
def preprocess(video_path, cfg):
    """Run the test pipeline of ``cfg`` on a video file and return the
    transformed ``imgs``."""
    test_pipeline = cfg.data.test.pipeline
    test_pipeline = Compose(test_pipeline)

    # prepare data
    start_index = cfg.data.test.get('start_index', 0)
    data = dict(filename=video_path,
                label=-1,
                start_index=start_index,
                modality='RGB')
    data = test_pipeline(data)
    # data = collate([data], samples_per_gpu=1)
    # if next(model.parameters()).is_cuda:
    #     # scatter to specified GPU
    #     data = scatter(data, [device])[0]
    return data['imgs']
def test_compose_support_torchvision():
    target_keys = ['imgs', 'img_metas']

    # test Compose given a data pipeline
    imgs = [np.random.randn(256, 256, 3)] * 8
    results = dict(imgs=imgs,
                   abandoned_key=None,
                   img_name='test_image.png',
                   clip_len=8,
                   num_clips=1)
    test_pipeline = [
        dict(type='torchvision.Grayscale', num_output_channels=3),
        dict(type='FormatShape', input_format='NCTHW'),
        dict(type='Collect', keys=['imgs'], meta_keys=['img_name']),
        dict(type='ToTensor', keys=['imgs'])
    ]
    compose = Compose(test_pipeline)
    compose_results = compose(results)
    assert assert_keys_equal(compose_results.keys(), target_keys)
    assert assert_keys_equal(compose_results['img_metas'].data.keys(),
                             ['img_name'])
def main():
    args = parse_args()

    frame_paths, original_frames = frame_extraction(args.video,
                                                    args.short_side)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    config.merge_from_dict(args.cfg_options)
    test_pipeline = Compose(config.data.test.pipeline)

    # Load label_map
    label_map = [x.strip() for x in open(args.label_map).readlines()]

    # Get Human detection results
    det_results = detection_inference(args, frame_paths)
    torch.cuda.empty_cache()

    pose_results = pose_inference(args, frame_paths, det_results)
    torch.cuda.empty_cache()

    fake_anno = dict(
        frame_dir='',
        label=-1,
        img_shape=(h, w),
        original_shape=(h, w),
        start_index=0,
        modality='Pose',
        total_frames=num_frame)
    num_person = max([len(x) for x in pose_results])
    # Current PoseC3D models are trained on COCO-keypoints (17 keypoints)
    num_keypoint = 17
    keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
                        dtype=np.float16)
    keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
                              dtype=np.float16)
    for i, poses in enumerate(pose_results):
        for j, pose in enumerate(poses):
            pose = pose['keypoints']
            keypoint[j, i] = pose[:, :2]
            keypoint_score[j, i] = pose[:, 2]
    fake_anno['keypoint'] = keypoint
    fake_anno['keypoint_score'] = keypoint_score

    imgs = test_pipeline(fake_anno)['imgs'][None]
    imgs = imgs.to(args.device)

    model = build_model(config.model)
    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    with torch.no_grad():
        output = model(return_loss=False, imgs=imgs)

    action_idx = np.argmax(output)
    action_label = label_map[action_idx]

    pose_model = init_pose_model(args.pose_config, args.pose_checkpoint,
                                 args.device)
    vis_frames = [
        vis_pose_result(pose_model, frame_paths[i], pose_results[i])
        for i in range(num_frame)
    ]
    for frame in vis_frames:
        cv2.putText(frame, action_label, (10, 30), FONTFACE, FONTSCALE,
                    FONTCOLOR, THICKNESS, LINETYPE)
    cv2.imwrite('frame.jpg', vis_frames[0])

    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], fps=24)
    vid.write_videofile(args.out_filename, remove_temp=True)

    tmp_frame_dir = osp.dirname(frame_paths[0])
    shutil.rmtree(tmp_frame_dir)
def main():
    parser = ArgumentParser()
    parser.add_argument('--config', '-c', type=str, required=True)
    parser.add_argument('--checkpoint', '-w', type=str, required=True)
    parser.add_argument('--dataset_name', '-n', type=str, required=True)
    parser.add_argument('--data_dir', '-d', type=str, required=True)
    parser.add_argument('--predictions', '-p', type=str, required=True)
    parser.add_argument('--movements', '-m', type=str, required=True)
    parser.add_argument('--keypoints', '-k', type=str, required=True)
    parser.add_argument('--out_annotation', '-o', type=str, required=True)
    args = parser.parse_args()

    assert exists(args.config)
    assert exists(args.checkpoint)
    assert exists(args.data_dir)
    assert exists(args.predictions)
    assert exists(args.movements)
    assert exists(args.keypoints)
    assert args.dataset_name is not None and args.dataset_name != ''
    assert args.out_annotation is not None and args.out_annotation != ''

    cfg = Config.fromfile(args.config)
    cfg = update_config(cfg, args, trg_name=args.dataset_name)
    cfg = propagate_root_dir(cfg, args.data_dir)

    dataset = build_dataset(cfg.data, 'train', dict(test_mode=True))
    data_pipeline = Compose(dataset.pipeline.transforms[1:])
    print('train dataset:\n' + str(dataset))

    model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
    load_checkpoint(model, args.checkpoint, strict=False)
    model = MMDataParallel(model, device_ids=[0])
    model.eval()

    annotation_path = join(args.data_dir, cfg.data.train.sources[0],
                           cfg.data.train.ann_file)
    records = load_annotation(annotation_path)
    predictions = load_distributed_data(args.predictions,
                                        parse_predictions_file, 'txt')
    movements = load_distributed_data(args.movements,
                                      parse_movements_file, 'txt')
    hand_kpts = load_distributed_data(args.keypoints,
                                      parse_kpts_file, 'json')
    print('Loaded records: {}'.format(len(records)))

    invalid_stat = dict()
    all_candidates = []

    ignore_candidates = get_ignore_candidates(records, IGNORE_LABELS)
    all_candidates += ignore_candidates

    static_candidates, static_invalids = get_regular_candidates(
        records,
        predictions,
        movements,
        hand_kpts,
        cfg.data.output.length,
        False,
        STATIC_LABELS,
        NEGATIVE_LABEL,
        NO_MOTION_LABEL,
        min_score=0.9,
        min_length=4,
        max_distance=1)
    all_candidates += static_candidates
    invalid_stat = update_stat(invalid_stat, static_invalids)
    print('Static candidates: {}'.format(len(static_candidates)))
    if len(invalid_stat) > 0:
        print('Ignored records after static analysis:')
        for ignore_label, ignore_values in invalid_stat.items():
            print(' - {}: {}'.format(ignore_label.replace('_', ' '),
                                     len(ignore_values)))

    dynamic_candidates, dynamic_invalids = get_regular_candidates(
        records,
        predictions,
        movements,
        hand_kpts,
        cfg.data.output.length,
        True,
        DYNAMIC_LABELS,
        NEGATIVE_LABEL,
        NO_MOTION_LABEL,
        min_score=0.9,
        min_length=4,
        max_distance=1)
    all_candidates += dynamic_candidates
    invalid_stat = update_stat(invalid_stat, dynamic_invalids)
    print('Dynamic candidates: {}'.format(len(dynamic_candidates)))
    if len(invalid_stat) > 0:
        print('Ignored records after dynamic analysis:')
        for ignore_label, ignore_values in invalid_stat.items():
            print(' - {}: {}'.format(ignore_label.replace('_', ' '),
                                     len(ignore_values)))

    fixed_records, fix_stat = find_best_match(all_candidates, model, dataset,
                                              NEGATIVE_LABEL)
    invalid_stat = update_stat(invalid_stat, fix_stat)
    print('Final records: {}'.format(len(fixed_records)))
    if len(invalid_stat) > 0:
        print('Final ignored records:')
        for ignore_label, ignore_values in invalid_stat.items():
            print(' - {}: {}'.format(ignore_label.replace('_', ' '),
                                     len(ignore_values)))
            for ignored_record in ignore_values:
                print('   - {}'.format(ignored_record.path))

    dump_records(fixed_records, args.out_annotation)
    print('Fixed annotation has been stored at: {}'.format(
        args.out_annotation))
def main():
    args = parse_args()
    device = torch.device(args.device)

    # ``--use_frames`` is passed as the string "True"/"False"
    use_frames = args.use_frames == "True"

    model = init_recognizer(args.config, device=device, use_frames=use_frames)

    # Target FPGA: Zynq UltraScale+ MPSoC ZCU104, assuming a clock frequency
    # of 100 MHz.
    # The actual BRAM size is 11 Mbits (1.375 MBytes). Divided by the 18 Kbits
    # size of each BRAM, this gives a total of 624 BRAM units.
    # The ZCU104 also has 27 Mbits (3.375 MBytes) of URAM. Divided by the
    # 288 Kbits size of each URAM, this gives a total of 96 URAM units.
    # The ZCU104 has 20 GTH gigabit transceivers (16.3 Gb/s or 2.03 GB/s) on
    # the PL side.
    feature_maps = ModelFeatureMaps(model=model,
                                    word_length=16,
                                    clock_freq=100,
                                    bram=624,
                                    dsp=1728)
    feature_maps.get_inter_feature_maps()

    random_img = np.random.randn(args.imshape[0], args.imshape[1],
                                 args.imshape[2])

    data = dict(img_shape=None, modality="RGB", label=-1)

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if "SampleFrames" in step["type"]:
            step["num_clips"] = 1
            sample_length = step["clip_len"] * step["num_clips"]
            data["num_clips"] = step["num_clips"]
            data["clip_len"] = step["clip_len"]
            pipeline_.remove(step)
        if step["type"] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)
    print(test_pipeline)

    assert sample_length > 0

    data_in = []
    for _ in range(data["clip_len"]):
        data_in.append(random_img)
    data["imgs"] = data_in
    if data["img_shape"] is None:
        data["img_shape"] = random_img.shape[:2]

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        data = scatter(data, [device])[0]

    with torch.no_grad():
        scores = model(return_loss=False, **data)[0]

    feature_maps.get_info()
    feature_maps.get_conv_layers(file_name=args.model_name)
def main():
    args = parse_args()

    args.is_rgb = args.modality == 'RGB'
    args.clip_len = 1 if args.is_rgb else 5
    args.input_format = 'NCHW' if args.is_rgb else 'NCHW_Flow'
    rgb_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_bgr=False)
    flow_norm_cfg = dict(mean=[128, 128], std=[128, 128])
    args.img_norm_cfg = rgb_norm_cfg if args.is_rgb else flow_norm_cfg
    args.f_tmpl = 'img_{:05d}.jpg' if args.is_rgb else 'flow_{}_{:05d}.jpg'
    args.in_channels = args.clip_len * (3 if args.is_rgb else 2)
    # max batch_size for one forward
    args.batch_size = 200

    # define the data pipeline for Untrimmed Videos
    data_pipeline = [
        dict(type='UntrimmedSampleFrames',
             clip_len=args.clip_len,
             frame_interval=args.frame_interval,
             start_index=0),
        dict(type='FrameSelector'),
        dict(type='Resize', scale=(-1, 256)),
        dict(type='CenterCrop', crop_size=256),
        dict(type='Normalize', **args.img_norm_cfg),
        dict(type='FormatShape', input_format=args.input_format),
        dict(type='Collect', keys=['imgs'], meta_keys=[]),
        dict(type='ToTensor', keys=['imgs'])
    ]
    data_pipeline = Compose(data_pipeline)

    # define TSN R50 model, the model is used as the feature extractor
    model_cfg = dict(
        type='Recognizer2D',
        backbone=dict(type='ResNet',
                      depth=50,
                      in_channels=args.in_channels,
                      norm_eval=False),
        cls_head=dict(type='TSNHead',
                      num_classes=400,
                      in_channels=2048,
                      spatial_type='avg',
                      consensus=dict(type='AvgConsensus', dim=1)),
        test_cfg=dict(average_clips=None))
    model = build_model(model_cfg)
    # load pretrained weight into the feature extractor
    state_dict = torch.load(args.ckpt)['state_dict']
    model.load_state_dict(state_dict)
    model = model.cuda()
    model.eval()

    data = open(args.data_list).readlines()
    data = [x.strip() for x in data]
    data = data[args.part::args.total]

    # enumerate Untrimmed videos, extract feature from each of them
    prog_bar = mmcv.ProgressBar(len(data))
    if not osp.exists(args.output_prefix):
        os.system(f'mkdir -p {args.output_prefix}')

    for item in data:
        frame_dir, length, label = item.split()
        output_file = osp.basename(frame_dir) + '.pkl'
        frame_dir = osp.join(args.data_prefix, frame_dir)
        output_file = osp.join(args.output_prefix, output_file)
        assert output_file.endswith('.pkl')
        length = int(length)

        # prepare a pseudo sample
        tmpl = dict(frame_dir=frame_dir,
                    total_frames=length,
                    filename_tmpl=args.f_tmpl,
                    start_index=0,
                    modality=args.modality)
        sample = data_pipeline(tmpl)
        imgs = sample['imgs']
        shape = imgs.shape
        # the original shape should be N_seg * C * H * W, resize it to
        # N_seg * 1 * C * H * W so that the network returns the feature of
        # each frame (no score average among segments)
        imgs = imgs.reshape((shape[0], 1) + shape[1:])
        imgs = imgs.cuda()

        def forward_data(model, data):
            # chop large data into pieces and extract feature from them
            results = []
            start_idx = 0
            num_clip = data.shape[0]
            while start_idx < num_clip:
                with torch.no_grad():
                    part = data[start_idx:start_idx + args.batch_size]
                    feat = model.forward(part, return_loss=False)
                    results.append(feat)
                start_idx += args.batch_size
            return np.concatenate(results)

        feat = forward_data(model, imgs)
        with open(output_file, 'wb') as fout:
            pickle.dump(feat, fout)
        prog_bar.update()
def inference_recognizer(model,
                         video_path,
                         label_path,
                         use_frames=False,
                         outputs=None,
                         as_tensor=True):
    """Inference a video with the recognizer.

    Args:
        model (nn.Module): The loaded recognizer.
        video_path (str): The video file path/url or the rawframes directory
            path. If ``use_frames`` is set to True, it should be rawframes
            directory path. Otherwise, it should be video file path.
        label_path (str): The label file path.
        use_frames (bool): Whether to use rawframes as input. Default: False.
        outputs (list(str) | tuple(str) | str | None) : Names of layers whose
            outputs need to be returned, default: None.
        as_tensor (bool): Same as that in ``OutputHook``. Default: True.

    Returns:
        dict[tuple(str, float)]: Top-5 recognition result dict.
        dict[torch.tensor | np.ndarray]:
            Output feature maps from layers specified in `outputs`.
    """
    if not (osp.exists(video_path) or video_path.startswith('http')):
        raise RuntimeError(f"'{video_path}' is missing")

    if osp.isfile(video_path) and use_frames:
        raise RuntimeError(
            f"'{video_path}' is a video file, not a rawframe directory")
    if osp.isdir(video_path) and not use_frames:
        raise RuntimeError(
            f"'{video_path}' is a rawframe directory, not a video file")

    if isinstance(outputs, str):
        outputs = (outputs, )
    assert outputs is None or isinstance(outputs, (tuple, list))

    cfg = model.cfg
    device = next(model.parameters()).device  # model device
    # construct label map
    with open(label_path, 'r') as f:
        label = [line.strip() for line in f]
    # build the data pipeline
    test_pipeline = cfg.data.test.pipeline
    test_pipeline = Compose(test_pipeline)
    # prepare data
    if use_frames:
        filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
        modality = cfg.data.test.get('modality', 'RGB')
        start_index = cfg.data.test.get('start_index', 1)
        data = dict(
            frame_dir=video_path,
            total_frames=len(os.listdir(video_path)),
            # assuming files in ``video_path`` are all named with ``filename_tmpl``  # noqa: E501
            label=-1,
            start_index=start_index,
            filename_tmpl=filename_tmpl,
            modality=modality)
    else:
        start_index = cfg.data.test.get('start_index', 0)
        data = dict(filename=video_path,
                    label=-1,
                    start_index=start_index,
                    modality='RGB')
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]

    # forward the model
    with OutputHook(model, outputs=outputs, as_tensor=as_tensor) as h:
        with torch.no_grad():
            scores = model(return_loss=False, **data)[0]
        returned_features = h.layer_outputs if outputs else None

    score_tuples = tuple(zip(label, scores))
    score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)
    top5_label = score_sorted[:5]

    if outputs:
        return top5_label, returned_features
    return top5_label
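# Usage sketch for the ``inference_recognizer`` variant above (illustrative
# only; the config, checkpoint, video and label-map paths are hypothetical
# placeholders).
from mmaction.apis import init_recognizer

recognizer = init_recognizer('configs/tsn_config.py',      # hypothetical path
                             'checkpoints/tsn_r50.pth',    # hypothetical path
                             device='cuda:0')
top5 = inference_recognizer(recognizer, 'demo/demo.mp4',
                            'demo/label_map_k400.txt')     # hypothetical path
for class_name, score in top5:
    print(f'{class_name}: {score:.4f}')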
def skeleton_based_stdet(args, label_map, human_detections, pose_results,
                         num_frame, clip_len, frame_interval, h, w):
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    skeleton_config = mmcv.Config.fromfile(args.skeleton_config)
    num_class = max(label_map.keys()) + 1  # for AVA dataset (81)
    skeleton_config.model.cls_head.num_classes = num_class
    skeleton_pipeline = Compose(skeleton_config.test_pipeline)
    skeleton_stdet_model = build_model(skeleton_config.model)
    load_checkpoint(skeleton_stdet_model, args.skeleton_stdet_checkpoint,
                    map_location='cpu')
    skeleton_stdet_model.to(args.device)
    skeleton_stdet_model.eval()

    skeleton_predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmcv.ProgressBar(len(timestamps))
    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:  # no people detected
            skeleton_predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        num_frame = len(frame_inds)  # 30

        pose_result = [pose_results[ind] for ind in frame_inds]

        skeleton_prediction = []
        for i in range(proposal.shape[0]):  # num_person
            skeleton_prediction.append([])

            fake_anno = dict(
                frame_dir='',
                label=-1,
                img_shape=(h, w),
                original_shape=(h, w),
                start_index=0,
                modality='Pose',
                total_frames=num_frame)
            num_person = 1

            num_keypoint = 17
            keypoint = np.zeros(
                (num_person, num_frame, num_keypoint, 2))  # M T V 2
            keypoint_score = np.zeros(
                (num_person, num_frame, num_keypoint))  # M T V

            # pose matching
            person_bbox = proposal[i][:4]
            area = expand_bbox(person_bbox, h, w)

            for j, poses in enumerate(pose_result):  # num_frame
                max_iou = float('-inf')
                index = -1
                if len(poses) == 0:
                    continue
                for k, per_pose in enumerate(poses):
                    iou = cal_iou(per_pose['bbox'][:4], area)
                    if max_iou < iou:
                        index = k
                        max_iou = iou
                keypoint[0, j] = poses[index]['keypoints'][:, :2]
                keypoint_score[0, j] = poses[index]['keypoints'][:, 2]

            fake_anno['keypoint'] = keypoint
            fake_anno['keypoint_score'] = keypoint_score

            skeleton_imgs = skeleton_pipeline(fake_anno)['imgs'][None]
            skeleton_imgs = skeleton_imgs.to(args.device)

            with torch.no_grad():
                output = skeleton_stdet_model(return_loss=False,
                                              imgs=skeleton_imgs)
                output = output[0]
                for k in range(len(output)):  # 81
                    if k not in label_map:
                        continue
                    if output[k] > args.action_score_thr:
                        skeleton_prediction[i].append(
                            (label_map[k], output[k]))

        skeleton_predictions.append(skeleton_prediction)
        prog_bar.update()

    return timestamps, skeleton_predictions
def inference_recognizer(model, video, outputs=None, as_tensor=True, **kwargs):
    """Inference a video with the recognizer.

    Args:
        model (nn.Module): The loaded recognizer.
        video (str | dict | ndarray): The video file path / url or the
            rawframes directory path / results dictionary (the input of
            pipeline) / a 4D array T x H x W x 3 (the input video).
        outputs (list(str) | tuple(str) | str | None) : Names of layers whose
            outputs need to be returned, default: None.
        as_tensor (bool): Same as that in ``OutputHook``. Default: True.

    Returns:
        dict[tuple(str, float)]: Top-5 recognition result dict.
        dict[torch.tensor | np.ndarray]:
            Output feature maps from layers specified in `outputs`.
    """
    if 'use_frames' in kwargs:
        warnings.warn('The argument `use_frames` is deprecated PR #1191. '
                      'Now you can use models trained with frames or videos '
                      'arbitrarily. ')
    if 'label_path' in kwargs:
        warnings.warn('The argument `label_path` is deprecated PR #1191. '
                      'Now the label file is not needed in '
                      'inference_recognizer. ')

    input_flag = None
    if isinstance(video, dict):
        input_flag = 'dict'
    elif isinstance(video, np.ndarray):
        assert len(video.shape) == 4, 'The shape should be T x H x W x C'
        input_flag = 'array'
    elif isinstance(video, str) and video.startswith('http'):
        input_flag = 'video'
    elif isinstance(video, str) and osp.exists(video):
        if osp.isfile(video):
            input_flag = 'video'
        if osp.isdir(video):
            input_flag = 'rawframes'
    else:
        raise RuntimeError('The type of argument video is not supported: '
                           f'{type(video)}')

    if isinstance(outputs, str):
        outputs = (outputs, )
    assert outputs is None or isinstance(outputs, (tuple, list))

    cfg = model.cfg
    device = next(model.parameters()).device  # model device
    # build the data pipeline
    test_pipeline = cfg.data.test.pipeline
    # Alter data pipelines & prepare inputs
    if input_flag == 'dict':
        data = video
    if input_flag == 'array':
        modality_map = {2: 'Flow', 3: 'RGB'}
        modality = modality_map.get(video.shape[-1])
        data = dict(total_frames=video.shape[0],
                    label=-1,
                    start_index=0,
                    array=video,
                    modality=modality)
        for i in range(len(test_pipeline)):
            if 'Decode' in test_pipeline[i]['type']:
                test_pipeline[i] = dict(type='ArrayDecode')
    if input_flag == 'video':
        data = dict(filename=video, label=-1, start_index=0, modality='RGB')
        if 'Init' not in test_pipeline[0]['type']:
            test_pipeline = [dict(type='OpenCVInit')] + test_pipeline
        else:
            test_pipeline[0] = dict(type='OpenCVInit')
        for i in range(len(test_pipeline)):
            if 'Decode' in test_pipeline[i]['type']:
                test_pipeline[i] = dict(type='OpenCVDecode')
    if input_flag == 'rawframes':
        filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
        modality = cfg.data.test.get('modality', 'RGB')
        start_index = cfg.data.test.get('start_index', 1)

        # count the number of frames that match the format of `filename_tmpl`
        # RGB pattern example: img_{:05}.jpg -> ^img_\d+.jpg$
        # Flow pattern example: {}_{:05d}.jpg -> ^x_\d+.jpg$
        pattern = f'^{filename_tmpl}$'
        if modality == 'Flow':
            pattern = pattern.replace('{}', 'x')
        pattern = pattern.replace(
            pattern[pattern.find('{'):pattern.find('}') + 1], '\\d+')
        total_frames = len(
            list(
                filter(lambda x: re.match(pattern, x) is not None,
                       os.listdir(video))))
        data = dict(frame_dir=video,
                    total_frames=total_frames,
                    label=-1,
                    start_index=start_index,
                    filename_tmpl=filename_tmpl,
                    modality=modality)
        if 'Init' in test_pipeline[0]['type']:
            test_pipeline = test_pipeline[1:]
        for i in range(len(test_pipeline)):
            if 'Decode' in test_pipeline[i]['type']:
                test_pipeline[i] = dict(type='RawFrameDecode')

    test_pipeline = Compose(test_pipeline)
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]

    # forward the model
    with OutputHook(model, outputs=outputs, as_tensor=as_tensor) as h:
        with torch.no_grad():
            scores = model(return_loss=False, **data)[0]
        returned_features = h.layer_outputs if outputs else None

    num_classes = scores.shape[-1]
    score_tuples = tuple(zip(range(num_classes), scores))
    score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)
    top5_label = score_sorted[:5]

    if outputs:
        return top5_label, returned_features
    return top5_label
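# Usage sketch for the refactored ``inference_recognizer`` above (illustrative
# only; config/checkpoint paths are hypothetical placeholders). It now returns
# (class_index, score) tuples instead of (label_name, score) and accepts a
# video path, rawframes directory, results dict or T x H x W x C array.
import numpy as np

from mmaction.apis import init_recognizer

recognizer = init_recognizer('configs/tsn_config.py',      # hypothetical path
                             'checkpoints/tsn_r50.pth',    # hypothetical path
                             device='cuda:0')

# video-file input
top5 = inference_recognizer(recognizer, 'demo/demo.mp4')
for class_index, score in top5:
    print(class_index, score)

# array input: Decode steps are swapped for ``ArrayDecode``, so the config's
# test pipeline must not require a file-specific Init step
dummy_clip = np.random.randint(0, 256, (16, 240, 320, 3), dtype=np.uint8)
top5 = inference_recognizer(recognizer, dummy_clip)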
def main():
    parser = ArgumentParser()
    parser.add_argument('--config', type=str, required=True,
                        help='Test config file path')
    parser.add_argument('--checkpoint', type=str, required=True,
                        help='Checkpoint file')
    parser.add_argument('--data_dir', type=str, required=True,
                        help='The dir with dataset')
    parser.add_argument('--out_dir', type=str, required=True,
                        help='Output directory')
    parser.add_argument('--dataset', type=str, required=True,
                        help='Dataset name')
    parser.add_argument('--gpus', default=1, type=int,
                        help='GPU number used for annotating')
    parser.add_argument('--proc_per_gpu', default=2, type=int,
                        help='Number of processes per GPU')
    parser.add_argument('--mode', choices=['train', 'val', 'test'],
                        default='train')
    args = parser.parse_args()

    assert exists(args.config)
    assert exists(args.checkpoint)
    assert exists(args.data_dir)

    cfg = Config.fromfile(args.config)
    cfg = update_config(cfg, args, trg_name=args.dataset)
    cfg = propagate_root_dir(cfg, args.data_dir)

    dataset = build_dataset(cfg.data, args.mode, dict(test_mode=True))
    data_pipeline = Compose(dataset.pipeline.transforms[1:])
    print('{} dataset:\n'.format(args.mode) + str(dataset))

    tasks = prepare_tasks(dataset, cfg.input_clip_length)
    print('Prepared tasks: {}'.format(sum([len(v) for v in tasks.values()])))

    if not exists(args.out_dir):
        makedirs(args.out_dir)

    model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
    load_checkpoint(model, args.checkpoint, strict=False)

    batch_size = 4 * cfg.data.videos_per_gpu
    if args.gpus == 1:
        model = MMDataParallel(model, device_ids=[0])
        model.eval()

        process_tasks(tasks, dataset, model, args.out_dir, batch_size,
                      cfg.input_clip_length, data_pipeline)
    else:
        raise NotImplementedError
        return False

    return True


if __name__ == '__main__':
    from pathlib import Path
    from mmcv import Config
    from tqdm import tqdm
    from mmaction.datasets.pipelines import Compose
    from railway.utils import utils
    import shutil
    import sys
    import time

    infer_pipeline_config = Config.fromfile('infer_pipeline.py')
    base_pipeline = Compose(infer_pipeline_config.base_decode_pipeline)
    hand_watch_pipleline = Compose(infer_pipeline_config.hand_watch_pipleline)

    input_dir = sys.argv[1]
    output_dir = sys.argv[2]
    video_paths = Path(input_dir).glob('**/*.mp4')
    utils.mkdir(output_dir)

    all_video_count = 0
    still_count = 0
    check_times = []
    for video_path in tqdm(list(video_paths)):
        all_video_count += 1
        video_path = str(video_path)
        _d = dict(filename=video_path, label=-1, start_index=0,
                  modality='RGB')
        _base_data = base_pipeline(_d)
        img_data = _base_data['imgs']