def Model_Params(self, config_file, checkpoint_file, use_gpu=True):
    self.system_dict["local"]["config_file"] = config_file
    self.system_dict["local"]["checkpoint_file"] = checkpoint_file
    if use_gpu:
        self.system_dict["local"]["model"] = init_recognizer(
            config_file, checkpoint_file, device='cuda')
    else:
        self.system_dict["local"]["model"] = init_recognizer(
            config_file, checkpoint_file, device='cpu')

def test_frames_inference_recognizer():
    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    rgb_model = init_recognizer(frame_config_file, None, device)
    flow_model = init_recognizer(flow_frame_config_file, None, device)

    with pytest.raises(RuntimeError):
        # video path doesn't exist
        inference_recognizer(rgb_model, 'missing_path')

    for ops in rgb_model.cfg.data.test.pipeline:
        if ops['type'] in ('TenCrop', 'ThreeCrop'):
            # Use CenterCrop to reduce memory in order to pass CI
            ops['type'] = 'CenterCrop'
            ops['crop_size'] = 224
    for ops in flow_model.cfg.data.test.pipeline:
        if ops['type'] in ('TenCrop', 'ThreeCrop'):
            # Use CenterCrop to reduce memory in order to pass CI
            ops['type'] = 'CenterCrop'
            ops['crop_size'] = 224

    top5_label = inference_recognizer(rgb_model, frames_path)
    scores = [item[1] for item in top5_label]
    assert len(top5_label) == 5
    assert scores == sorted(scores, reverse=True)

    _, feat = inference_recognizer(
        flow_model,
        frames_path,
        outputs=('backbone', 'cls_head'),
        as_tensor=False)
    assert isinstance(feat, dict)
    assert 'backbone' in feat and 'cls_head' in feat
    assert isinstance(feat['backbone'], np.ndarray)
    assert isinstance(feat['cls_head'], np.ndarray)
    assert feat['backbone'].shape == (25, 2048, 7, 7)
    assert feat['cls_head'].shape == (1, 400)

    _, feat = inference_recognizer(
        rgb_model,
        frames_path,
        outputs=('backbone.layer3', 'backbone.layer3.1.conv1'))
    assert 'backbone.layer3.1.conv1' in feat and 'backbone.layer3' in feat
    assert isinstance(feat['backbone.layer3.1.conv1'], torch.Tensor)
    assert isinstance(feat['backbone.layer3'], torch.Tensor)
    assert feat['backbone.layer3'].size() == (25, 1024, 14, 14)
    assert feat['backbone.layer3.1.conv1'].size() == (25, 256, 14, 14)

def main():
    args = parse_args()
    # assign the desired device.
    device = torch.device(args.device)

    # build the recognizer from a config file and checkpoint file
    model = init_recognizer(
        args.config, args.checkpoint, device=device, use_frames=args.use_frames)

    # test a single video or rawframes of a single video
    results = inference_recognizer(
        model, args.video, args.label, use_frames=args.use_frames)

    print('The top-5 labels with corresponding scores are:')
    for result in results:
        print(f'{result[0]}: ', result[1])

    if args.out_filename is not None:
        get_output(
            args.video,
            args.out_filename,
            results[0][0],
            font_size=args.font_size,
            font_color=args.font_color,
            resize_algorithm=args.resize_algorithm,
            use_frames=args.use_frames)

def main():
    global frame_queue, threshold, sample_length, data, test_pipeline, model, \
        out_file, video_path, device, input_step, label, result_queue
    args = parse_args()

    input_step = args.input_step
    threshold = args.threshold
    video_path = args.video
    out_file = args.out_file

    device = torch.device(args.device)
    model = init_recognizer(args.config, args.checkpoint, device=device)
    data = dict(img_shape=None, modality='RGB', label=-1)
    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)
    assert sample_length > 0

    frame_queue = deque(maxlen=sample_length)
    result_queue = deque(maxlen=1)
    show_results()

def main():
    args = parse_args()

    args.device = torch.device(args.device)

    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    model = init_recognizer(cfg, args.checkpoint, device=args.device)
    data = dict(img_shape=None, modality='RGB', label=-1)
    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.data.test.pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)
    assert sample_length > 0

    args.sample_length = sample_length
    args.test_pipeline = test_pipeline

    show_results(model, data, label, args)

def main():
    global label, device, model, test_pipeline, \
        camera, sample_length, average_size, threshold
    args = parse_args()

    device = torch.device(args.device)
    model = init_recognizer(args.config, args.checkpoint, device=device)
    camera = cv2.VideoCapture(args.camera_id)

    sample_length = args.sample_length
    average_size = args.average_size
    threshold = args.threshold

    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            # Remove step to sample frames
            if sample_length == 0:
                sample_length = step['clip_len'] * step['num_clips']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)
    assert sample_length > 0

    print('Press "Esc", "q" or "Q" to exit')
    predict_webcam_video()

def main():
    args = parse_args()
    # build the model from a config file and a checkpoint file
    model = init_recognizer(args.config, args.checkpoint)
    # fuse conv and bn layers of the model
    fused_model = fuse_module(model)
    save_checkpoint(fused_model, args.out)

def test_inference_recognizer():
    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    model = init_recognizer(video_config_file, None, device)

    with pytest.raises(RuntimeError):
        # video path doesn't exist
        inference_recognizer(model, 'missing.mp4', label_path)

    with pytest.raises(RuntimeError):
        # ``video_path`` should be consistent with ``use_frames``
        inference_recognizer(model, video_path, label_path, use_frames=True)

    with pytest.raises(RuntimeError):
        # ``video_path`` should be consistent with ``use_frames``
        inference_recognizer(model, 'demo/', label_path)

    for ops in model.cfg.data.test.pipeline:
        if ops['type'] == 'TenCrop':
            # Use CenterCrop to reduce memory in order to pass CI
            ops['type'] = 'CenterCrop'

    top5_label = inference_recognizer(model, video_path, label_path)
    scores = [item[1] for item in top5_label]
    assert len(top5_label) == 5
    assert scores == sorted(scores, reverse=True)

def test_init_recognizer():
    with pytest.raises(TypeError):
        # config must be a filename or Config object
        init_recognizer(dict(config_file=None))

    with pytest.raises(RuntimeError):
        # input data type should be consistent with the dataset type
        init_recognizer(frame_config_file)

    with pytest.raises(RuntimeError):
        # input data type should be consistent with the dataset type
        init_recognizer(video_config_file, use_frames=True)

    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'

    model = init_recognizer(video_config_file, None, device)
    config = mmcv.Config.fromfile(video_config_file)
    config.model.backbone.pretrained = None

    assert isinstance(model, nn.Module)
    if torch.cuda.is_available():
        assert next(model.parameters()).is_cuda is True
    else:
        assert next(model.parameters()).is_cuda is False
    assert model.cfg.model.backbone.pretrained is None

def main():
    args = parse_args()
    # assign the desired device.
    device = torch.device(args.device)

    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    # build the recognizer from a config file and checkpoint file/url
    model = init_recognizer(
        cfg, args.checkpoint, device=device, use_frames=args.use_frames)

    # e.g. use ('backbone', ) to return backbone feature
    output_layer_names = None

    # test a single video or rawframes of a single video
    if output_layer_names:
        results, returned_feature = inference_recognizer(
            model,
            args.video,
            args.label,
            use_frames=args.use_frames,
            outputs=output_layer_names)
    else:
        results = inference_recognizer(
            model, args.video, args.label, use_frames=args.use_frames)

    print('The top-5 labels with corresponding scores are:')
    for result in results:
        print(f'{result[0]}: ', result[1])

    if args.out_filename is not None:

        if args.target_resolution is not None:
            if args.target_resolution[0] == -1:
                args.target_resolution[0] = None
            if args.target_resolution[1] == -1:
                args.target_resolution[1] = None
            args.target_resolution = tuple(args.target_resolution)
        else:
            args.target_resolution = (None, None)

        get_output(
            args.video,
            args.out_filename,
            results[0][0],
            fps=args.fps,
            font_size=args.font_size,
            font_color=args.font_color,
            target_resolution=args.target_resolution,
            resize_algorithm=args.resize_algorithm,
            use_frames=args.use_frames)

def test_init_recognizer():
    with pytest.raises(TypeError):
        init_recognizer(dict(config_file=None))

    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'

    model = init_recognizer(config_file, None, device)
    config = mmcv.Config.fromfile(config_file)
    config.model.backbone.pretrained = None

    assert isinstance(model, nn.Module)
    if torch.cuda.is_available():
        assert next(model.parameters()).is_cuda is True
    else:
        assert next(model.parameters()).is_cuda is False
    assert model.cfg.model.backbone.pretrained is None

def main():
    args = parse_args()
    # assign the desired device.
    device = torch.device(args.device)

    # build the recognizer from a config file and checkpoint file
    model = init_recognizer(args.config, args.checkpoint, device=device)
    # test a single video
    results = inference_recognizer(model, args.video, args.label)

    print('The top-5 labels with corresponding scores are:')
    for result in results:
        print(f'{result[0]}: ', result[1])

def main():
    global frame_queue, camera, frame, results, threshold, sample_length, \
        data, test_pipeline, model, device, average_size, label, \
        result_queue, drawing_fps, inference_fps
    args = parse_args()
    average_size = args.average_size
    threshold = args.threshold
    drawing_fps = args.drawing_fps
    inference_fps = args.inference_fps

    device = torch.device(args.device)

    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    model = init_recognizer(cfg, args.checkpoint, device=device)
    camera = cv2.VideoCapture(args.camera_id)
    data = dict(img_shape=None, modality='RGB', label=-1)

    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.data.test.pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0

    try:
        frame_queue = deque(maxlen=sample_length)
        result_queue = deque(maxlen=1)
        pw = Thread(target=show_results, args=(), daemon=True)
        pr = Thread(target=inference, args=(), daemon=True)
        pw.start()
        pr.start()
        pw.join()
    except KeyboardInterrupt:
        pass

def main():
    global frame_queue, camera, frame, results, threshold, sample_length, \
        data, test_pipeline, model, device, average_size, label, result_queue
    args = parse_args()
    average_size = args.average_size
    threshold = args.threshold
    device = torch.device(args.device)
    model = init_recognizer(args.config, args.checkpoint, device=device)
    camera = cv2.VideoCapture(args.camera_id)
    # camera = cv2.VideoCapture('/home/ww/tools/image/office/2020-12-10_14-54-03.mp4')
    camera.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    camera.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    data = dict(img_shape=None, modality='RGB', label=-1)

    with open(args.label, 'r') as f:
        label = [line.strip() for line in f]

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if 'SampleFrames' in step['type']:
            sample_length = step['clip_len'] * step['num_clips']
            data['num_clips'] = step['num_clips']
            data['clip_len'] = step['clip_len']
            pipeline_.remove(step)
        if step['type'] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)

    assert sample_length > 0

    try:
        frame_queue = deque(maxlen=sample_length)
        result_queue = deque(maxlen=1)
        pw = Thread(target=show_results, args=(), daemon=True)
        pr = Thread(target=inference, args=(), daemon=True)
        pw.start()
        pr.start()
        pw.join()
    except KeyboardInterrupt:
        pass

def test_inference_recognizer():
    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    model = init_recognizer(config_file, None, device)

    for ops in model.cfg.data.test.pipeline:
        if ops['type'] == 'TenCrop':
            # Use CenterCrop to reduce memory in order to pass CI
            ops['type'] = 'CenterCrop'

    top5_label = inference_recognizer(model, video_path, label_path)
    scores = [item[1] for item in top5_label]
    assert len(top5_label) == 5
    assert scores == sorted(scores, reverse=True)

def main():
    args = parse_args()

    device = torch.device(args.device)

    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    model = init_recognizer(cfg, args.checkpoint, device=device)

    if not args.audio.endswith('.npy'):
        raise NotImplementedError('Demo works on extracted audio features')
    results = inference_recognizer(model, args.audio)

    labels = open(args.label).readlines()
    labels = [x.strip() for x in labels]
    results = [(labels[k[0]], k[1]) for k in results]

    print('Scores:')
    for result in results:
        print(f'{result[0]}: ', result[1])

def main():
    args = parse_args()

    # assign the desired device.
    device = torch.device(args.device)

    cfg = Config.fromfile(args.config)
    cfg.merge_from_dict(args.cfg_options)

    # build the recognizer from a config file and checkpoint file/url
    model = init_recognizer(
        cfg, args.checkpoint, device=device, use_frames=args.use_frames)

    inputs = build_inputs(model, args.video, use_frames=args.use_frames)
    gradcam = GradCAM(model, args.target_layer_name)
    results = gradcam(inputs)

    if args.out_filename is not None:
        try:
            from moviepy.editor import ImageSequenceClip
        except ImportError:
            raise ImportError('Please install moviepy to enable output file.')

        # frames_batches shape [B, T, H, W, 3], in RGB order
        frames_batches = (results[0] * 255.).numpy().astype(np.uint8)
        frames = frames_batches.reshape(-1, *frames_batches.shape[-3:])

        frame_list = list(frames)
        frame_list = _resize_frames(
            frame_list,
            args.target_resolution,
            interpolation=args.resize_algorithm)

        video_clips = ImageSequenceClip(frame_list, fps=args.fps)
        out_type = osp.splitext(args.out_filename)[1][1:]
        if out_type == 'gif':
            video_clips.write_gif(args.out_filename)
        else:
            video_clips.write_videofile(args.out_filename, remove_temp=True)

def initialize(self, context):
    properties = context.system_properties
    self.map_location = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(
        self.map_location + ':' + str(properties.get('gpu_id'))
        if torch.cuda.is_available() else self.map_location)
    self.manifest = context.manifest

    model_dir = properties.get('model_dir')
    serialized_file = self.manifest['model']['serializedFile']
    checkpoint = os.path.join(model_dir, serialized_file)
    self.config_file = os.path.join(model_dir, 'config.py')
    mapping_file_path = osp.join(model_dir, 'label_map.txt')
    if not os.path.isfile(mapping_file_path):
        warnings.warn('Missing the label_map.txt file. '
                      'Inference output will not include class name.')
        self.mapping = None
    else:
        lines = open(mapping_file_path).readlines()
        self.mapping = [x.strip() for x in lines]

    self.model = init_recognizer(self.config_file, checkpoint, self.device)
    self.initialized = True

parser.add_argument('--device', type=str, default='cuda:0',
                    help='CPU/CUDA device option')
parser.add_argument('--video', help='video file/url')
parser.add_argument('--labels', help='dataset labels')
args = parser.parse_args()

# config file
config_file = args.config
# download the checkpoint from model zoo and put it in `checkpoints/`
checkpoint_file = args.checkpoint

# assign the desired device.
device = args.device  # 'cuda:0' or 'cpu'
device = torch.device(device)

# build the model from a config file and a checkpoint file
model = init_recognizer(config_file, checkpoint_file, device=device)

# test a single video and show the result:
video = args.video
labels = args.labels
results = inference_recognizer(model, video, labels)

# show the results
print('The top-5 labels with corresponding scores are:')
for result in results:
    print(f'{result[0]}: ', result[1])

import torch
from mmaction.apis import init_recognizer, inference_recognizer

config_file = 'configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py'
device = 'cuda:0'  # or 'cpu'
device = torch.device(device)

model = init_recognizer(config_file, device=device)

# inference the demo video
inference_recognizer(model, 'demo/demo.mp4', 'demo/label_map_k400.txt')

def main():
    args = parse_args()
    # assign the desired device.
    device = torch.device(args.device)

    # build the recognizer from a config file and checkpoint file/url
    model = init_recognizer(
        args.config, args.checkpoint, device=device, use_frames=args.use_frames)

    # e.g. use ('backbone', ) to return backbone feature
    output_layer_names = None

    # test a single video or rawframes of a single video
    if args.split_time is None:
        if output_layer_names:
            results, returned_feature = inference_recognizer(
                model,
                args.video,
                args.label,
                use_frames=args.use_frames,
                outputs=output_layer_names)
        else:
            results = inference_recognizer(
                model, args.video, args.label, use_frames=args.use_frames)

        print('The top-5 labels with corresponding scores are:')
        for result in results:
            print(f'{result[0]}: ', result[1])

        if args.out_filename is not None:

            if args.target_resolution is not None:
                if args.target_resolution[0] == -1:
                    args.target_resolution[0] = None
                if args.target_resolution[1] == -1:
                    args.target_resolution[1] = None
                args.target_resolution = tuple(args.target_resolution)
            else:
                args.target_resolution = (None, None)

            label_show = ''
            for result in results:
                label_show = label_show + result[0] + ': {:.2g}'.format(
                    result[1]) + '\n'

            get_output(
                args.video,
                args.out_filename,
                label_show[:-1],
                fps=args.fps,
                font_size=args.font_size,
                font_color=args.font_color,
                target_resolution=args.target_resolution,
                resize_algorithm=args.resize_algorithm,
                use_frames=args.use_frames)

    if args.split_time is not None:
        # https://stackoverflow.com/questions/28884159/using-python-script-to-cut-long-videos-into-chunks-in-ffmpeg
        # https://nico-lab.net/segment_muxer_with_ffmpeg/
        import re
        import math

        length_regexp = r'Duration: (\d{2}):(\d{2}):(\d{2})\.\d+,'
        re_length = re.compile(length_regexp)

        from subprocess import check_call, PIPE, Popen
        import shlex
        import os

        if args.split_time <= 0:
            print("Split length can't be 0")
            raise SystemExit

        # read the video duration from ffmpeg's stderr output
        p1 = Popen(["ffmpeg", "-i", args.video],
                   stdout=PIPE,
                   stderr=PIPE,
                   universal_newlines=True)
        # get p1.stderr as input
        output = Popen(["grep", 'Duration'],
                       stdin=p1.stderr,
                       stdout=PIPE,
                       universal_newlines=True)
        p1.stdout.close()
        matches = re_length.search(output.stdout.read())
        if matches:
            video_length = int(matches.group(1)) * 3600 + \
                int(matches.group(2)) * 60 + \
                int(matches.group(3))
            print("Video length in seconds: {}".format(video_length))
        else:
            print("Can't determine video length.")
            raise SystemExit

        split_count = math.ceil(video_length / args.split_time)
        if split_count == 1:
            print("Video length is less than the target split length.")
            raise SystemExit

        fname = os.path.basename(args.video)
        dirname = os.path.dirname(args.video)
        fname_base, ext = fname.rsplit(".", 1)
        tmp_path = os.path.join(dirname, 'tmpdir')
        dummy_filenames = []
        if not os.path.isdir(tmp_path):
            os.makedirs(tmp_path)

        # split the input video into fixed-length segments with ffmpeg
        cmd = "ffmpeg -i {} -map 0 -c copy -flags +global_header -f segment " \
              "-segment_time {} -y -segment_list {} " \
              "-segment_format_options movflags=+faststart " \
              "-reset_timestamps 1 {}-%02d.{}".format(
                  args.video, args.split_time,
                  os.path.join(tmp_path, 'list_gen.txt'),
                  os.path.join(tmp_path, fname_base), ext)
        print("About to run: {}".format(cmd))
        check_call(shlex.split(cmd), universal_newlines=True)

        with open(os.path.join(tmp_path, 'list_gen.txt'), 'r') as tmp_file:
            lines = tmp_file.readlines()
            for line in lines:
                dummy_filenames.append(
                    os.path.join(tmp_path, line.replace('\n', '')))

        import pandas as pd
        with open(args.label, 'r') as f:
            label = [line.strip() for line in f]
        list_df = pd.DataFrame(
            columns=label, index=range(len(dummy_filenames)))

        # run inference on every segment and collect the cls_head scores
        for i, video_block in enumerate(dummy_filenames):
            video_block_out = os.path.join(
                os.path.dirname(video_block),
                'out_' + os.path.basename(video_block))
            output_layer_names = ('cls_head', )
            if output_layer_names:
                results, returned_feature = inference_recognizer(
                    model,
                    video_block,
                    args.label,
                    use_frames=args.use_frames,
                    outputs=output_layer_names)
                ret_feature = returned_feature['cls_head'].cpu().detach().numpy()
                list_df.iloc[i, :] = ret_feature[0, :len(label)]
            else:
                results = inference_recognizer(
                    model, video_block, args.label, use_frames=args.use_frames)

            if args.out_filename is not None:

                if args.target_resolution is not None:
                    if args.target_resolution[0] == -1:
                        args.target_resolution[0] = None
                    if args.target_resolution[1] == -1:
                        args.target_resolution[1] = None
                    args.target_resolution = tuple(args.target_resolution)
                else:
                    args.target_resolution = (None, None)

                print('The top-5 labels with corresponding scores are:')
                for result in results:
                    print(f'{result[0]}: ', result[1])

                label_show = ''
                for result in results:
                    label_show = label_show + result[0] + ': {:.2g}'.format(
                        result[1]) + '\n'

                get_output(
                    video_path=video_block,
                    out_filename=video_block_out,
                    label=label_show[:-1],
                    fps=args.fps,
                    font_size=args.font_size,
                    font_color=args.font_color,
                    target_resolution=args.target_resolution,
                    resize_algorithm=args.resize_algorithm,
                    use_frames=args.use_frames)

        # concatenate the per-segment output files
        with open(os.path.join(tmp_path, 'list.txt'), 'w') as tmp_file:
            for video_block in dummy_filenames:
                tmp_file.write(
                    "file " + 'out_' + os.path.basename(video_block) + "\n")

        cmd = "ffmpeg -f concat -i {} -c copy -y {}".format(
            os.path.join(tmp_path, 'list.txt'), args.out_filename)
        print("About to run: {}".format(cmd))
        check_call(shlex.split(cmd), universal_newlines=True)

        import shutil
        shutil.rmtree(tmp_path)

        # plot and save the per-segment class scores
        import matplotlib
        import matplotlib.pyplot as plt
        plt.figure()
        list_df.plot(y=label)
        fig_outdir = os.path.dirname(args.out_filename)
        fig_outname = os.path.basename(args.out_filename)
        fig_outname = fig_outname.rsplit(".", 1)[0]
        plt.savefig(os.path.join(fig_outdir, fig_outname + '.png'))
        plt.close('all')
        list_df.to_csv(
            os.path.join(fig_outdir, fig_outname + '.csv'), index=False)

def test_video_inference_recognizer():
    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    model = init_recognizer(video_config_file, None, device)

    with pytest.raises(RuntimeError):
        # video path doesn't exist
        inference_recognizer(model, 'missing.mp4')

    for ops in model.cfg.data.test.pipeline:
        if ops['type'] in ('TenCrop', 'ThreeCrop'):
            # Use CenterCrop to reduce memory in order to pass CI
            ops['type'] = 'CenterCrop'

    top5_label = inference_recognizer(model, video_path)
    scores = [item[1] for item in top5_label]
    assert len(top5_label) == 5
    assert scores == sorted(scores, reverse=True)

    _, feat = inference_recognizer(
        model,
        video_path,
        outputs=('backbone', 'cls_head'),
        as_tensor=False)
    assert isinstance(feat, dict)
    assert 'backbone' in feat and 'cls_head' in feat
    assert isinstance(feat['backbone'], np.ndarray)
    assert isinstance(feat['cls_head'], np.ndarray)
    assert feat['backbone'].shape == (25, 2048, 7, 7)
    assert feat['cls_head'].shape == (1, 400)

    _, feat = inference_recognizer(
        model,
        video_path,
        outputs=('backbone.layer3', 'backbone.layer3.1.conv1'))
    assert 'backbone.layer3.1.conv1' in feat and 'backbone.layer3' in feat
    assert isinstance(feat['backbone.layer3.1.conv1'], torch.Tensor)
    assert isinstance(feat['backbone.layer3'], torch.Tensor)
    assert feat['backbone.layer3'].size() == (25, 1024, 14, 14)
    assert feat['backbone.layer3.1.conv1'].size() == (25, 256, 14, 14)

    cfg_file = 'configs/recognition/slowfast/slowfast_r50_video_inference_4x16x1_256e_kinetics400_rgb.py'  # noqa: E501
    sf_model = init_recognizer(cfg_file, None, device)
    for ops in sf_model.cfg.data.test.pipeline:
        # Changes to reduce memory in order to pass CI
        if ops['type'] in ('TenCrop', 'ThreeCrop'):
            ops['type'] = 'CenterCrop'
        if ops['type'] == 'SampleFrames':
            ops['num_clips'] = 1

    _, feat = inference_recognizer(
        sf_model, video_path, outputs=('backbone', 'cls_head'))
    assert isinstance(feat, dict) and isinstance(feat['backbone'], tuple)
    assert 'backbone' in feat and 'cls_head' in feat
    assert len(feat['backbone']) == 2
    assert isinstance(feat['backbone'][0], torch.Tensor)
    assert isinstance(feat['backbone'][1], torch.Tensor)
    assert feat['backbone'][0].size() == (1, 2048, 4, 8, 8)
    assert feat['backbone'][1].size() == (1, 256, 32, 8, 8)
    assert feat['cls_head'].size() == (1, 400)

import argparse
import os

import torch

from mmaction.apis import init_recognizer, inference_recognizer

parser = argparse.ArgumentParser(description="parsing...")
parser.add_argument("--root", type=str,
                    default="/home/administrator/Z/Algorithms/mmaction2/",
                    help="mmaction2 root")
args = parser.parse_args()

config_file = os.path.join(
    args.root,
    'configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py')
# download the checkpoint from model zoo and put it in `checkpoints/`
checkpoint_file = os.path.join(
    args.root,
    'checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth')

# assign the desired device.
device = 'cuda:0'  # or 'cpu'
device = torch.device(device)

# build the model from a config file and a checkpoint file
model = init_recognizer(config_file, checkpoint_file, device=device,
                        use_frames=True)

# test a single video and show the result:
video = os.path.join(args.root, 'data/kinetics400/rawframes_video/...')
labels = os.path.join(args.root, 'demo/label_map_k400.txt')
results = inference_recognizer(model, video, labels, use_frames=True)

# show the results
print('The top-5 labels with corresponding scores are:')
for result in results:
    print(f'{result[0]}: ', result[1])

from mmaction.apis import inference_recognizer, init_recognizer
import os

# Choose to use a config and initialize the recognizer
config = '/home/workspace/2021_capstone/mmaction2/configs/recognition/slowfast/custom.py'
# Setup a checkpoint file to load
checkpoint = '/home/workspace/2021_capstone/mmaction2/data_center/assult/best_top1_acc_epoch_185.pth'
# Initialize the recognizer
model = init_recognizer(config, checkpoint, device='cuda:0')

# path_dir = "../2021_capstone/mmaction2/data_center/fight_assault"
path_dir = '.'

normal_path = path_dir + "/test_normal"
kicking_path = path_dir + "/test_kicking"
punching_path = path_dir + "/test_punching"

normal_file_list = os.listdir(normal_path)
kicking_file_list = os.listdir(kicking_path)
punching_file_list = os.listdir(punching_path)

label = '/home/workspace/2021_capstone/mmaction2/demo/custom_map.txt'

dir = [kicking_file_list, normal_file_list, punching_file_list]

total_testSet = 0
for i in dir:
    total_testSet += len(i)

kicking_cnt = 0
punching_cnt = 0
normal_cnt = 0
iter = 0

for i in dir:

def main():
    args = parse_args()

    device = torch.device(args.device)

    use_frames = False
    if args.use_frames == "False":
        use_frames = False
    if args.use_frames == "True":
        use_frames = True

    model = init_recognizer(args.config, device=device, use_frames=use_frames)

    # Target FPGA: Zynq UltraScale+ MPSoC ZCU104. Assuming a clock frequency of 100 MHz.
    # The actual BRAM size is 11 Mbits (1.375 MBytes). Divided by the 18 Kbits size of
    # each BRAM, this gives a total of 624 BRAM units.
    # The ZCU104 also has 27 Mbits (3.375 MBytes) of URAM. Divided by the 288 Kbits size
    # of each URAM, this gives a total of 96 URAM units.
    # The ZCU104 has 20 GTH gigabit transceivers (16.3 Gb/s or 2.03 GB/s) on the PL side.
    feature_maps = ModelFeatureMaps(
        model=model, word_length=16, clock_freq=100, bram=624, dsp=1728)
    feature_maps.get_inter_feature_maps()

    random_img = np.random.randn(args.imshape[0], args.imshape[1],
                                 args.imshape[2])

    data = dict(img_shape=None, modality="RGB", label=-1)

    # prepare test pipeline from non-camera pipeline
    cfg = model.cfg
    sample_length = 0
    pipeline = cfg.test_pipeline
    pipeline_ = pipeline.copy()
    for step in pipeline:
        if "SampleFrames" in step["type"]:
            step["num_clips"] = 1
            sample_length = step["clip_len"] * step["num_clips"]
            data["num_clips"] = step["num_clips"]
            data["clip_len"] = step["clip_len"]
            pipeline_.remove(step)
        if step["type"] in EXCLUED_STEPS:
            # remove step to decode frames
            pipeline_.remove(step)
    test_pipeline = Compose(pipeline_)
    print(test_pipeline)
    assert sample_length > 0

    data_in = []
    for _ in range(data["clip_len"]):
        data_in.append(random_img)
    data["imgs"] = data_in
    if data["img_shape"] is None:
        data["img_shape"] = random_img.shape[:2]

    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        data = scatter(data, [device])[0]

    with torch.no_grad():
        scores = model(return_loss=False, **data)[0]

    feature_maps.get_info()
    feature_maps.get_conv_layers(file_name=args.model_name)

def StrConverter(filename: str) -> dict:
    key = ['year', 'month', 'day', 'hour', 'min', 'sec']
    filename = filename.replace(".", ' ')
    filename = filename.replace(":", ' ')
    data = {i: j for i, j in list(zip(key, filename.split()))}
    return data


receive_path = 'receive_video/'
path_dir = 'video_for_process/'
db = "assult_candidate/"

device = torch.device("cuda")

# build the recognizer from a config file and checkpoint file/url
config = "../configs/recognition/slowfast/custom.py"
checkpoint = "../data_center/fight_assault/BinaryDataTree/tanos_lr_improve_checkpoints/epoch_70.pth"
model = init_recognizer(
    config,
    checkpoint,
    device=device,
)
label = "../demo/custom_map.txt"

while True:
    if not os.listdir(receive_path):
        continue
    os.system(f"mv {receive_path}* {path_dir}")
    file_list = os.listdir(path_dir)
    file_list.sort()  # sort in chronological order
    for i in file_list:
        results = inference_recognizer(model, path_dir + i, label)
        if results[0][0] == "abnormal" and results[0][1] > 0.86:
            '''
            Violence detected: the clip must be sent to the DB,
            and since processing is finished, pop it from the directory.
            '''

def main():
    args = parse_args()

    frame_paths, original_frames = frame_extraction(args.video,
                                                    args.short_side)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    config.merge_from_dict(args.cfg_options)

    model = init_recognizer(config, args.checkpoint, args.device)

    # Load label_map
    label_map = [x.strip() for x in open(args.label_map).readlines()]

    # Get Human detection results
    det_results = detection_inference(args, frame_paths)
    torch.cuda.empty_cache()

    pose_results = pose_inference(args, frame_paths, det_results)
    torch.cuda.empty_cache()

    fake_anno = dict(
        frame_dir='',
        label=-1,
        img_shape=(h, w),
        original_shape=(h, w),
        start_index=0,
        modality='Pose',
        total_frames=num_frame)
    num_person = max([len(x) for x in pose_results])
    # Current PoseC3D models are trained on COCO-keypoints (17 keypoints)
    num_keypoint = 17
    keypoint = np.zeros((num_person, num_frame, num_keypoint, 2),
                        dtype=np.float16)
    keypoint_score = np.zeros((num_person, num_frame, num_keypoint),
                              dtype=np.float16)
    for i, poses in enumerate(pose_results):
        for j, pose in enumerate(poses):
            pose = pose['keypoints']
            keypoint[j, i] = pose[:, :2]
            keypoint_score[j, i] = pose[:, 2]
    fake_anno['keypoint'] = keypoint
    fake_anno['keypoint_score'] = keypoint_score

    results = inference_recognizer(model, fake_anno)

    action_label = label_map[results[0][0]]

    pose_model = init_pose_model(args.pose_config, args.pose_checkpoint,
                                 args.device)
    vis_frames = [
        vis_pose_result(pose_model, frame_paths[i], pose_results[i])
        for i in range(num_frame)
    ]
    for frame in vis_frames:
        cv2.putText(frame, action_label, (10, 30), FONTFACE, FONTSCALE,
                    FONTCOLOR, THICKNESS, LINETYPE)

    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], fps=24)
    vid.write_videofile(args.out_filename, remove_temp=True)

    tmp_frame_dir = osp.dirname(frame_paths[0])
    shutil.rmtree(tmp_frame_dir)