def test_ava_detector():
    config = get_detector_cfg(
        'ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb.py')
    detector = build_detector(config.model)

    if torch.__version__ == 'parrots':
        if torch.cuda.is_available():
            train_demo_inputs = generate_detector_demo_inputs(
                train=True, device='cuda')
            test_demo_inputs = generate_detector_demo_inputs(
                train=False, device='cuda')
            detector = detector.cuda()

            losses = detector(**train_demo_inputs)
            assert isinstance(losses, dict)

            # Test forward test
            with torch.no_grad():
                _ = detector(**test_demo_inputs, return_loss=False)
    else:
        train_demo_inputs = generate_detector_demo_inputs(train=True)
        test_demo_inputs = generate_detector_demo_inputs(train=False)

        losses = detector(**train_demo_inputs)
        assert isinstance(losses, dict)

        # Test forward test
        with torch.no_grad():
            _ = detector(**test_demo_inputs, return_loss=False)
def __init__(self,
             config_path,
             checkpoint,
             score_thr=0.5,
             label_dict=None,
             device='cuda:0'):
    self.score_thr = score_thr
    self.label_dict = label_dict
    self.device = device

    config = mmcv.Config.fromfile(config_path)
    self.img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in self.img_norm_cfg and 'to_bgr' in self.img_norm_cfg:
        to_bgr = self.img_norm_cfg.pop('to_bgr')
        self.img_norm_cfg['to_rgb'] = to_bgr
    self.img_norm_cfg['mean'] = np.array(self.img_norm_cfg['mean'])
    self.img_norm_cfg['std'] = np.array(self.img_norm_cfg['std'])

    # Get clip_len, frame_interval and calculate center index of each clip
    val_pipeline = config['val_pipeline']
    sampler = [x for x in val_pipeline if x['type'] == 'SampleVia3Frames'][0]
    self.clip_len, self.frame_interval = sampler['clip_len'], sampler[
        'frame_interval']
    self.window_size = self.clip_len * self.frame_interval
    assert self.clip_len % 2 == 0, 'We would like to have an even clip_len'

    config.model.backbone.pretrained = None
    self.model = build_detector(config.model, test_cfg=config.get('test_cfg'))
    load_checkpoint(self.model, checkpoint, map_location=self.device)
    self.model.to(self.device)
    self.model.eval()
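# The same legacy-key handling for `img_norm_cfg` recurs in several of the
# scripts below. A minimal, self-contained sketch of that `to_bgr` -> `to_rgb`
# migration, using illustrative mean/std values (not taken from any real
# config shipped with this project):
import numpy as np

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],  # illustrative values only
    std=[58.395, 57.12, 57.375],
    to_bgr=False)
if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
    # older configs use `to_bgr`; rename it so imnormalize_ accepts the dict
    img_norm_cfg['to_rgb'] = img_norm_cfg.pop('to_bgr')
img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
img_norm_cfg['std'] = np.array(img_norm_cfg['std'])
assert 'to_bgr' not in img_norm_cfg and img_norm_cfg['to_rgb'] is False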
def __init__(self, config, checkpoint, device, score_thr, label_map_path):
    self.score_thr = score_thr

    # load model
    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))
    load_checkpoint(model, checkpoint, map_location=device)
    model.to(device)
    model.eval()
    self.model = model
    self.device = device

    # init label map, aka class_id to class_name dict
    with open(label_map_path) as f:
        lines = f.readlines()
    lines = [x.strip().split(': ') for x in lines]
    self.label_map = {int(x[0]): x[1] for x in lines}
    try:
        if config['data']['train']['custom_classes'] is not None:
            self.label_map = {
                id + 1: self.label_map[cls]
                for id, cls in enumerate(config['data']['train']
                                         ['custom_classes'])
            }
    except KeyError:
        pass
def main():
    global args
    args = parse_args()

    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.data.test.test_mode = True

    dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True))
    if args.out is None or not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    if osp.exists(args.out):
        # reuse previously dumped results instead of running inference again
        outputs = mmcv.load(args.out)
    else:
        if args.launcher == 'none':
            raise NotImplementedError(
                'By default, distributed testing is used; the launcher '
                'should be "pytorch".')
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

        model = build_detector(cfg.model, train_cfg=None,
                               test_cfg=cfg.test_cfg)
        data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=1,
            dist=distributed,
            shuffle=False)
        load_checkpoint(model, args.checkpoint, map_location='cpu')
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
        outputs = multiple_test(model, data_loader)

    rank, _ = get_dist_info()
    if rank == 0:
        print('writing results to {}'.format(args.out))
        mmcv.dump(outputs, args.out)
        eval_type = args.eval
        if eval_type:
            print('Starting evaluation of {}'.format(eval_type))
            result_file = args.out + '.csv'
            results2csv(dataset, outputs, result_file)
            ava_eval(result_file, eval_type, args.label_file, args.ann_file,
                     args.exclude_file)
def main():
    args = parse_args()

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.data.test.test_mode = True

    dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True))
    if args.gpus == 1:
        # single-GPU testing
        model = build_detector(cfg.model, train_cfg=None,
                               test_cfg=cfg.test_cfg)
        load_checkpoint(model, args.checkpoint, strict=True)
        model = MMDataParallel(model, device_ids=[0])

        data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            num_gpus=1,
            dist=False,
            shuffle=False)
        outputs = single_test(model, data_loader)
    else:
        # multi-GPU testing with one worker process per GPU
        model_args = cfg.model.copy()
        model_args.update(train_cfg=None, test_cfg=cfg.test_cfg)
        model_type = getattr(detectors, model_args.pop('type'))
        outputs = parallel_test(
            model_type,
            model_args,
            args.checkpoint,
            dataset,
            _data_func,
            range(args.gpus),
            workers_per_gpu=args.proc_per_gpu)

    if args.out:
        print('writing results to {}'.format(args.out))
        mmcv.dump(outputs, args.out)
        eval_type = args.eval
        if eval_type:
            print('Starting evaluation of {}'.format(eval_type))
            result_file = args.out + '.csv'
            results2csv(dataset, outputs, result_file)
            ava_eval(result_file, eval_type, args.label_file, args.ann_file,
                     args.exclude_file)
def __init__(self, config, checkpoint, device, score_thr, label_map_path):
    self.score_thr = score_thr

    # load model
    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))
    load_checkpoint(model, checkpoint, map_location=device)
    model.to(device)
    model.eval()
    self.model = model
    self.device = device

    # init label map, aka class_id to class_name dict
    with open(label_map_path) as f:
        lines = f.readlines()
    lines = [x.strip().split(': ') for x in lines]
    self.label_map = {int(x[0]): x[1] for x in lines}
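# For reference, the parsing above implies a plain-text label map with one
# "id: name" pair per line (AVA-style). A small, self-contained check of that
# parsing, with illustrative entries:
sample_lines = [
    '1: bend/bow (at the waist)\n',
    '3: crouch/kneel\n',
    '4: dance\n',
]
pairs = [x.strip().split(': ') for x in sample_lines]
label_map = {int(x[0]): x[1] for x in pairs}
assert label_map[4] == 'dance'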
def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # update configs according to CLI args
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus
    if cfg.checkpoint_config is not None:
        # save mmaction version in checkpoints as meta data
        cfg.checkpoint_config.meta = dict(
            mmact_version=__version__, config=cfg.text)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # init logger before other steps
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    model = build_detector(
        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)

    train_dataset = get_trimmed_dataset(cfg.data.train)
    train_network(
        model,
        train_dataset,
        cfg,
        distributed=distributed,
        validate=args.validate,
        logger=logger)
def main():
    args = parse_args()

    frame_paths, original_frames = frame_extraction(args.video)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
    frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    config.merge_from_dict(args.cfg_options)
    val_pipeline = config.data.val.pipeline

    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Load label_map
    label_map = load_label_map(args.label_map)
    try:
        if config['data']['train']['custom_classes'] is not None:
            label_map = {
                id + 1: label_map[cls]
                for id, cls in enumerate(config['data']['train']
                                         ['custom_classes'])
            }
    except KeyError:
        pass

    # Get Human detection results
    center_frames = [frame_paths[ind - 1] for ind in timestamps]
    human_detections = detection_inference(args, center_frames)
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    # Get img_norm_cfg
    img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    try:
        # In our spatiotemporal detection demo, different actions should have
        # the same number of bboxes.
        config['model']['test_cfg']['rcnn']['action_thr'] = .0
    except KeyError:
        pass

    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))

    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    assert len(timestamps) == len(human_detections)
    prog_bar = mmcv.ProgressBar(len(timestamps))
    for timestamp, proposal in zip(timestamps, human_detections):
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = model(
                return_loss=False,
                img=[input_tensor],
                img_metas=[[dict(img_shape=(new_h, new_w))]],
                proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(len(result)):
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1],
                                              result[i][j, 4]))
            predictions.append(prediction)
        prog_bar.update()

    results = []
    for human_detection, prediction in zip(human_detections, predictions):
        results.append(pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        # use builtin int: np.int was removed in recent NumPy releases
        return new_frame_inds.astype(int)

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    frames = [
        cv2.imread(frame_paths[i - 1])
        for i in dense_timestamps(timestamps, dense_n)
    ]
    print('Performing visualization')
    vis_frames = visualize(frames, results)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)

    tmp_frame_dir = osp.dirname(frame_paths[0])
    shutil.rmtree(tmp_frame_dir)
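# A quick, self-contained sanity check of the dense_timestamps logic above,
# assuming predict_stepsize=8 and output_stepsize=4 (so dense_n = 2); the
# timestamp values are illustrative only.
import numpy as np

timestamps = np.array([4, 12, 20])
n = 2
old_frame_interval = timestamps[1] - timestamps[0]            # 8
start = timestamps[0] - old_frame_interval / n * (n - 1) / 2  # 2.0
new_frame_inds = np.arange(
    len(timestamps) * n) * old_frame_interval / n + start
print(new_frame_inds.astype(int))  # [ 2  6 10 14 18 22]
# i.e. each predicted timestamp is replaced by two evenly spaced output frames.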
def main():
    args = parse_args()

    # frame_paths, original_frames = frame_extraction(args.video)
    # folder path
    video_path = args.video
    frame_paths = sorted(
        [osp.join(video_path, x) for x in os.listdir(video_path)])
    num_frame = len(frame_paths)
    # h, w, _ = original_frames[0].shape
    frame = cv2.imread(frame_paths[0])
    h, w, _ = frame.shape

    # Load label_map
    label_map = load_label_map(args.label_map)

    # resize frames to shortside 256
    new_w, new_h = mmcv.rescale_size((w, h), (256, np.Inf))
    # frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmcv.Config.fromfile(args.config)
    val_pipeline = config['val_pipeline']
    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    # if num_frame < clip_len * frame_interval:
    #     frame_interval = max(int(num_frame / clip_len) - 1, 0)
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Get Human detection results
    center_frames = [frame_paths[ind - 1] for ind in timestamps]
    human_detections = detection_inference(args, center_frames)
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    # Get img_norm_cfg
    img_norm_cfg = config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    config.model.backbone.pretrained = None
    model = build_detector(config.model, test_cfg=config.get('test_cfg'))
    load_checkpoint(model, args.checkpoint, map_location=args.device)
    model.to(args.device)
    model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    for timestamp, proposal in tqdm(zip(timestamps, human_detections)):
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        # read and resize only the frames needed for this clip
        imgs = [
            mmcv.imresize(cv2.imread(frame_paths[ind]),
                          (new_w, new_h)).astype(np.float32)
            for ind in frame_inds
        ]
        # imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = model(
                return_loss=False,
                img=[input_tensor],
                img_metas=[[dict(img_shape=(new_h, new_w))]],
                proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(len(result)):
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1],
                                              result[i][j, 4]))
            predictions.append(prediction)

    results = []
    for human_detection, prediction in zip(human_detections, predictions):
        results.append(pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        # use builtin int: np.int was removed in recent NumPy releases
        return new_frame_inds.astype(int)

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    frames = [
        cv2.imread(frame_paths[i - 1])
        for i in dense_timestamps(timestamps, dense_n)
    ]
    print('Performing visualization')
    vis_frames = visualize(frames, results)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)

    # save images
    target_dir = './tmp/test'
    os.makedirs(target_dir, exist_ok=True)
    frame_tmpl = osp.join(target_dir, 'img_%06d.jpg')
    vid.write_images_sequence(frame_tmpl, fps=args.output_fps)
def rgb_based_stdet(args, frames, label_map, human_detections, w, h, new_w,
                    new_h, w_ratio, h_ratio):

    rgb_stdet_config = mmcv.Config.fromfile(args.rgb_stdet_config)
    rgb_stdet_config.merge_from_dict(args.cfg_options)

    val_pipeline = rgb_stdet_config.data.val.pipeline
    sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'

    window_size = clip_len * frame_interval
    num_frame = len(frames)
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Get img_norm_cfg
    img_norm_cfg = rgb_stdet_config['img_norm_cfg']
    if 'to_rgb' not in img_norm_cfg and 'to_bgr' in img_norm_cfg:
        to_bgr = img_norm_cfg.pop('to_bgr')
        img_norm_cfg['to_rgb'] = to_bgr
    img_norm_cfg['mean'] = np.array(img_norm_cfg['mean'])
    img_norm_cfg['std'] = np.array(img_norm_cfg['std'])

    # Build STDET model
    try:
        # In our spatiotemporal detection demo, different actions should have
        # the same number of bboxes.
        rgb_stdet_config['model']['test_cfg']['rcnn']['action_thr'] = .0
    except KeyError:
        pass

    rgb_stdet_config.model.backbone.pretrained = None
    rgb_stdet_model = build_detector(
        rgb_stdet_config.model, test_cfg=rgb_stdet_config.get('test_cfg'))
    load_checkpoint(
        rgb_stdet_model, args.rgb_stdet_checkpoint, map_location='cpu')
    rgb_stdet_model.to(args.device)
    rgb_stdet_model.eval()

    predictions = []

    print('Performing SpatioTemporal Action Detection for each clip')
    prog_bar = mmcv.ProgressBar(len(timestamps))
    for timestamp in timestamps:
        proposal = human_detections[timestamp - 1]
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)

        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        with torch.no_grad():
            result = rgb_stdet_model(
                return_loss=False,
                img=[input_tensor],
                img_metas=[[dict(img_shape=(new_h, new_w))]],
                proposals=[[proposal]])
            result = result[0]
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])

            # Perform action score thr
            for i in range(len(result)):  # 80
                if i + 1 not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if result[i][j, 4] > args.action_score_thr:
                        prediction[j].append((label_map[i + 1],
                                              result[i][j, 4]))
            predictions.append(prediction)
        prog_bar.update()

    return timestamps, predictions
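# A small, self-contained sketch of the clip-index arithmetic shared by the
# detection loops above, assuming clip_len=8 and frame_interval=8 (illustrative
# values in the style of a SlowOnly 8x8 config; not taken from a real run).
import numpy as np

clip_len, frame_interval = 8, 8
window_size = clip_len * frame_interval                          # 64
timestamp = 100                                                  # 1-based centre frame
start_frame = timestamp - (clip_len // 2 - 1) * frame_interval   # 76
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
frame_inds = list(frame_inds - 1)                                # 0-based indices
print(frame_inds)  # [75, 83, 91, 99, 107, 115, 123, 131]
# i.e. clip_len frames spaced frame_interval apart, roughly centred on the timestamp.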