def main(): args = build_argparser().parse_args() log.info('Initializing Inference Engine...') ie = IECore() config_user_specified, config_min_latency = get_plugin_configs( args.device, args.num_streams, args.num_threads) labels_map = None if args.labels: with open(args.labels, 'r') as f: labels_map = [x.strip() for x in f] log.info('Loading network...') completed_request_results = {} modes = cycle(Modes) prev_mode = mode = next(modes) log.info('Using {} mode'.format(mode.name)) mode_info = {mode: ModeInfo()} exceptions = [] detectors = { Modes.USER_SPECIFIED: Detector(ie, args.model, device=args.device, plugin_config=config_user_specified, results=completed_request_results, max_num_requests=args.num_infer_requests, labels_map=labels_map, keep_aspect_ratio_resize=args.keep_aspect_ratio, caught_exceptions=exceptions), Modes.MIN_LATENCY: Detector(ie, args.model, device=args.device.split(':')[-1].split(',')[0], plugin_config=config_min_latency, results=completed_request_results, max_num_requests=1, labels_map=labels_map, keep_aspect_ratio_resize=args.keep_aspect_ratio, caught_exceptions=exceptions) } try: input_stream = int(args.input) except ValueError: input_stream = args.input cap = cv2.VideoCapture(input_stream) wait_key_time = 1 next_frame_id = 0 next_frame_id_to_show = 0 input_repeats = 0 log.info('Starting inference...') print( "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key" ) print( "To switch between min_latency/user_specified modes, press TAB key in the output window" ) palette = ColorPalette(len(labels_map) if labels_map is not None else 100) presenter = monitors.Presenter( args.utilization_monitors, 55, (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH) / 4), round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) / 8))) while (cap.isOpened() \ or completed_request_results \ or len(detectors[mode].empty_requests) < len(detectors[mode].requests)) \ and not exceptions: if next_frame_id_to_show in completed_request_results: frame_meta, raw_outputs = completed_request_results.pop( next_frame_id_to_show) objects = detectors[mode].postprocess(raw_outputs, frame_meta) frame = frame_meta['frame'] start_time = frame_meta['start_time'] if len(objects) and args.raw_output_message: log.info(' Class ID | Confidence | XMIN | YMIN | XMAX | YMAX ') origin_im_size = frame.shape[:-1] presenter.drawGraphs(frame) for obj in objects: if obj.score > args.prob_threshold: xmin = max(int(obj.xmin), 0) ymin = max(int(obj.ymin), 0) xmax = min(int(obj.xmax), origin_im_size[1]) ymax = min(int(obj.ymax), origin_im_size[0]) class_id = int(obj.class_id) color = palette[class_id] det_label = labels_map[class_id] if labels_map and len( labels_map) >= class_id else str(class_id) if args.raw_output_message: log.info('{:^9} | {:10f} | {:4} | {:4} | {:4} | {:4} '. format(det_label, obj.score, xmin, ymin, xmax, ymax)) cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color, 2) cv2.putText(frame, '#{} {:.1%}'.format(det_label, obj.score), (xmin, ymin - 7), cv2.FONT_HERSHEY_COMPLEX, 0.6, color, 1) mode_message = '{} mode'.format(mode.name) put_highlighted_text(frame, mode_message, (10, int(origin_im_size[0] - 20)), cv2.FONT_HERSHEY_COMPLEX, 0.75, (10, 10, 200), 2) next_frame_id_to_show += 1 if prev_mode == mode: mode_info[mode].frames_count += 1 elif len(completed_request_results) == 0: mode_info[prev_mode].last_end_time = perf_counter() prev_mode = mode # Frames count is always zero if mode has just been switched (i.e. prev_mode != mode). 
if mode_info[mode].frames_count != 0: fps_message = 'FPS: {:.1f}'.format(mode_info[mode].frames_count / \ (perf_counter() - mode_info[mode].last_start_time)) mode_info[mode].latency_sum += perf_counter() - start_time latency_message = 'Latency: {:.1f} ms'.format((mode_info[mode].latency_sum / \ mode_info[mode].frames_count) * 1e3) # Draw performance stats over frame. put_highlighted_text(frame, fps_message, (15, 20), cv2.FONT_HERSHEY_COMPLEX, 0.75, (200, 10, 10), 2) put_highlighted_text(frame, latency_message, (15, 50), cv2.FONT_HERSHEY_COMPLEX, 0.75, (200, 10, 10), 2) if not args.no_show: cv2.imshow('Detection Results', frame) key = cv2.waitKey(wait_key_time) ESC_KEY = 27 TAB_KEY = 9 # Quit. if key in {ord('q'), ord('Q'), ESC_KEY}: break # Switch mode. # Disable mode switch if the previous switch has not been finished yet. if key == TAB_KEY and mode_info[mode].frames_count > 0: mode = next(modes) detectors[prev_mode].await_all() mode_info[prev_mode].last_end_time = perf_counter() mode_info[mode] = ModeInfo() log.info('Using {} mode'.format(mode.name)) else: presenter.handleKey(key) elif detectors[mode].empty_requests and cap.isOpened(): start_time = perf_counter() ret, frame = cap.read() if not ret: if input_repeats < args.loop or args.loop < 0: cap.open(input_stream) input_repeats += 1 else: cap.release() continue detectors[mode](frame, next_frame_id, { 'frame': frame, 'start_time': start_time }) next_frame_id += 1 else: detectors[mode].await_any() if exceptions: raise exceptions[0] for exec_net in detectors.values(): exec_net.await_all() for mode_value, mode_stats in mode_info.items(): log.info('') log.info('Mode: {}'.format(mode_value.name)) end_time = mode_stats.last_end_time if mode_stats.last_end_time is not None \ else perf_counter() log.info('FPS: {:.1f}'.format(mode_stats.frames_count / \ (end_time - mode_stats.last_start_time))) log.info('Latency: {:.1f} ms'.format((mode_stats.latency_sum / \ mode_stats.frames_count) * 1e3)) print(presenter.reportMeans())
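# The loop above relies on Modes and ModeInfo helpers that are imported rather than shown.
# A minimal sketch that is consistent with how their fields are used (frames_count, latency_sum,
# last_start_time, last_end_time, and cycling between the two modes) could look like this;
# the demo's actual definitions may differ in detail.
from dataclasses import dataclass, field
from enum import Enum
from time import perf_counter


class Modes(Enum):
    USER_SPECIFIED = 0
    MIN_LATENCY = 1


@dataclass
class ModeInfo:
    frames_count: int = 0          # frames rendered while this mode was active
    latency_sum: float = 0.0       # accumulated end-to-end latency, seconds
    last_start_time: float = field(default_factory=perf_counter)
    last_end_time: float = None    # set when the mode is switched away from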
def main(): args = build_argparser().parse_args() if args.architecture_type != 'yolov4' and args.anchors: log.warning( 'The "--anchors" option works only for "-at==yolov4". Option will be omitted' ) if args.architecture_type != 'yolov4' and args.masks: log.warning( 'The "--masks" option works only for "-at==yolov4". Option will be omitted' ) if args.architecture_type not in ['nanodet', 'nanodet-plus' ] and args.num_classes: log.warning( 'The "--num_classes" option works only for "-at==nanodet" and "-at==nanodet-plus". Option will be omitted' ) cap = open_images_capture(args.input, args.loop) if args.adapter == 'openvino': plugin_config = get_user_config(args.device, args.num_streams, args.num_threads) model_adapter = OpenvinoAdapter( create_core(), args.model, device=args.device, plugin_config=plugin_config, max_num_requests=args.num_infer_requests, model_parameters={'input_layouts': args.layout}) elif args.adapter == 'ovms': model_adapter = OVMSAdapter(args.model) configuration = { 'resize_type': args.resize_type, 'mean_values': args.mean_values, 'scale_values': args.scale_values, 'reverse_input_channels': args.reverse_input_channels, 'path_to_labels': args.labels, 'confidence_threshold': args.prob_threshold, 'input_size': args.input_size, # The CTPN specific 'num_classes': args.num_classes, # The NanoDet and NanoDetPlus specific } model = DetectionModel.create_model(args.architecture_type, model_adapter, configuration) model.log_layers_info() detector_pipeline = AsyncPipeline(model) next_frame_id = 0 next_frame_id_to_show = 0 palette = ColorPalette(len(model.labels) if model.labels else 100) metrics = PerformanceMetrics() render_metrics = PerformanceMetrics() presenter = None output_transform = None video_writer = cv2.VideoWriter() while True: if detector_pipeline.callback_exceptions: raise detector_pipeline.callback_exceptions[0] # Process all completed requests results = detector_pipeline.get_result(next_frame_id_to_show) if results: objects, frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] if len(objects) and args.raw_output_message: print_raw_results(objects, model.labels, next_frame_id_to_show) presenter.drawGraphs(frame) rendering_start_time = perf_counter() frame = draw_detections(frame, objects, palette, model.labels, output_transform) render_metrics.update(rendering_start_time) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) next_frame_id_to_show += 1 if not args.no_show: cv2.imshow('Detection Results', frame) key = cv2.waitKey(1) ESC_KEY = 27 # Quit. 
if key in {ord('q'), ord('Q'), ESC_KEY}: break presenter.handleKey(key) continue if detector_pipeline.is_ready(): # Get new image/frame start_time = perf_counter() frame = cap.read() if frame is None: if next_frame_id == 0: raise ValueError("Can't read an image from the input") break if next_frame_id == 0: output_transform = OutputTransform(frame.shape[:2], args.output_resolution) if args.output_resolution: output_resolution = output_transform.new_resolution else: output_resolution = (frame.shape[1], frame.shape[0]) presenter = monitors.Presenter( args.utilization_monitors, 55, (round(output_resolution[0] / 4), round(output_resolution[1] / 8))) if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), output_resolution): raise RuntimeError("Can't open video writer") # Submit for inference detector_pipeline.submit_data(frame, next_frame_id, { 'frame': frame, 'start_time': start_time }) next_frame_id += 1 else: # Wait for empty request detector_pipeline.await_any() detector_pipeline.await_all() if detector_pipeline.callback_exceptions: raise detector_pipeline.callback_exceptions[0] # Process completed requests for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id): results = detector_pipeline.get_result(next_frame_id_to_show) objects, frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] if len(objects) and args.raw_output_message: print_raw_results(objects, model.labels, next_frame_id_to_show) presenter.drawGraphs(frame) rendering_start_time = perf_counter() frame = draw_detections(frame, objects, palette, model.labels, output_transform) render_metrics.update(rendering_start_time) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) if not args.no_show: cv2.imshow('Detection Results', frame) key = cv2.waitKey(1) ESC_KEY = 27 # Quit. if key in {ord('q'), ord('Q'), ESC_KEY}: break presenter.handleKey(key) metrics.log_total() log_latency_per_stage(cap.reader_metrics.get_latency(), detector_pipeline.preprocess_metrics.get_latency(), detector_pipeline.inference_metrics.get_latency(), detector_pipeline.postprocess_metrics.get_latency(), render_metrics.get_latency()) for rep in presenter.reportMeans(): log.info(rep)
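# The guard `args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1` recurs in
# most of these demos. A small hypothetical helper (not part of the demo code) makes the
# "a non-positive limit means keep every frame" convention explicit:
import cv2


def write_if_within_limit(video_writer, frame, frame_id, limit):
    # Write `frame` unless a positive `limit` is set and `frame_id` is already past it.
    if video_writer.isOpened() and (limit <= 0 or frame_id <= limit - 1):
        video_writer.write(frame)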
def main(): args = build_argparser().parse_args() cap = open_images_capture(args.input, args.loop) target_bgr = open_images_capture(args.target_bgr, loop=True) if args.target_bgr else None if args.adapter == 'openvino': plugin_config = get_user_config(args.device, args.num_streams, args.num_threads) model_adapter = OpenvinoAdapter(create_core(), args.model, device=args.device, plugin_config=plugin_config, max_num_requests=args.num_infer_requests, model_parameters = {'input_layouts': args.layout}) elif args.adapter == 'ovms': model_adapter = OVMSAdapter(args.model) labels = ['__background__', 'person'] if args.labels is None else load_labels(args.labels) assert len(labels), 'The file with class labels is empty' configuration = { 'confidence_threshold': args.prob_threshold, 'resize_type': args.resize_type } model, need_bgr_input = get_model(model_adapter, configuration, args) input_bgr = open_images_capture(args.background, False).read() if need_bgr_input else None person_id = -1 for i, label in enumerate(labels): if label == 'person': person_id = i break assert person_id >= 0, 'Person class did not find in labels list.' model.log_layers_info() pipeline = AsyncPipeline(model) next_frame_id = 0 next_frame_id_to_show = 0 metrics = PerformanceMetrics() render_metrics = PerformanceMetrics() presenter = None output_transform = None video_writer = cv2.VideoWriter() while True: if pipeline.is_ready(): # Get new image/frame start_time = perf_counter() frame = cap.read() bgr = target_bgr.read() if target_bgr is not None else None if frame is None: if next_frame_id == 0: raise ValueError("Can't read an image from the input") break if next_frame_id == 0: output_transform = OutputTransform(frame.shape[:2], args.output_resolution) if args.output_resolution: output_resolution = output_transform.new_resolution else: output_resolution = (frame.shape[1], frame.shape[0]) presenter = monitors.Presenter(args.utilization_monitors, 55, (round(output_resolution[0] / 4), round(output_resolution[1] / 8))) if args.output and not video_writer.open(args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), tuple(output_resolution)): raise RuntimeError("Can't open video writer") # Submit for inference data = {'src': frame, 'bgr': input_bgr} if input_bgr is not None else frame pipeline.submit_data(data, next_frame_id, {'frame': frame, 'start_time': start_time}) next_frame_id += 1 else: # Wait for empty request pipeline.await_any() if pipeline.callback_exceptions: raise pipeline.callback_exceptions[0] # Process all completed requests results = pipeline.get_result(next_frame_id_to_show) if results: objects, frame_meta = results if args.raw_output_message: print_raw_results(objects, next_frame_id_to_show) frame = frame_meta['frame'] start_time = frame_meta['start_time'] rendering_start_time = perf_counter() frame = render_results(frame, objects, output_resolution, bgr, person_id, args.blur_bgr, args.show_with_original_frame) render_metrics.update(rendering_start_time) presenter.drawGraphs(frame) metrics.update(start_time, frame) if video_writer.isOpened() and (args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit-1): video_writer.write(frame) next_frame_id_to_show += 1 if not args.no_show: cv2.imshow('Background subtraction results', frame) key = cv2.waitKey(1) if key == 27 or key == 'q' or key == 'Q': break presenter.handleKey(key) pipeline.await_all() if pipeline.callback_exceptions: raise pipeline.callback_exceptions[0] # Process completed requests for next_frame_id_to_show in range(next_frame_id_to_show, 
next_frame_id): results = pipeline.get_result(next_frame_id_to_show) objects, frame_meta = results if args.raw_output_message: print_raw_results(objects, next_frame_id_to_show, model.labels) frame = frame_meta['frame'] start_time = frame_meta['start_time'] rendering_start_time = perf_counter() frame = render_results(frame, objects, output_resolution, bgr, person_id, args.blur_bgr, args.show_with_original_frame) render_metrics.update(rendering_start_time) presenter.drawGraphs(frame) metrics.update(start_time, frame) if video_writer.isOpened() and (args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit-1): video_writer.write(frame) if not args.no_show: cv2.imshow('Background subtraction results', frame) cv2.waitKey(1) metrics.log_total() log_latency_per_stage(cap.reader_metrics.get_latency(), pipeline.preprocess_metrics.get_latency(), pipeline.inference_metrics.get_latency(), pipeline.postprocess_metrics.get_latency(), render_metrics.get_latency()) for rep in presenter.reportMeans(): log.info(rep)
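# load_labels() is imported from the demo's utilities and is not shown here. A minimal sketch of
# the behaviour the code above assumes (one class name per line of the labels file) could be:
def load_labels(label_file):
    with open(label_file) as f:
        return [line.strip() for line in f if line.strip()]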
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    next_frame_id = 1
    next_frame_id_to_show = 0
    metrics = PerformanceMetrics()
    render_metrics = PerformanceMetrics()
    video_writer = cv2.VideoWriter()

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams, args.num_threads)
        model_adapter = OpenvinoAdapter(create_core(), args.model, device=args.device,
                                        plugin_config=plugin_config,
                                        max_num_requests=args.num_infer_requests,
                                        model_parameters={'input_layouts': args.layout})
    elif args.adapter == 'ovms':
        model_adapter = OVMSAdapter(args.model)

    start_time = perf_counter()
    frame = cap.read()
    if frame is None:
        raise RuntimeError("Can't read an image from the input")

    model = Deblurring(model_adapter, preload=False)
    model.reshape(frame.shape)
    model.log_layers_info()

    pipeline = AsyncPipeline(model)

    pipeline.submit_data(frame, 0, {'frame': frame, 'start_time': start_time})

    presenter = monitors.Presenter(args.utilization_monitors, 55,
                                   (round(frame.shape[1] / 4), round(frame.shape[0] / 8)))
    if args.output and not video_writer.open(args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                                             cap.fps(), (2 * frame.shape[1], frame.shape[0])):
        raise RuntimeError("Can't open video writer")

    while True:
        if pipeline.is_ready():
            # Get new image/frame
            start_time = perf_counter()
            frame = cap.read()
            if frame is None:
                break
            # Submit for inference
            pipeline.submit_data(frame, next_frame_id, {'frame': frame, 'start_time': start_time})
            next_frame_id += 1
        else:
            # Wait for empty request
            pipeline.await_any()

        if pipeline.callback_exceptions:
            raise pipeline.callback_exceptions[0]
        # Process all completed requests
        results = pipeline.get_result(next_frame_id_to_show)
        if results:
            result_frame, frame_meta = results
            input_frame = frame_meta['frame']
            start_time = frame_meta['start_time']

            rendering_start_time = perf_counter()
            if input_frame.shape != result_frame.shape:
                input_frame = cv2.resize(input_frame, (result_frame.shape[1], result_frame.shape[0]))
            final_image = cv2.hconcat([input_frame, result_frame])
            render_metrics.update(rendering_start_time)

            presenter.drawGraphs(final_image)
            metrics.update(start_time, final_image)

            if video_writer.isOpened() and (args.output_limit <= 0 or
                                            next_frame_id_to_show <= args.output_limit - 1):
                video_writer.write(final_image)
            next_frame_id_to_show += 1

            if not args.no_show:
                cv2.imshow('Deblurring Results', final_image)
                key = cv2.waitKey(1)
                # Quit on Esc or Q
                if key in {27, ord('q'), ord('Q')}:
                    break
                presenter.handleKey(key)

    pipeline.await_all()
    if pipeline.callback_exceptions:
        raise pipeline.callback_exceptions[0]
    # Process completed requests
    for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id):
        results = pipeline.get_result(next_frame_id_to_show)
        result_frame, frame_meta = results
        input_frame = frame_meta['frame']
        start_time = frame_meta['start_time']

        rendering_start_time = perf_counter()
        if input_frame.shape != result_frame.shape:
            input_frame = cv2.resize(input_frame, (result_frame.shape[1], result_frame.shape[0]))
        final_image = cv2.hconcat([input_frame, result_frame])
        render_metrics.update(rendering_start_time)

        presenter.drawGraphs(final_image)
        metrics.update(start_time, final_image)

        if video_writer.isOpened() and (args.output_limit <= 0 or
                                        next_frame_id_to_show <= args.output_limit - 1):
            video_writer.write(final_image)

        if not args.no_show:
            cv2.imshow('Deblurring Results', final_image)
            key = cv2.waitKey(1)

    metrics.log_total()
    log_latency_per_stage(cap.reader_metrics.get_latency(),
                          pipeline.preprocess_metrics.get_latency(),
                          pipeline.inference_metrics.get_latency(),
                          pipeline.postprocess_metrics.get_latency(),
                          render_metrics.get_latency())
    for rep in presenter.reportMeans():
        log.info(rep)
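# The input/result concatenation is written out twice above (in the main loop and in the final
# flush loop). Factored into a helper, a sketch of that rendering step looks like this; the demo
# itself keeps it inline.
import cv2


def side_by_side(input_frame, result_frame):
    # Resize the input to the deblurred frame's size, then place the two next to each other.
    if input_frame.shape != result_frame.shape:
        input_frame = cv2.resize(input_frame, (result_frame.shape[1], result_frame.shape[0]))
    return cv2.hconcat([input_frame, result_frame])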
def main(): parser = argparse.ArgumentParser(description='Whiteboard inpainting demo') parser.add_argument('-i', '--input', required=True, help='Required. Path to a video file or a device node of a web-camera.') parser.add_argument('--loop', default=False, action='store_true', help='Optional. Enable reading the input in a loop.') parser.add_argument('-o', '--output', required=False, help='Optional. Name of the output file(s) to save.') parser.add_argument('-limit', '--output_limit', required=False, default=1000, type=int, help='Optional. Number of frames to store in output. ' 'If 0 is set, all frames are stored.') parser.add_argument('-m_i', '--m_instance_segmentation', type=str, required=False, help='Required. Path to the instance segmentation model.') parser.add_argument('-m_s', '--m_semantic_segmentation', type=str, required=False, help='Required. Path to the semantic segmentation model.') parser.add_argument('-t', '--threshold', type=float, default=0.6, help='Optional. Threshold for person instance segmentation model.') parser.add_argument('--no_show', help="Optional. Don't show output.", action='store_true') parser.add_argument('-d', '--device', type=str, default='CPU', help='Optional. Specify a target device to infer on. CPU, GPU, HDDL or MYRIAD is ' 'acceptable. The demo will look for a suitable plugin for the device specified.') parser.add_argument('-l', '--cpu_extension', type=str, default=None, help='MKLDNN (CPU)-targeted custom layers. Absolute \ path to a shared library with the kernels impl.') parser.add_argument('-u', '--utilization_monitors', default='', type=str, help='Optional. List of monitors to show initially.') args = parser.parse_args() cap = open_images_capture(args.input, args.loop) if cap.get_type() not in ('VIDEO', 'CAMERA'): raise RuntimeError("The input should be a video file or a numeric camera ID") if bool(args.m_instance_segmentation) == bool(args.m_semantic_segmentation): raise ValueError('Set up exactly one of segmentation models: ' '--m_instance_segmentation or --m_semantic_segmentation') labels_dir = Path(__file__).resolve().parents[3] / 'data/dataset_classes' mouse = MouseClick() if not args.no_show: cv2.namedWindow(WINNAME) cv2.setMouseCallback(WINNAME, mouse.get_points) log.info('OpenVINO Inference Engine') log.info('\tbuild: {}'.format(get_version())) core = Core() model_path = args.m_instance_segmentation if args.m_instance_segmentation else args.m_semantic_segmentation log.info('Reading model {}'.format(model_path)) if args.m_instance_segmentation: labels_file = str(labels_dir / 'coco_80cl_bkgr.txt') segmentation = MaskRCNN(core, args.m_instance_segmentation, labels_file, args.threshold, args.device, args.cpu_extension) elif args.m_semantic_segmentation: labels_file = str(labels_dir / 'cityscapes_19cl_bkgr.txt') segmentation = SemanticSegmentation(core, args.m_semantic_segmentation, labels_file, args.threshold, args.device, args.cpu_extension) log.info('The model {} is loaded to {}'.format(model_path, args.device)) metrics = PerformanceMetrics() video_writer = cv2.VideoWriter() black_board = False frame_number = 0 key = -1 start_time = perf_counter() frame = cap.read() if frame is None: raise RuntimeError("Can't read an image from the input") out_frame_size = (frame.shape[1], frame.shape[0] * 2) output_frame = np.full((frame.shape[0], frame.shape[1], 3), 255, dtype='uint8') presenter = monitors.Presenter(args.utilization_monitors, 20, (out_frame_size[0] // 4, out_frame_size[1] // 16)) if args.output and not video_writer.open(args.output, 
cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), out_frame_size): raise RuntimeError("Can't open video writer") while frame is not None: mask = None detections = segmentation.get_detections([frame]) expand_mask(detections, frame.shape[1] // 27) if len(detections[0]) > 0: mask = detections[0][0][2] for i in range(1, len(detections[0])): mask = cv2.bitwise_or(mask, detections[0][i][2]) if mask is not None: mask = np.stack([mask, mask, mask], axis=-1) else: mask = np.zeros(frame.shape, dtype='uint8') clear_frame = remove_background(frame, invert_colors=not black_board) output_frame = np.where(mask, output_frame, clear_frame) merged_frame = np.vstack([frame, output_frame]) merged_frame = cv2.resize(merged_frame, out_frame_size) metrics.update(start_time, merged_frame) if video_writer.isOpened() and (args.output_limit <= 0 or frame_number <= args.output_limit-1): video_writer.write(merged_frame) presenter.drawGraphs(merged_frame) if not args.no_show: cv2.imshow(WINNAME, merged_frame) key = check_pressed_keys(key) if key == 27: # 'Esc' break if key == ord('i'): # catch pressing of key 'i' black_board = not black_board output_frame = 255 - output_frame else: presenter.handleKey(key) if mouse.crop_available: x0, x1 = min(mouse.points[0][0], mouse.points[1][0]), \ max(mouse.points[0][0], mouse.points[1][0]) y0, y1 = min(mouse.points[0][1], mouse.points[1][1]), \ max(mouse.points[0][1], mouse.points[1][1]) x1, y1 = min(x1, output_frame.shape[1] - 1), min(y1, output_frame.shape[0] - 1) board = output_frame[y0: y1, x0: x1, :] if board.shape[0] > 0 and board.shape[1] > 0: cv2.namedWindow('Board', cv2.WINDOW_KEEPRATIO) cv2.imshow('Board', board) frame_number += 1 start_time = perf_counter() frame = cap.read() metrics.log_total() for rep in presenter.reportMeans(): log.info(rep)
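# MouseClick is the demo's helper for selecting a board region with the mouse; its source is not
# included here. An illustrative stand-in that matches how it is used above (get_points as the
# cv2.setMouseCallback handler, `points` holding the drag rectangle, `crop_available` set once a
# rectangle exists) might look like this. The real class may differ.
import cv2


class MouseClick:
    # Sketch only: the demo's actual MouseClick implementation may differ.
    def __init__(self):
        self.points = [(0, 0), (0, 0)]
        self.crop_available = False

    def get_points(self, event, x, y, flags, param):
        if event == cv2.EVENT_LBUTTONDOWN:
            self.points[0] = (x, y)
            self.crop_available = False
        elif event == cv2.EVENT_LBUTTONUP:
            self.points[1] = (x, y)
            self.crop_available = True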
def main(): args = build_argparser().parse_args() cap = open_images_capture(args.input, args.loop) with open(args.labels, 'rt') as labels_file: class_labels = labels_file.read().splitlines() assert len(class_labels), 'The file with class labels is empty' # Plugin initialization for specified device and load extensions library if specified. log.info('OpenVINO Inference Engine') log.info('\tbuild: {}'.format(get_version())) core = Core() if args.cpu_extension and 'CPU' in args.device: core.add_extension(args.cpu_extension, 'CPU') # Read IR log.info('Reading model {}'.format(args.model)) model = core.read_model(args.model) image_input, image_info_input, ( n, c, h, w), model_type, output_names, postprocessor = check_model(model) args.no_keep_aspect_ratio = model_type == 'yolact' or args.no_keep_aspect_ratio compiled_model = core.compile_model(model, args.device) infer_request = compiled_model.create_infer_request() log.info('The model {} is loaded to {}'.format(args.model, args.device)) if args.no_track: tracker = None else: tracker = StaticIOUTracker() if args.delay: delay = args.delay else: delay = int(cap.get_type() in ('VIDEO', 'CAMERA')) frames_processed = 0 metrics = PerformanceMetrics() visualizer = Visualizer(class_labels, show_boxes=args.show_boxes, show_scores=args.show_scores) video_writer = cv2.VideoWriter() start_time = perf_counter() frame = cap.read() if frame is None: raise RuntimeError("Can't read an image from the input") out_frame_size = (frame.shape[1], frame.shape[0]) presenter = monitors.Presenter( args.utilization_monitors, 45, (round(out_frame_size[0] / 4), round(out_frame_size[1] / 8))) if args.output and not video_writer.open(args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), out_frame_size): raise RuntimeError("Can't open video writer") while frame is not None: if args.no_keep_aspect_ratio: # Resize the image to a target size. scale_x = w / frame.shape[1] scale_y = h / frame.shape[0] input_image = cv2.resize(frame, (w, h)) else: # Resize the image to keep the same aspect ratio and to fit it to a window of a target size. scale_x = scale_y = min(h / frame.shape[0], w / frame.shape[1]) input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y) input_image_size = input_image.shape[:2] input_image = np.pad(input_image, ((0, h - input_image_size[0]), (0, w - input_image_size[1]), (0, 0)), mode='constant', constant_values=0) # Change data layout from HWC to CHW. input_image = input_image.transpose((2, 0, 1)) input_image = input_image.reshape((n, c, h, w)).astype(np.float32) input_image_info = np.asarray( [[input_image_size[0], input_image_size[1], 1]], dtype=np.float32) # Run the model. feed_dict = {image_input: input_image} if image_info_input: feed_dict[image_info_input] = input_image_info infer_request.infer(feed_dict) outputs = { name: infer_request.get_tensor(name).data[:] for name in output_names } # Parse detection results of the current request scores, classes, boxes, masks = postprocessor(outputs, scale_x, scale_y, *frame.shape[:2], h, w, args.prob_threshold) if len(boxes) and args.raw_output_message: log.debug( ' -------------------------- Frame # {} -------------------------- ' .format(frames_processed)) log.debug( ' Class ID | Confidence | XMIN | YMIN | XMAX | YMAX ' ) for box, cls, score in zip(boxes, classes, scores): log.debug( '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} ' .format(cls, score, *box)) # Get instance track IDs. masks_tracks_ids = None if tracker is not None: masks_tracks_ids = tracker(masks, classes) # Visualize masks. 
frame = visualizer(frame, boxes, classes, scores, presenter, masks, masks_tracks_ids) metrics.update(start_time, frame) frames_processed += 1 if video_writer.isOpened() and (args.output_limit <= 0 or frames_processed <= args.output_limit): video_writer.write(frame) if not args.no_show: # Show resulting image. cv2.imshow('Results', frame) if not args.no_show: key = cv2.waitKey(delay) esc_code = 27 if key == esc_code: break presenter.handleKey(key) start_time = perf_counter() frame = cap.read() metrics.log_total() for rep in presenter.reportMeans(): log.info(rep)
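# The keep-aspect-ratio branch above resizes and zero-pads inline. Pulled out as a standalone
# helper (a refactoring sketch, not a function from the demo), the same preprocessing reads:
import cv2
import numpy as np


def resize_and_pad(frame, h, w):
    # Scale so the frame fits into (h, w), then pad the bottom/right with zeros to exactly (h, w).
    scale = min(h / frame.shape[0], w / frame.shape[1])
    resized = cv2.resize(frame, None, fx=scale, fy=scale)
    padded = np.pad(resized,
                    ((0, h - resized.shape[0]), (0, w - resized.shape[1]), (0, 0)),
                    mode='constant', constant_values=0)
    return padded, scale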
def main():
    args = build_argparser().parse_args()

    if args.labels:
        with open(args.labels) as f:
            labels = [line.strip() for line in f]
    else:
        labels = None

    ie = IECore()

    if 'MYRIAD' in args.device:
        myriad_config = {'VPU_HW_STAGES_OPTIMIZATION': 'YES'}
        ie.set_config(myriad_config, 'MYRIAD')

    if args.cpu_extension and 'CPU' in args.device:
        ie.add_extension(args.cpu_extension, 'CPU')

    decoder_target_device = 'CPU'
    if args.device != 'CPU':
        encoder_target_device = args.device
    else:
        encoder_target_device = decoder_target_device

    encoder_xml = args.m_encoder
    encoder_bin = args.m_encoder.replace('.xml', '.bin')

    models = [IEModel(encoder_xml, encoder_bin, ie, encoder_target_device,
                      num_requests=(3 if args.device == 'MYRIAD' else 1))]

    if args.architecture_type == 'en-de':
        if args.m_decoder is None:
            raise RuntimeError('No decoder for encoder-decoder model type (-m_de) provided')
        decoder_xml = args.m_decoder
        decoder_bin = args.m_decoder.replace('.xml', '.bin')
        models.append(IEModel(decoder_xml, decoder_bin, ie, decoder_target_device, num_requests=2))
        seq_size = models[1].input_size[1]
    elif args.architecture_type == 'en-mean':
        models.append(DummyDecoder(num_requests=2))
        seq_size = args.decoder_seq_size
    elif args.architecture_type == 'i3d-rgb':
        seq_size = models[0].input_size[2]

    presenter = monitors.Presenter(args.utilization_monitors, 70)
    result_presenter = ResultRenderer(no_show=args.no_show, presenter=presenter, output=args.output,
                                      limit=args.output_limit, labels=labels,
                                      label_smoothing_window=args.label_smoothing)
    cap = open_images_capture(args.input, args.loop)
    run_pipeline(cap, args.architecture_type, models, result_presenter.render_frame,
                 seq_size=seq_size, fps=cap.fps())

    print(presenter.reportMeans())
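# ResultRenderer is configured with label_smoothing_window above, but its implementation is not
# shown. A hypothetical illustration of what a label-smoothing window does (majority vote over
# the last N predictions to suppress single-frame flicker); the demo's actual logic may differ:
from collections import Counter, deque


class LabelSmoother:
    def __init__(self, window=30):
        self.history = deque(maxlen=window)

    def update(self, class_id):
        # Remember the latest prediction and report the most frequent recent one.
        self.history.append(class_id)
        return Counter(self.history).most_common(1)[0][0]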
def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    frame_processor = FrameProcessor(args)

    frame_num = 0
    metrics = PerformanceMetrics()
    presenter = None
    output_transform = None
    input_crop = None
    if args.crop_size[0] > 0 and args.crop_size[1] > 0:
        input_crop = np.array(args.crop_size)
    elif not (args.crop_size[0] == 0 and args.crop_size[1] == 0):
        raise ValueError('Both crop height and width should be positive')
    video_writer = cv2.VideoWriter()

    while True:
        start_time = perf_counter()
        frame = cap.read()
        if frame is None:
            if frame_num == 0:
                raise ValueError("Can't read an image from the input")
            break
        if input_crop is not None:
            frame = center_crop(frame, input_crop)
        if frame_num == 0:
            output_transform = OutputTransform(frame.shape[:2], args.output_resolution)
            if args.output_resolution:
                output_resolution = output_transform.new_resolution
            else:
                output_resolution = (frame.shape[1], frame.shape[0])
            presenter = monitors.Presenter(args.utilization_monitors, 55,
                                           (round(output_resolution[0] / 4), round(output_resolution[1] / 8)))
            if args.output and not video_writer.open(args.output, cv2.VideoWriter_fourcc(*'MJPG'),
                                                     cap.fps(), output_resolution):
                raise RuntimeError("Can't open video writer")

        detections = frame_processor.process(frame)
        presenter.drawGraphs(frame)
        frame = draw_detections(frame, frame_processor, detections, output_transform)
        metrics.update(start_time, frame)

        frame_num += 1
        if video_writer.isOpened() and (args.output_limit <= 0 or frame_num <= args.output_limit):
            video_writer.write(frame)

        if not args.no_show:
            cv2.imshow('Face recognition demo', frame)
            key = cv2.waitKey(1)
            # Quit
            if key in {ord('q'), ord('Q'), 27}:
                break
            presenter.handleKey(key)

    metrics.log_total()
    for rep in presenter.reportMeans():
        log.info(rep)
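# center_crop() is imported from the demo's utilities. A sketch of the behaviour assumed above,
# with crop_size taken as (width, height); the real helper's argument order and edge handling
# may differ:
def center_crop(frame, crop_size):
    fh, fw = frame.shape[:2]
    cw, ch = int(crop_size[0]), int(crop_size[1])
    x0 = max((fw - cw) // 2, 0)
    y0 = max((fh - ch) // 2, 0)
    return frame[y0:y0 + ch, x0:x0 + cw]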
def main(): """ Main function. """ log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) args = build_argparser().parse_args() class_map = load_class_map(args.class_map) assert class_map is not None ie_core = load_ie_core(args.device, args.cpu_extension) person_detector = PersonDetector(args.detection_model, args.device, ie_core, num_requests=2, output_shape=DETECTOR_OUTPUT_SHAPE) action_recognizer = ActionRecognizer(args.action_model, args.device, ie_core, num_requests=2, img_scale=ACTION_IMAGE_SCALE, num_classes=len(class_map)) person_tracker = Tracker(person_detector, TRACKER_SCORE_THRESHOLD, TRACKER_IOU_THRESHOLD) video_stream = VideoStream(args.input, ACTION_NET_INPUT_FPS, action_recognizer.input_length) video_stream.start() visualizer = Visualizer(VISUALIZER_TRG_FPS) visualizer.register_window('Demo') presenter = monitors.Presenter(args.utilization_monitors) samples_library = None if args.samples_dir is not None and os.path.exists(args.samples_dir): visualizer.register_window('Gesture library') visualizer.start() library_queue = visualizer.get_queue('Gesture library') samples_library = VideoLibrary(args.samples_dir, SAMPLES_MAX_WINDOW_SIZE, list(class_map.values()), library_queue, SAMPLES_TRG_FPS) samples_library.start() else: visualizer.start() last_caption = None active_object_id = -1 tracker_labels_map = dict() tracker_labels = set() frames_processed = 0 start_time = time.perf_counter() while True: frame = video_stream.get_live_frame() batch = video_stream.get_batch() if frame is None or batch is None: break if frames_processed == 0: video_writer = cv2.VideoWriter() if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), video_stream.fps(), (frame.shape[1], frame.shape[0])): raise RuntimeError("Can't open video writer") detections, tracker_labels_map = person_tracker.add_frame( frame, len(OBJECT_IDS), tracker_labels_map) if detections is None: active_object_id = -1 last_caption = None if len(detections) == 1: active_object_id = 0 if active_object_id >= 0: cur_det = [det for det in detections if det.id == active_object_id] if len(cur_det) != 1: active_object_id = -1 last_caption = None continue recognizer_result = action_recognizer(batch, cur_det[0].roi.reshape(-1)) if recognizer_result is not None: action_class_id = np.argmax(recognizer_result) action_class_label = \ class_map[action_class_id] if class_map is not None else action_class_id action_class_score = np.max(recognizer_result) if action_class_score > args.action_threshold: last_caption = 'Last gesture: {} '.format( action_class_label) end_time = time.perf_counter() elapsed_time = end_time - start_time start_time = end_time presenter.drawGraphs(frame) if active_object_id >= 0: current_fps = 1.0 / elapsed_time cv2.putText(frame, 'FPS: {:.2f}'.format(current_fps), (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2) if detections is not None: tracker_labels = set(det.id for det in detections) for det in detections: roi_color = (0, 255, 0) if active_object_id == det.id else (128, 128, 128) border_width = 2 if active_object_id == det.id else 1 person_roi = det.roi[0] cv2.rectangle(frame, (person_roi[0], person_roi[1]), (person_roi[2], person_roi[3]), roi_color, border_width) cv2.putText(frame, str(det.id), (person_roi[0] + 10, person_roi[1] + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.8, roi_color, 2) if last_caption is not None: cv2.putText(frame, last_caption, (10, frame.shape[0] - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) frames_processed += 1 if 
video_writer.isOpened() and (args.output_limit <= 0 or frames_processed <= args.output_limit): video_writer.write(frame) if args.no_show: continue visualizer.put_queue(frame, 'Demo') key = visualizer.get_key() if key == 27: # esc break elif key == ord(' '): # space active_object_id = -1 last_caption = None elif key == 13: # enter last_caption = None elif key in OBJECT_IDS: # 0-9 local_bbox_id = int(chr(key)) if local_bbox_id in tracker_labels: active_object_id = local_bbox_id else: presenter.handleKey(key) if samples_library is not None: if key == ord('f'): # forward samples_library.next() elif key == ord('b'): # backward samples_library.prev() if samples_library is not None: samples_library.release() visualizer.release() video_stream.release() print(presenter.reportMeans())
def main(): log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout) args = build_argparser().parse_args() log.info("Creating Inference Engine...") ie = IECore() if args.cpu_extension and 'CPU' in args.device: ie.add_extension(args.cpu_extension, "CPU") # Read IR log.info("Loading network") net = ie.read_network(args.model, os.path.splitext(args.model)[0] + ".bin") if "CPU" in args.device: supported_layers = ie.query_network(net, "CPU") not_supported_layers = [l for l in net.layers.keys() if l not in supported_layers] if len(not_supported_layers) != 0: log.error("Following layers are not supported by the plugin for specified device {}:\n {}". format(args.device, ', '.join(not_supported_layers))) log.error("Please try to specify cpu extensions library path in sample's command line parameters using -l " "or --cpu_extension command line argument") sys.exit(1) img_info_input_blob = None feed_dict = {} for blob_name in net.input_info: if len(net.input_info[blob_name].input_data.shape) == 4: input_blob = blob_name elif len(net.input_info[blob_name].input_data.shape) == 2: img_info_input_blob = blob_name else: raise RuntimeError("Unsupported {}D input layer '{}'. Only 2D and 4D input layers are supported" .format(len(net.input_info[blob_name].input_data.shape), blob_name)) output_postprocessor = get_output_postprocessor(net) log.info("Loading IR to the plugin...") exec_net = ie.load_network(network=net, num_requests=2, device_name=args.device) # Read and pre-process input image n, c, h, w = net.input_info[input_blob].input_data.shape if img_info_input_blob: feed_dict[img_info_input_blob] = [h, w, 1] if args.input == 'cam': input_stream = 0 else: input_stream = args.input cap = cv2.VideoCapture(input_stream) assert cap.isOpened(), "Can't open " + str(input_stream) if args.labels: with open(args.labels, 'r') as f: labels_map = [x.strip() for x in f] else: labels_map = None cur_request_id = 0 next_request_id = 1 log.info("Starting inference in async mode...") is_async_mode = True render_time = 0 if is_async_mode: ret, frame = cap.read() frame_h, frame_w = frame.shape[:2] presenter = monitors.Presenter(args.utilization_monitors, 45, (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH) / 4), round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) / 8))) print("To close the application, press 'CTRL+C' here or switch to the output window and press ESC key") print("To switch between sync/async modes, press TAB key in the output window") while cap.isOpened(): if is_async_mode: ret, next_frame = cap.read() else: ret, frame = cap.read() if ret: frame_h, frame_w = frame.shape[:2] if not ret: break # abandons the last frame in case of async_mode # Main sync point: # in the truly Async mode we start the NEXT infer request, while waiting for the CURRENT to complete # in the regular mode we start the CURRENT request and immediately wait for it's completion inf_start = time.time() if is_async_mode: in_frame = cv2.resize(next_frame, (w, h)) in_frame = in_frame.transpose((2, 0, 1)) # Change data layout from HWC to CHW in_frame = in_frame.reshape((n, c, h, w)) feed_dict[input_blob] = in_frame exec_net.start_async(request_id=next_request_id, inputs=feed_dict) else: in_frame = cv2.resize(frame, (w, h)) in_frame = in_frame.transpose((2, 0, 1)) # Change data layout from HWC to CHW in_frame = in_frame.reshape((n, c, h, w)) feed_dict[input_blob] = in_frame exec_net.start_async(request_id=cur_request_id, inputs=feed_dict) if exec_net.requests[cur_request_id].wait(-1) == 0: inf_end = time.time() det_time = inf_end - 
inf_start # Parse detection results of the current request for obj in output_postprocessor(exec_net.requests[cur_request_id].output_blobs): # Draw only objects when probability more than specified threshold if obj[2] > args.prob_threshold: xmin = int(obj[3] * frame_w) ymin = int(obj[4] * frame_h) xmax = int(obj[5] * frame_w) ymax = int(obj[6] * frame_h) class_id = int(obj[1]) # Draw box and label\class_id color = (min(class_id * 12.5, 255), min(class_id * 7, 255), min(class_id * 5, 255)) cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color, 2) det_label = labels_map[class_id] if labels_map else str(class_id) cv2.putText(frame, det_label + ' ' + str(round(obj[2] * 100, 1)) + ' %', (xmin, ymin - 7), cv2.FONT_HERSHEY_COMPLEX, 0.6, color, 1) # Draw performance stats inf_time_message = "Inference time: N\A for async mode" if is_async_mode else \ "Inference time: {:.3f} ms".format(det_time * 1000) render_time_message = "OpenCV rendering time: {:.3f} ms".format(render_time * 1000) async_mode_message = "Async mode is on. Processing request {}".format(cur_request_id) if is_async_mode else \ "Async mode is off. Processing request {}".format(cur_request_id) cv2.putText(frame, inf_time_message, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1) cv2.putText(frame, render_time_message, (15, 30), cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1) cv2.putText(frame, async_mode_message, (10, int(frame_h - 20)), cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1) presenter.drawGraphs(frame) render_start = time.time() if not args.no_show: cv2.imshow("Detection Results", frame) render_end = time.time() render_time = render_end - render_start if is_async_mode: cur_request_id, next_request_id = next_request_id, cur_request_id frame = next_frame frame_h, frame_w = frame.shape[:2] if not args.no_show: key = cv2.waitKey(1) if key == 27: break if (9 == key): is_async_mode = not is_async_mode log.info("Switched to {} mode".format("async" if is_async_mode else "sync")) else: presenter.handleKey(key) print(presenter.reportMeans())
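# The resize / HWC-to-CHW / reshape block is duplicated in the async and sync branches above.
# Factored out (a refactoring sketch, not the demo's own function), the preprocessing is:
import cv2


def prepare_input(frame, n, c, h, w):
    in_frame = cv2.resize(frame, (w, h))
    in_frame = in_frame.transpose((2, 0, 1))  # Change data layout from HWC to CHW
    return in_frame.reshape((n, c, h, w))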
def main(): """ Main function. """ log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) args = build_argparser().parse_args() place_recognition = PlaceRecognition(args.model, args.device, args.gallery_folder, args.cpu_extension, args.gallery_size) cap = open_images_capture(args.input, args.loop) compute_embeddings_times = [] search_in_gallery_times = [] frames_processed = 0 presenter = monitors.Presenter(args.utilization_monitors, 0) video_writer = cv2.VideoWriter() while True: frame = cap.read() if frame is None: if frames_processed == 0: raise ValueError("Can't read an image from the input") break elapsed, probe_embedding = time_elapsed( place_recognition.compute_embedding, frame) compute_embeddings_times.append(elapsed) elapsed, (sorted_indexes, distances) = time_elapsed( place_recognition.search_in_gallery, probe_embedding) search_in_gallery_times.append(elapsed) image, key = visualize( frame, [str(place_recognition.impaths[i]) for i in sorted_indexes], distances[sorted_indexes], place_recognition.input_size, np.mean(compute_embeddings_times), np.mean(search_in_gallery_times), imshow_delay=3, presenter=presenter, no_show=args.no_show) if frames_processed == 0: if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), (image.shape[1], image.shape[0])): raise RuntimeError("Can't open video writer") frames_processed += 1 if video_writer.isOpened() and (args.output_limit <= 0 or frames_processed <= args.output_limit): video_writer.write(image) if key == 27: break print(presenter.reportMeans())
def main(): args = build_argparser().parse_args() cap = open_images_capture(args.input, args.loop) # Plugin initialization for specified device and load extensions library if specified log.info('OpenVINO Runtime') log.info('\tbuild: {}'.format(get_version())) core = Core() # Read IR log.info('Reading Proposal model {}'.format(args.model_pnet)) p_net = core.read_model(args.model_pnet) if len(p_net.inputs) != 1: raise RuntimeError("Pnet supports only single input topologies") if len(p_net.outputs) != 2: raise RuntimeError("Pnet supports two output topologies") log.info('Reading Refine model {}'.format(args.model_rnet)) r_net = core.read_model(args.model_rnet) if len(r_net.inputs) != 1: raise RuntimeError("Rnet supports only single input topologies") if len(r_net.outputs) != 2: raise RuntimeError("Rnet supports two output topologies") log.info('Reading Output model {}'.format(args.model_onet)) o_net = core.read_model(args.model_onet) if len(o_net.inputs) != 1: raise RuntimeError("Onet supports only single input topologies") if len(o_net.outputs) != 3: raise RuntimeError("Onet supports three output topologies") pnet_input_tensor_name = p_net.inputs[0].get_any_name() rnet_input_tensor_name = r_net.inputs[0].get_any_name() onet_input_tensor_name = o_net.inputs[0].get_any_name() for node in p_net.outputs: if node.shape[1] == 2: pnet_cls_name = node.get_any_name() elif node.shape[1] == 4: pnet_roi_name = node.get_any_name() else: raise RuntimeError("Unsupported output layer for Pnet") for node in r_net.outputs: if node.shape[1] == 2: rnet_cls_name = node.get_any_name() elif node.shape[1] == 4: rnet_roi_name = node.get_any_name() else: raise RuntimeError("Unsupported output layer for Rnet") for node in o_net.outputs: if node.shape[1] == 2: onet_cls_name = node.get_any_name() elif node.shape[1] == 4: onet_roi_name = node.get_any_name() elif node.shape[1] == 10: onet_pts_name = node.get_any_name() else: raise RuntimeError("Unsupported output layer for Onet") next_frame_id = 0 metrics = PerformanceMetrics() presenter = None video_writer = cv2.VideoWriter() is_loaded_before = False while True: start_time = perf_counter() origin_image = cap.read() if origin_image is None: if next_frame_id == 0: raise ValueError("Can't read an image from the input") break if next_frame_id == 0: presenter = monitors.Presenter(args.utilization_monitors, 55, (round(origin_image.shape[1] / 4), round(origin_image.shape[0] / 8))) if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), (origin_image.shape[1], origin_image.shape[0])): raise RuntimeError("Can't open video writer") next_frame_id += 1 rgb_image = cv2.cvtColor(origin_image, cv2.COLOR_BGR2RGB) oh, ow, _ = rgb_image.shape scales = utils.calculate_scales(rgb_image) # ************************************* # Pnet stage # ************************************* pnet_res = [] for i, scale in enumerate(scales): hs = int(oh * scale) ws = int(ow * scale) image = preprocess_image(rgb_image, ws, hs) p_net.reshape({ pnet_input_tensor_name: PartialShape([1, 3, ws, hs]) }) # Change weidth and height of input blob compiled_pnet = core.compile_model(p_net, args.device) infer_request_pnet = compiled_pnet.create_infer_request() if i == 0 and not is_loaded_before: log.info("The Proposal model {} is loaded to {}".format( args.model_pnet, args.device)) infer_request_pnet.infer(inputs={pnet_input_tensor_name: image}) p_res = { name: infer_request_pnet.get_tensor(name).data[:] for name in {pnet_roi_name, pnet_cls_name} } pnet_res.append(p_res) 
image_num = len(scales) rectangles = [] for i in range(image_num): roi = pnet_res[i][pnet_roi_name] cls = pnet_res[i][pnet_cls_name] _, _, out_h, out_w = cls.shape out_side = max(out_h, out_w) rectangle = utils.detect_face_12net(cls[0][1], roi[0], out_side, 1 / scales[i], ow, oh, score_threshold[0], iou_threshold[0]) rectangles.extend(rectangle) rectangles = utils.NMS(rectangles, iou_threshold[1], 'iou') # Rnet stage if len(rectangles) > 0: r_net.reshape({ rnet_input_tensor_name: PartialShape([len(rectangles), 3, 24, 24]) }) # Change batch size of input blob compiled_rnet = core.compile_model(r_net, args.device) infer_request_rnet = compiled_rnet.create_infer_request() if not is_loaded_before: log.info("The Refine model {} is loaded to {}".format( args.model_rnet, args.device)) rnet_input = [] for rectangle in rectangles: crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]), int(rectangle[0]):int(rectangle[2])] crop_img = preprocess_image(crop_img, 24, 24) rnet_input.extend(crop_img) infer_request_rnet.infer( inputs={rnet_input_tensor_name: rnet_input}) rnet_res = { name: infer_request_rnet.get_tensor(name).data[:] for name in {rnet_roi_name, rnet_cls_name} } roi = rnet_res[rnet_roi_name] cls = rnet_res[rnet_cls_name] rectangles = utils.filter_face_24net(cls, roi, rectangles, ow, oh, score_threshold[1], iou_threshold[2]) # Onet stage if len(rectangles) > 0: o_net.reshape({ onet_input_tensor_name: PartialShape([len(rectangles), 3, 48, 48]) }) # Change batch size of input blob compiled_onet = core.compile_model(o_net, args.device) infer_request_onet = compiled_onet.create_infer_request() if not is_loaded_before: log.info("The Output model {} is loaded to {}".format( args.model_onet, args.device)) is_loaded_before = True onet_input = [] for rectangle in rectangles: crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]), int(rectangle[0]):int(rectangle[2])] crop_img = preprocess_image(crop_img, 48, 48) onet_input.extend(crop_img) infer_request_onet.infer( inputs={onet_input_tensor_name: onet_input}) onet_res = { name: infer_request_onet.get_tensor(name).data[:] for name in {onet_roi_name, onet_cls_name, onet_pts_name} } roi = onet_res[onet_roi_name] cls = onet_res[onet_cls_name] pts = onet_res[onet_pts_name] rectangles = utils.filter_face_48net(cls, roi, pts, rectangles, ow, oh, score_threshold[2], iou_threshold[3]) # display results for rectangle in rectangles: # Draw detected boxes cv2.putText(origin_image, 'confidence: {:.2f}'.format(rectangle[4]), (int(rectangle[0]), int(rectangle[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0)) cv2.rectangle(origin_image, (int(rectangle[0]), int(rectangle[1])), (int(rectangle[2]), int(rectangle[3])), (255, 0, 0), 1) # Draw landmarks for i in range(5, 15, 2): cv2.circle(origin_image, (int(rectangle[i + 0]), int(rectangle[i + 1])), 2, (0, 255, 0)) metrics.update(start_time, origin_image) if video_writer.isOpened() and (args.output_limit <= 0 or next_frame_id <= args.output_limit): video_writer.write(origin_image) if not args.no_show: cv2.imshow('MTCNN Results', origin_image) key = cv2.waitKey(1) if key in {ord('q'), ord('Q'), 27}: break presenter.handleKey(key) metrics.log_total()
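# utils.NMS() above prunes overlapping candidate boxes between the P-Net, R-Net and O-Net stages.
# For reference, a generic IoU-based non-maximum suppression over (x1, y1, x2, y2) boxes looks
# like the sketch below; the demo's own NMS works on its rectangle-plus-score format and may
# differ in details.
import numpy as np


def nms(boxes, scores, iou_threshold):
    # Return indices of kept boxes, highest score first.
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= iou_threshold]
    return keep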
def main(): args = build_argparser().parse_args() cap = open_images_capture(args.input, args.loop) if args.adapter == 'openvino': plugin_config = get_user_config(args.device, args.num_streams, args.num_threads) model_adapter = OpenvinoAdapter( create_core(), args.model, device=args.device, plugin_config=plugin_config, max_num_requests=args.num_infer_requests) elif args.adapter == 'remote': log.info('Reading model {}'.format(args.model)) serving_config = {"address": "localhost", "port": 9000} model_adapter = RemoteAdapter(args.model, serving_config) model = SegmentationModel.create_model(args.architecture_type, model_adapter, {'path_to_labels': args.labels}) if args.architecture_type == 'segmentation': visualizer = SegmentationVisualizer(args.colors) if args.architecture_type == 'salient_object_detection': visualizer = SaliencyMapVisualizer() model.log_layers_info() pipeline = AsyncPipeline(model) next_frame_id = 0 next_frame_id_to_show = 0 metrics = PerformanceMetrics() render_metrics = PerformanceMetrics() presenter = None output_transform = None video_writer = cv2.VideoWriter() only_masks = args.only_masks while True: if pipeline.is_ready(): # Get new image/frame start_time = perf_counter() frame = cap.read() if frame is None: if next_frame_id == 0: raise ValueError("Can't read an image from the input") break if next_frame_id == 0: output_transform = OutputTransform(frame.shape[:2], args.output_resolution) if args.output_resolution: output_resolution = output_transform.new_resolution else: output_resolution = (frame.shape[1], frame.shape[0]) presenter = monitors.Presenter( args.utilization_monitors, 55, (round(output_resolution[0] / 4), round(output_resolution[1] / 8))) if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), output_resolution): raise RuntimeError("Can't open video writer") # Submit for inference pipeline.submit_data(frame, next_frame_id, { 'frame': frame, 'start_time': start_time }) next_frame_id += 1 else: # Wait for empty request pipeline.await_any() if pipeline.callback_exceptions: raise pipeline.callback_exceptions[0] # Process all completed requests results = pipeline.get_result(next_frame_id_to_show) if results: objects, frame_meta = results if args.raw_output_message: print_raw_results(objects, next_frame_id_to_show, model.labels) frame = frame_meta['frame'] start_time = frame_meta['start_time'] rendering_start_time = perf_counter() frame = render_segmentation(frame, objects, visualizer, output_transform, only_masks) render_metrics.update(rendering_start_time) presenter.drawGraphs(frame) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) next_frame_id_to_show += 1 if not args.no_show: cv2.imshow('Segmentation Results', frame) key = cv2.waitKey(1) if key == 27 or key == 'q' or key == 'Q': break if key == 9: only_masks = not only_masks presenter.handleKey(key) pipeline.await_all() # Process completed requests for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id): results = pipeline.get_result(next_frame_id_to_show) while results is None: results = pipeline.get_result(next_frame_id_to_show) objects, frame_meta = results if args.raw_output_message: print_raw_results(objects, next_frame_id_to_show, model.labels) frame = frame_meta['frame'] start_time = frame_meta['start_time'] rendering_start_time = perf_counter() frame = render_segmentation(frame, objects, visualizer, output_transform, 
only_masks) render_metrics.update(rendering_start_time) presenter.drawGraphs(frame) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) if not args.no_show: cv2.imshow('Segmentation Results', frame) key = cv2.waitKey(1) metrics.log_total() log_latency_per_stage(cap.reader_metrics.get_latency(), pipeline.preprocess_metrics.get_latency(), pipeline.inference_metrics.get_latency(), pipeline.postprocess_metrics.get_latency(), render_metrics.get_latency()) for rep in presenter.reportMeans(): log.info(rep)
def main(): args = build_argparser().parse_args() cap = open_images_capture(args.input, args.loop) if args.adapter == 'openvino': plugin_config = get_user_config(args.device, args.num_streams, args.num_threads) model_adapter = OpenvinoAdapter( create_core(), args.model, device=args.device, plugin_config=plugin_config, max_num_requests=args.num_infer_requests) elif args.adapter == 'remote': log.info('Reading model {}'.format(args.model)) serving_config = {"address": "localhost", "port": 9000} model_adapter = RemoteAdapter(args.model, serving_config) model = MonoDepthModel(model_adapter) model.log_layers_info() pipeline = AsyncPipeline(model) next_frame_id = 0 next_frame_id_to_show = 0 metrics = PerformanceMetrics() presenter = None output_transform = None video_writer = cv2.VideoWriter() while True: if pipeline.is_ready(): # Get new image/frame start_time = perf_counter() frame = cap.read() if frame is None: if next_frame_id == 0: raise ValueError("Can't read an image from the input") break if next_frame_id == 0: output_transform = OutputTransform(frame.shape[:2], args.output_resolution) if args.output_resolution: output_resolution = output_transform.new_resolution else: output_resolution = (frame.shape[1], frame.shape[0]) presenter = monitors.Presenter( args.utilization_monitors, 55, (round(output_resolution[0] / 4), round(output_resolution[1] / 8))) if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), output_resolution): raise RuntimeError("Can't open video writer") # Submit for inference pipeline.submit_data(frame, next_frame_id, {'start_time': start_time}) next_frame_id += 1 else: # Wait for empty request pipeline.await_any() if pipeline.callback_exceptions: raise pipeline.callback_exceptions[0] # Process all completed requests results = pipeline.get_result(next_frame_id_to_show) if results: depth_map, frame_meta = results depth_map = apply_color_map(depth_map, output_transform) start_time = frame_meta['start_time'] presenter.drawGraphs(depth_map) metrics.update(start_time, depth_map) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(depth_map) next_frame_id_to_show += 1 if not args.no_show: cv2.imshow(DEMO_NAME, depth_map) key = cv2.waitKey(1) if key == 27 or key == 'q' or key == 'Q': break presenter.handleKey(key) pipeline.await_all() # Process completed requests for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id): results = pipeline.get_result(next_frame_id_to_show) while results is None: results = pipeline.get_result(next_frame_id_to_show) depth_map, frame_meta = results depth_map = apply_color_map(depth_map, output_transform) start_time = frame_meta['start_time'] presenter.drawGraphs(depth_map) metrics.update(start_time, depth_map) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(depth_map) if not args.no_show: cv2.imshow(DEMO_NAME, depth_map) key = cv2.waitKey(1) if key == 27 or key == 'q' or key == 'Q': break presenter.handleKey(key) metrics.log_total() for rep in presenter.reportMeans(): log.info(rep)
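# apply_color_map() turns the raw depth prediction into a displayable image. An illustrative
# version (normalize to 8 bit, then apply an OpenCV colormap); the demo's helper also applies
# the output transform and may use a different palette:
import cv2


def apply_color_map_sketch(depth):
    normalized = cv2.normalize(depth, None, 0, 255, cv2.NORM_MINMAX).astype('uint8')
    return cv2.applyColorMap(normalized, cv2.COLORMAP_JET)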
def main(): args = build_argparser().parse_args() cap = open_images_capture(args.input, args.loop) delay = int(cap.get_type() in {'VIDEO', 'CAMERA'}) if args.adapter == 'openvino': plugin_config = get_user_config(args.device, args.num_streams, args.num_threads) model_adapter = OpenvinoAdapter( create_core(), args.model, device=args.device, plugin_config=plugin_config, max_num_requests=args.num_infer_requests) elif args.adapter == 'ovms': model_adapter = OVMSAdapter(args.model) config = { 'mean_values': args.mean_values, 'scale_values': args.scale_values, 'reverse_input_channels': args.reverse_input_channels, 'topk': args.topk, 'path_to_labels': args.labels } model = Classification(model_adapter, config) model.log_layers_info() async_pipeline = AsyncPipeline(model) next_frame_id = 0 next_frame_id_to_show = 0 metrics = PerformanceMetrics() render_metrics = PerformanceMetrics() presenter = None output_transform = None video_writer = cv2.VideoWriter() ESC_KEY = 27 key = -1 while True: if async_pipeline.callback_exceptions: raise async_pipeline.callback_exceptions[0] # Process all completed requests results = async_pipeline.get_result(next_frame_id_to_show) if results: classifications, frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] if args.raw_output_message: print_raw_results(classifications, next_frame_id_to_show) presenter.drawGraphs(frame) rendering_start_time = perf_counter() frame = draw_labels(frame, classifications, output_transform) if delay or args.no_show: render_metrics.update(rendering_start_time) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) next_frame_id_to_show += 1 if not args.no_show: cv2.imshow('Classification Results', frame) key = cv2.waitKey(delay) # Quit. 
if key in {ord('q'), ord('Q'), ESC_KEY}: break presenter.handleKey(key) continue if async_pipeline.is_ready(): # Get new image/frame start_time = perf_counter() frame = cap.read() if frame is None: if next_frame_id == 0: raise ValueError("Can't read an image from the input") break if next_frame_id == 0: output_transform = OutputTransform(frame.shape[:2], args.output_resolution) if args.output_resolution: output_resolution = output_transform.new_resolution else: output_resolution = (frame.shape[1], frame.shape[0]) presenter = monitors.Presenter( args.utilization_monitors, 55, (round(output_resolution[0] / 4), round(output_resolution[1] / 8))) if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), output_resolution): raise RuntimeError("Can't open video writer") # Submit for inference async_pipeline.submit_data(frame, next_frame_id, { 'frame': frame, 'start_time': start_time }) next_frame_id += 1 else: # Wait for empty request async_pipeline.await_any() async_pipeline.await_all() if key not in {ord('q'), ord('Q'), ESC_KEY}: # Process completed requests for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id): results = async_pipeline.get_result(next_frame_id_to_show) while results is None: results = async_pipeline.get_result(next_frame_id_to_show) classifications, frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] if args.raw_output_message: print_raw_results(classifications, next_frame_id_to_show) presenter.drawGraphs(frame) rendering_start_time = perf_counter() frame = draw_labels(frame, classifications, output_transform) if delay or args.no_show: render_metrics.update(rendering_start_time) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) if not args.no_show: cv2.imshow('Classification Results', frame) key = cv2.waitKey(delay) # Quit. if key in {ord('q'), ord('Q'), ESC_KEY}: break presenter.handleKey(key) if delay or args.no_show: metrics.log_total() log_latency_per_stage(cap.reader_metrics.get_latency(), async_pipeline.preprocess_metrics.get_latency(), async_pipeline.inference_metrics.get_latency(), async_pipeline.postprocess_metrics.get_latency(), render_metrics.get_latency()) for rep in presenter.reportMeans(): log.info(rep)
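# print_raw_results() above logs the raw classification output for a frame. A hypothetical
# version is shown below; the real helper's signature and the result layout are assumptions
# (here each entry is taken to be an (id, label, score) tuple).
import logging as log

def print_raw_results_sketch(classifications, frame_id):
    log.info('Results for frame #{}:'.format(frame_id))
    log.info(' Class ID | Label            | Confidence')
    for class_id, label, score in classifications:
        log.info('{:^9} | {:<16} | {:^10.4f}'.format(class_id, label, score))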
def main(): """ Main function. """ log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) args = build_argparser().parse_args() img_retrieval = ImageRetrieval(args.model, args.device, args.gallery, INPUT_SIZE, args.cpu_extension) frames = RoiDetectorOnVideo(args.i) compute_embeddings_times = [] search_in_gallery_times = [] positions = [] presenter = monitors.Presenter(args.utilization_monitors, 0) for image, view_frame in frames: position = None sorted_indexes = [] if image is not None: image = central_crop(image, divide_by=5, shift=1) elapsed, probe_embedding = time_elapsed( img_retrieval.compute_embedding, image) compute_embeddings_times.append(elapsed) elapsed, (sorted_indexes, distances) = time_elapsed( img_retrieval.search_in_gallery, probe_embedding) search_in_gallery_times.append(elapsed) sorted_classes = [ img_retrieval.gallery_classes[i] for i in sorted_indexes ] if args.ground_truth is not None: position = sorted_classes.index( img_retrieval.text_label_to_class_id[args.ground_truth]) positions.append(position) log.info("ROI detected, found: %d, position of target: %d", sorted_classes[0], position) else: log.info("ROI detected, found: %s", sorted_classes[0]) key = visualize( view_frame, position, [img_retrieval.impaths[i] for i in sorted_indexes], distances[sorted_indexes] if position is not None else None, img_retrieval.input_size, np.mean(compute_embeddings_times), np.mean(search_in_gallery_times), imshow_delay=3, presenter=presenter, no_show=args.no_show) if key == 27: break print(presenter.reportMeans()) if positions: compute_metrics(positions)
def main(): args = build_argparser().parse_args() cap = open_images_capture(args.input, args.loop) next_frame_id = 1 next_frame_id_to_show = 0 metrics = PerformanceMetrics() render_metrics = PerformanceMetrics() video_writer = cv2.VideoWriter() plugin_config = get_user_config(args.device, args.num_streams, args.num_threads) model_adapter = OpenvinoAdapter(create_core(), args.model, device=args.device, plugin_config=plugin_config, max_num_requests=args.num_infer_requests) start_time = perf_counter() frame = cap.read() if frame is None: raise RuntimeError("Can't read an image from the input") config = { 'target_size': args.tsize, 'aspect_ratio': frame.shape[1] / frame.shape[0], 'prob_threshold': args.prob_threshold, 'padding_mode': 'center' if args.architecture_type == 'higherhrnet' else None, # the 'higherhrnet' and 'ae' specific 'delta': 0.5 if args.architecture_type == 'higherhrnet' else None, # the 'higherhrnet' and 'ae' specific } model = ImageModel.create_model(ARCHITECTURES[args.architecture_type], model_adapter, config) model.log_layers_info() hpe_pipeline = AsyncPipeline(model) hpe_pipeline.submit_data(frame, 0, { 'frame': frame, 'start_time': start_time }) output_transform = OutputTransform(frame.shape[:2], args.output_resolution) if args.output_resolution: output_resolution = output_transform.new_resolution else: output_resolution = (frame.shape[1], frame.shape[0]) presenter = monitors.Presenter( args.utilization_monitors, 55, (round(output_resolution[0] / 4), round(output_resolution[1] / 8))) if args.output and not video_writer.open(args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), output_resolution): raise RuntimeError("Can't open video writer") while True: if hpe_pipeline.callback_exceptions: raise hpe_pipeline.callback_exceptions[0] # Process all completed requests results = hpe_pipeline.get_result(next_frame_id_to_show) if results: (poses, scores), frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] if len(poses) and args.raw_output_message: print_raw_results(poses, scores, next_frame_id_to_show) presenter.drawGraphs(frame) rendering_start_time = perf_counter() frame = draw_poses(frame, poses, args.prob_threshold, output_transform) render_metrics.update(rendering_start_time) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) next_frame_id_to_show += 1 if not args.no_show: cv2.imshow('Pose estimation results', frame) key = cv2.waitKey(1) ESC_KEY = 27 # Quit.
if key in {ord('q'), ord('Q'), ESC_KEY}: break presenter.handleKey(key) continue if hpe_pipeline.is_ready(): # Get new image/frame start_time = perf_counter() frame = cap.read() if frame is None: break # Submit for inference hpe_pipeline.submit_data(frame, next_frame_id, { 'frame': frame, 'start_time': start_time }) next_frame_id += 1 else: # Wait for empty request hpe_pipeline.await_any() hpe_pipeline.await_all() # Process completed requests for next_frame_id_to_show in range(next_frame_id_to_show, next_frame_id): results = hpe_pipeline.get_result(next_frame_id_to_show) while results is None: results = hpe_pipeline.get_result(next_frame_id_to_show) (poses, scores), frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] if len(poses) and args.raw_output_message: print_raw_results(poses, scores, next_frame_id_to_show) presenter.drawGraphs(frame) rendering_start_time = perf_counter() frame = draw_poses(frame, poses, args.prob_threshold, output_transform) render_metrics.update(rendering_start_time) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) if not args.no_show: cv2.imshow('Pose estimation results', frame) key = cv2.waitKey(1) ESC_KEY = 27 # Quit. if key in {ord('q'), ord('Q'), ESC_KEY}: break presenter.handleKey(key) metrics.log_total() log_latency_per_stage(cap.reader_metrics.get_latency(), hpe_pipeline.preprocess_metrics.get_latency(), hpe_pipeline.inference_metrics.get_latency(), hpe_pipeline.postprocess_metrics.get_latency(), render_metrics.get_latency()) for rep in presenter.reportMeans(): log.info(rep)
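# OutputTransform above rescales frames (and drawn coordinates) to the requested
# --output_resolution. A rough sketch of that behaviour, written as an assumption rather
# than the demo's actual class:
import cv2

class OutputTransformSketch:
    def __init__(self, input_hw, target_resolution=None):
        h, w = input_hw
        # target_resolution is assumed to be (width, height); keep the input aspect ratio.
        self.scale = 1.0 if not target_resolution else min(target_resolution[0] / w,
                                                           target_resolution[1] / h)
        self.new_resolution = (int(w * self.scale), int(h * self.scale))

    def resize(self, frame):
        return frame if self.scale == 1.0 else cv2.resize(frame, self.new_resolution)

    def scale_points(self, points):
        return [(int(x * self.scale), int(y * self.scale)) for x, y in points]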
def main(): parser = argparse.ArgumentParser(description='Whiteboard inpainting demo') parser.add_argument('-i', type=str, help='Input sources (index of camera \ or path to a video file)', required=True) parser.add_argument('-m_i', '--m_instance_segmentation', type=str, required=False, help='Path to the instance segmentation model') parser.add_argument('-m_s', '--m_semantic_segmentation', type=str, required=False, help='Path to the semantic segmentation model') parser.add_argument('-t', '--threshold', type=float, default=0.6, help='Threshold for person instance segmentation model') parser.add_argument('--output_video', type=str, default='', required=False, help='Optional. Path to output video') parser.add_argument("--no_show", help="Optional. Don't show output", action='store_true') parser.add_argument('-d', '--device', type=str, default='CPU', help='Optional. Specify a target device to infer on. CPU, GPU, FPGA, HDDL or MYRIAD is ' 'acceptable. The demo will look for a suitable plugin for the device specified') parser.add_argument('-l', '--cpu_extension', type=str, default=None, help='MKLDNN (CPU)-targeted custom layers.Absolute \ path to a shared library with the kernels impl.') parser.add_argument('-u', '--utilization_monitors', default='', type=str, help='Optional. List of monitors to show initially') args = parser.parse_args() capture = VideoCapture(args.i) if bool(args.m_instance_segmentation) == bool(args.m_semantic_segmentation): raise ValueError('Set up exactly one of segmentation models: '\ '--m_instance_segmentation or --m_semantic_segmentation') frame_size, fps = capture.get_source_parameters() out_frame_size = (int(frame_size[0]), int(frame_size[1] * 2)) presenter = monitors.Presenter(args.utilization_monitors, 20, (out_frame_size[0] // 4, out_frame_size[1] // 16)) root_dir = osp.dirname(osp.abspath(__file__)) mouse = MouseClick() if not args.no_show: cv2.namedWindow(WINNAME) cv2.setMouseCallback(WINNAME, mouse.get_points) if args.output_video: fourcc = cv2.VideoWriter_fourcc(*'XVID') output_video = cv2.VideoWriter(args.output_video, fourcc, fps, out_frame_size) else: output_video = None log.info("Initializing Inference Engine") ie = IECore() if args.m_instance_segmentation: labels_file = osp.join(root_dir, 'coco_labels.txt') segmentation = MaskRCNN(ie, args.m_instance_segmentation, labels_file, args.threshold, args.device, args.cpu_extension) elif args.m_semantic_segmentation: labels_file = osp.join(root_dir, 'cityscapes_labels.txt') segmentation = SemanticSegmentation(ie, args.m_semantic_segmentation, labels_file, args.threshold, args.device, args.cpu_extension) black_board = False output_frame = np.full((frame_size[1], frame_size[0], 3), 255, dtype='uint8') frame_number = 0 key = -1 while True: start = time.time() _, frame = capture.get_frame() mask = None if frame is not None: detections = segmentation.get_detections([frame]) expand_mask(detections, frame_size[0] // 27) if len(detections[0]) > 0: mask = detections[0][0][2] for i in range(1, len(detections[0])): mask = cv2.bitwise_or(mask, detections[0][i][2]) else: break if mask is not None: mask = np.stack([mask, mask, mask], axis=-1) else: mask = np.zeros(frame.shape, dtype='uint8') clear_frame = remove_background(frame, invert_colors=not black_board) output_frame = np.where(mask, output_frame, clear_frame) merged_frame = np.vstack([frame, output_frame]) merged_frame = cv2.resize(merged_frame, out_frame_size) if output_video is not None: output_video.write(merged_frame) presenter.drawGraphs(merged_frame) if not 
args.no_show: cv2.imshow(WINNAME, merged_frame) key = check_pressed_keys(key) if key == 27: # 'Esc' break if key == ord('i'): # catch pressing of key 'i' black_board = not black_board output_frame = 255 - output_frame else: presenter.handleKey(key) if mouse.crop_available: x0, x1 = min(mouse.points[0][0], mouse.points[1][0]), \ max(mouse.points[0][0], mouse.points[1][0]) y0, y1 = min(mouse.points[0][1], mouse.points[1][1]), \ max(mouse.points[0][1], mouse.points[1][1]) x1, y1 = min(x1, output_frame.shape[1] - 1), min(y1, output_frame.shape[0] - 1) board = output_frame[y0: y1, x0: x1, :] if board.shape[0] > 0 and board.shape[1] > 0: cv2.namedWindow('Board', cv2.WINDOW_KEEPRATIO) cv2.imshow('Board', board) end = time.time() print('\rProcessing frame: {}, fps = {:.3}' \ .format(frame_number, 1. / (end - start)), end="") frame_number += 1 print('') log.info(presenter.reportMeans()) if output_video is not None: output_video.release()
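# remove_background() above keeps only the pen strokes of the board. A rough, hypothetical
# take on that step (not the demo's implementation): threshold the strokes adaptively and
# paste them onto a clean white (or black, when inverted) canvas.
import cv2
import numpy as np

def remove_background_sketch(frame, invert_colors=True):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # Adaptive thresholding separates dark strokes from the unevenly lit board surface.
    strokes = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY_INV, 21, 10)
    canvas = np.full_like(frame, 255 if invert_colors else 0)
    canvas[strokes > 0] = frame[strokes > 0] if invert_colors else 255 - frame[strokes > 0]
    return canvas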
def main(): log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) args = build_argparser().parse_args() # Plugin initialization for specified device and load extensions library if specified. log.info('Creating Inference Engine...') ie = IECore() if args.cpu_extension and 'CPU' in args.device: ie.add_extension(args.cpu_extension, 'CPU') # Read IR log.info('Loading network') net = ie.read_network(args.model, os.path.splitext(args.model)[0] + '.bin') image_input, image_info_input, (n, c, h, w), postprocessor = check_model(net) log.info('Loading IR to the plugin...') exec_net = ie.load_network(network=net, device_name=args.device, num_requests=2) try: input_source = int(args.input_source) except ValueError: input_source = args.input_source cap = cv2.VideoCapture(input_source) if not cap.isOpened(): log.error('Failed to open "{}"'.format(args.input_source)) cap.set(cv2.CAP_PROP_BUFFERSIZE, 1) if args.no_track: tracker = None else: tracker = StaticIOUTracker() with open(args.labels, 'rt') as labels_file: class_labels = labels_file.read().splitlines() presenter = monitors.Presenter(args.utilization_monitors, 45, (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH) / 4), round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) / 8))) visualizer = Visualizer(class_labels, show_boxes=args.show_boxes, show_scores=args.show_scores) render_time = 0 log.info('Starting inference...') print("To close the application, press 'CTRL+C' here or switch to the output window and press ESC key") while cap.isOpened(): ret, frame = cap.read() if not ret: break if args.no_keep_aspect_ratio: # Resize the image to a target size. scale_x = w / frame.shape[1] scale_y = h / frame.shape[0] input_image = cv2.resize(frame, (w, h)) else: # Resize the image to keep the same aspect ratio and to fit it to a window of a target size. scale_x = scale_y = min(h / frame.shape[0], w / frame.shape[1]) input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y) input_image_size = input_image.shape[:2] input_image = np.pad(input_image, ((0, h - input_image_size[0]), (0, w - input_image_size[1]), (0, 0)), mode='constant', constant_values=0) # Change data layout from HWC to CHW. input_image = input_image.transpose((2, 0, 1)) input_image = input_image.reshape((n, c, h, w)).astype(np.float32) input_image_info = np.asarray([[input_image_size[0], input_image_size[1], 1]], dtype=np.float32) # Run the net. inf_start = time.time() feed_dict = {image_input: input_image} if image_info_input: feed_dict[image_info_input] = input_image_info outputs = exec_net.infer(feed_dict) inf_end = time.time() det_time = inf_end - inf_start # Parse detection results of the current request scores, classes, boxes, masks = postprocessor( outputs, scale_x, scale_y, *frame.shape[:2], h, w, args.prob_threshold) render_start = time.time() if len(boxes) and args.raw_output_message: log.info('Detected boxes:') log.info(' Class ID | Confidence | XMIN | YMIN | XMAX | YMAX ') for box, cls, score, mask in zip(boxes, classes, scores, masks): log.info('{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} '.format(cls, score, *box)) # Get instance track IDs. masks_tracks_ids = None if tracker is not None: masks_tracks_ids = tracker(masks, classes) # Visualize masks. frame = visualizer(frame, boxes, classes, scores, presenter, masks, masks_tracks_ids) # Draw performance stats. 
inf_time_message = 'Inference time: {:.3f} ms'.format(det_time * 1000) render_time_message = 'OpenCV rendering time: {:.3f} ms'.format(render_time * 1000) cv2.putText(frame, inf_time_message, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1) cv2.putText(frame, render_time_message, (15, 30), cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1) # Print performance counters. if args.perf_counts: perf_counts = exec_net.requests[0].get_perf_counts() log.info('Performance counters:') print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format('name', 'layer_type', 'exec_type', 'status', 'real_time, us')) for layer, stats in perf_counts.items(): print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format(layer, stats['layer_type'], stats['exec_type'], stats['status'], stats['real_time'])) if not args.no_show: # Show resulting image. cv2.imshow('Results', frame) render_end = time.time() render_time = render_end - render_start if not args.no_show: key = cv2.waitKey(args.delay) esc_code = 27 if key == esc_code: break presenter.handleKey(key) print(presenter.reportMeans()) cv2.destroyAllWindows() cap.release()
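# The "keep aspect ratio" branch above resizes the frame to fit the network window, pads
# the remainder with zeros and switches the layout to NCHW. The same steps as a standalone
# helper (an illustrative refactoring of the code above, not a library function):
import cv2
import numpy as np

def letterbox_nchw(frame, h, w):
    scale = min(h / frame.shape[0], w / frame.shape[1])
    resized = cv2.resize(frame, None, fx=scale, fy=scale)
    padded = np.pad(resized,
                    ((0, h - resized.shape[0]), (0, w - resized.shape[1]), (0, 0)),
                    mode='constant', constant_values=0)
    # HWC -> NCHW, float32, batch of one; return the scale so boxes can be mapped back.
    return padded.transpose((2, 0, 1))[np.newaxis].astype(np.float32), scale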
def main(): args = build_argparser().parse_args() # ------------- 1. Plugin initialization for specified device and load extensions library if specified ------------- log.info("Creating Inference Engine...") ie = IECore() config_user_specified = {} config_min_latency = {} devices_nstreams = {} if args.num_streams: devices_nstreams = {device: args.num_streams for device in ['CPU', 'GPU'] if device in args.device} \ if args.num_streams.isdigit() \ else dict([device.split(':') for device in args.num_streams.split(',')]) if 'CPU' in args.device: if args.cpu_extension: ie.add_extension(args.cpu_extension, 'CPU') if args.number_threads is not None: config_user_specified['CPU_THREADS_NUM'] = str(args.number_threads) if 'CPU' in devices_nstreams: config_user_specified['CPU_THROUGHPUT_STREAMS'] = devices_nstreams['CPU'] \ if int(devices_nstreams['CPU']) > 0 \ else 'CPU_THROUGHPUT_AUTO' config_min_latency['CPU_THROUGHPUT_STREAMS'] = '1' if 'GPU' in args.device: if 'GPU' in devices_nstreams: config_user_specified['GPU_THROUGHPUT_STREAMS'] = devices_nstreams['GPU'] \ if int(devices_nstreams['GPU']) > 0 \ else 'GPU_THROUGHPUT_AUTO' config_min_latency['GPU_THROUGHPUT_STREAMS'] = '1' # -------------------- 2. Reading the IR generated by the Model Optimizer (.xml and .bin files) -------------------- log.info("Loading network") net = ie.read_network(args.model, os.path.splitext(args.model)[0] + ".bin") # ---------------------------------- 3. Load CPU extension for support specific layer ------------------------------ if "CPU" in args.device: supported_layers = ie.query_network(net, "CPU") not_supported_layers = [ l for l in net.layers.keys() if l not in supported_layers ] if len(not_supported_layers) != 0: log.error( "Following layers are not supported by the plugin for specified device {}:\n {}" .format(args.device, ', '.join(not_supported_layers))) log.error( "Please try to specify cpu extensions library path in sample's command line parameters using -l " "or --cpu_extension command line argument") sys.exit(1) assert len( net.input_info ) == 1, "Sample supports only YOLO V3 based single input topologies" # ---------------------------------------------- 4. Preparing inputs ----------------------------------------------- log.info("Preparing inputs") input_blob = next(iter(net.input_info)) # Read and pre-process input images if net.input_info[input_blob].input_data.shape[1] == 3: input_height, input_width = net.input_info[ input_blob].input_data.shape[2:] nchw_shape = True else: input_height, input_width = net.input_info[ input_blob].input_data.shape[1:3] nchw_shape = False if args.labels: with open(args.labels, 'r') as f: labels_map = [x.strip() for x in f] else: labels_map = None input_stream = 0 if args.input == "cam" else args.input mode = Mode(Modes.USER_SPECIFIED) cap = cv2.VideoCapture(input_stream) wait_key_time = 1 # ----------------------------------------- 5. 
Loading model to the plugin ----------------------------------------- log.info("Loading model to the plugin") exec_nets = {} exec_nets[Modes.USER_SPECIFIED] = ie.load_network( network=net, device_name=args.device, config=config_user_specified, num_requests=args.num_infer_requests) exec_nets[Modes.MIN_LATENCY] = ie.load_network( network=net, device_name=args.device.split(":")[-1].split(",")[0], config=config_min_latency, num_requests=1) empty_requests = deque(exec_nets[mode.current].requests) completed_request_results = {} next_frame_id = 0 next_frame_id_to_show = 0 mode_info = {mode.current: ModeInfo()} event = threading.Event() callback_exceptions = [] # ----------------------------------------------- 6. Doing inference ----------------------------------------------- log.info("Starting inference...") print( "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key" ) print( "To switch between min_latency/user_specified modes, press TAB key in the output window" ) presenter = monitors.Presenter( args.utilization_monitors, 55, (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH) / 4), round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) / 8))) while (cap.isOpened() \ or completed_request_results \ or len(empty_requests) < len(exec_nets[mode.current].requests)) \ and not callback_exceptions: if next_frame_id_to_show in completed_request_results: frame, output, start_time, is_same_mode = completed_request_results.pop( next_frame_id_to_show) next_frame_id_to_show += 1 if is_same_mode: mode_info[mode.current].frames_count += 1 objects = get_objects(output, net, (input_height, input_width), frame.shape[:-1], args.prob_threshold, args.keep_aspect_ratio) objects = filter_objects(objects, args.iou_threshold, args.prob_threshold) if len(objects) and args.raw_output_message: log.info( " Class ID | Confidence | XMIN | YMIN | XMAX | YMAX | COLOR " ) origin_im_size = frame.shape[:-1] presenter.drawGraphs(frame) for obj in objects: # Validation bbox of detected object obj['xmax'] = min(obj['xmax'], origin_im_size[1]) obj['ymax'] = min(obj['ymax'], origin_im_size[0]) obj['xmin'] = max(obj['xmin'], 0) obj['ymin'] = max(obj['ymin'], 0) color = (min(obj['class_id'] * 12.5, 255), min(obj['class_id'] * 7, 255), min(obj['class_id'] * 5, 255)) det_label = labels_map[obj['class_id']] if labels_map and len(labels_map) >= obj['class_id'] else \ str(obj['class_id']) if args.raw_output_message: log.info( "{:^9} | {:10f} | {:4} | {:4} | {:4} | {:4} | {} ". 
format(det_label, obj['confidence'], obj['xmin'], obj['ymin'], obj['xmax'], obj['ymax'], color)) cv2.rectangle(frame, (obj['xmin'], obj['ymin']), (obj['xmax'], obj['ymax']), color, 2) cv2.putText( frame, "#" + det_label + ' ' + str(round(obj['confidence'] * 100, 1)) + ' %', (obj['xmin'], obj['ymin'] - 7), cv2.FONT_HERSHEY_COMPLEX, 0.6, color, 1) # Draw performance stats over frame if mode_info[mode.current].frames_count != 0: fps_message = "FPS: {:.1f}".format(mode_info[mode.current].frames_count / \ (perf_counter() - mode_info[mode.current].last_start_time)) mode_info[ mode.current].latency_sum += perf_counter() - start_time latency_message = "Latency: {:.1f} ms".format((mode_info[mode.current].latency_sum / \ mode_info[mode.current].frames_count) * 1e3) put_highlighted_text(frame, fps_message, (15, 20), cv2.FONT_HERSHEY_COMPLEX, 0.75, (200, 10, 10), 2) put_highlighted_text(frame, latency_message, (15, 50), cv2.FONT_HERSHEY_COMPLEX, 0.75, (200, 10, 10), 2) mode_message = "{} mode".format(mode.current.name) put_highlighted_text(frame, mode_message, (10, int(origin_im_size[0] - 20)), cv2.FONT_HERSHEY_COMPLEX, 0.75, (10, 10, 200), 2) if not args.no_show: cv2.imshow("Detection Results", frame) #print(frame.shape) cv2.imwrite("prob_threshold-0.2.png", frame) key = cv2.waitKey(wait_key_time) if key in {ord("q"), ord("Q"), 27}: # ESC key break if key == 9: # Tab key prev_mode = mode.current mode.next() await_requests_completion(exec_nets[prev_mode].requests) empty_requests.clear() empty_requests.extend(exec_nets[mode.current].requests) mode_info[prev_mode].last_end_time = perf_counter() mode_info[mode.current] = ModeInfo() else: presenter.handleKey(key) elif empty_requests and cap.isOpened(): start_time = perf_counter() ret, frame = cap.read() if not ret: if args.loop_input: cap.open(input_stream) else: cap.release() continue request = empty_requests.popleft() # resize input_frame to network size in_frame = preprocess_frame(frame, input_height, input_width, nchw_shape, args.keep_aspect_ratio) # Start inference request.set_completion_callback( py_callback=async_callback, py_data=(request, next_frame_id, mode.current, frame, start_time, completed_request_results, empty_requests, mode, event, callback_exceptions)) request.async_infer(inputs={input_blob: in_frame}) next_frame_id += 1 else: event.wait() if callback_exceptions: raise callback_exceptions[0] for mode_value in mode_info.keys(): log.info("") log.info("Mode: {}".format(mode_value.name)) end_time = mode_info[mode_value].last_end_time if mode_value in mode_info \ and mode_info[mode_value].last_end_time is not None \ else perf_counter() log.info("FPS: {:.1f}".format(mode_info[mode_value].frames_count / \ (end_time - mode_info[mode_value].last_start_time))) log.info("Latency: {:.1f} ms".format((mode_info[mode_value].latency_sum / \ mode_info[mode_value].frames_count) * 1e3)) print(presenter.reportMeans()) for exec_net in exec_nets.values(): await_requests_completion(exec_net.requests)
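# The FPS/latency bookkeeping above (frames_count, latency_sum, last_start_time) can be read
# as a small stats object. Below is an illustrative refactoring of that bookkeeping; the
# names are mine, not the demo's ModeInfo class.
from time import perf_counter

class ModeStatsSketch:
    def __init__(self):
        self.frames_count = 0
        self.latency_sum = 0.0
        self.last_start_time = perf_counter()

    def update(self, frame_start_time):
        self.frames_count += 1
        self.latency_sum += perf_counter() - frame_start_time

    def fps(self):
        elapsed = perf_counter() - self.last_start_time
        return self.frames_count / elapsed if elapsed > 0 else 0.0

    def mean_latency_ms(self):
        return (self.latency_sum / self.frames_count) * 1e3 if self.frames_count else 0.0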
def main(): args = build_argparser().parse_args() cap = open_images_capture(args.input, args.loop) place_recognition = PlaceRecognition(args.model, args.device, args.gallery_folder, args.gallery_size) compute_embeddings_times = [] search_in_gallery_times = [] frames_processed = 0 presenter = monitors.Presenter(args.utilization_monitors, 0) video_writer = cv2.VideoWriter() metrics = PerformanceMetrics() while True: start_time = perf_counter() frame = cap.read() if frame is None: if frames_processed == 0: raise ValueError("Can't read an image from the input") break elapsed, probe_embedding = time_elapsed( place_recognition.compute_embedding, frame) compute_embeddings_times.append(elapsed) elapsed, (sorted_indexes, distances) = time_elapsed( place_recognition.search_in_gallery, probe_embedding) search_in_gallery_times.append(elapsed) image, key = visualize( frame, [str(place_recognition.impaths[i]) for i in sorted_indexes], distances[sorted_indexes], place_recognition.input_size, np.mean(compute_embeddings_times), np.mean(search_in_gallery_times), imshow_delay=3, presenter=presenter, no_show=args.no_show) metrics.update(start_time) if frames_processed == 0: if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), (image.shape[1], image.shape[0])): raise RuntimeError("Can't open video writer") frames_processed += 1 if video_writer.isOpened() and (args.output_limit <= 0 or frames_processed <= args.output_limit): video_writer.write(image) if key == 27: break metrics.log_total() for rep in presenter.reportMeans(): log.info(rep)
video_writer = cv2.VideoWriter() if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), (frame.shape[1], frame.shape[0])): raise RuntimeError("Can't open video writer") base_height = args.height_size fx = args.fx frames_processed = 0 delay = 1 esc_code = 27 p_code = 112 space_code = 32 mean_time = 0 presenter = monitors.Presenter(args.utilization_monitors, 0) while frame is not None: current_time = cv2.getTickCount() input_scale = base_height / frame.shape[0] scaled_img = cv2.resize(frame, dsize=None, fx=input_scale, fy=input_scale) if fx < 0: # Focal length is unknown fx = np.float32(0.8 * frame.shape[1]) inference_result = inference_engine.infer(scaled_img) poses_3d, poses_2d = parse_poses(inference_result, input_scale, stride, fx, is_video) edges = []
def main(): metrics = PerformanceMetrics() args = build_argparser().parse_args() # Plugin initialization for specified device and load extensions library if specified log.info("Creating Inference Engine") ie = IECore() # Read IR log.info("Loading network files:\n\t{}".format(args.model_pnet)) p_net = ie.read_network(args.model_pnet) assert len(p_net.input_info.keys() ) == 1, "Pnet supports only single input topologies" assert len(p_net.outputs) == 2, "Pnet supports two output topologies" log.info("Loading network files:\n\t{}".format(args.model_rnet)) r_net = ie.read_network(args.model_rnet) assert len(r_net.input_info.keys() ) == 1, "Rnet supports only single input topologies" assert len(r_net.outputs) == 2, "Rnet supports two output topologies" log.info("Loading network files:\n\t{}".format(args.model_onet)) o_net = ie.read_network(args.model_onet) assert len(o_net.input_info.keys() ) == 1, "Onet supports only single input topologies" assert len(o_net.outputs) == 3, "Onet supports three output topologies" log.info("Preparing input blobs") pnet_input_blob = next(iter(p_net.input_info)) rnet_input_blob = next(iter(r_net.input_info)) onet_input_blob = next(iter(o_net.input_info)) log.info("Preparing output blobs") for name, blob in p_net.outputs.items(): if blob.shape[1] == 2: pnet_cls_name = name elif blob.shape[1] == 4: pnet_roi_name = name else: raise RuntimeError("Unsupported output layer for Pnet") for name, blob in r_net.outputs.items(): if blob.shape[1] == 2: rnet_cls_name = name elif blob.shape[1] == 4: rnet_roi_name = name else: raise RuntimeError("Unsupported output layer for Rnet") for name, blob in o_net.outputs.items(): if blob.shape[1] == 2: onet_cls_name = name elif blob.shape[1] == 4: onet_roi_name = name elif blob.shape[1] == 10: onet_pts_name = name else: raise RuntimeError("Unsupported output layer for Onet") cap = open_images_capture(args.input, args.loop) next_frame_id = 0 log.info('Starting inference...') print( "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key" ) presenter = None video_writer = cv2.VideoWriter() while True: start_time = perf_counter() origin_image = cap.read() if origin_image is None: if next_frame_id == 0: raise ValueError("Can't read an image from the input") break if next_frame_id == 0: presenter = monitors.Presenter(args.utilization_monitors, 55, (round(origin_image.shape[1] / 4), round(origin_image.shape[0] / 8))) if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), (origin_image.shape[1], origin_image.shape[0])): raise RuntimeError("Can't open video writer") next_frame_id += 1 rgb_image = cv2.cvtColor(origin_image, cv2.COLOR_BGR2RGB) oh, ow, _ = rgb_image.shape scales = utils.calculate_scales(rgb_image) # ************************************* # Pnet stage # ************************************* log.info("Loading Pnet model to the plugin") t0 = cv2.getTickCount() pnet_res = [] for scale in scales: hs = int(oh * scale) ws = int(ow * scale) image = preprocess_image(rgb_image, ws, hs) p_net.reshape({pnet_input_blob: [1, 3, ws, hs]}) # Change weidth and height of input blob exec_pnet = ie.load_network(network=p_net, device_name=args.device) p_res = exec_pnet.infer(inputs={pnet_input_blob: image}) pnet_res.append(p_res) image_num = len(scales) rectangles = [] for i in range(image_num): roi = pnet_res[i][pnet_roi_name] cls = pnet_res[i][pnet_cls_name] _, _, out_h, out_w = cls.shape out_side = max(out_h, out_w) rectangle = utils.detect_face_12net(cls[0][1], 
roi[0], out_side, 1 / scales[i], ow, oh, score_threshold[0], iou_threshold[0]) rectangles.extend(rectangle) rectangles = utils.NMS(rectangles, iou_threshold[1], 'iou') # Rnet stage if len(rectangles) > 0: log.info("Loading Rnet model to the plugin") r_net.reshape({rnet_input_blob: [len(rectangles), 3, 24, 24]}) # Change batch size of input blob exec_rnet = ie.load_network(network=r_net, device_name=args.device) rnet_input = [] for rectangle in rectangles: crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]), int(rectangle[0]):int(rectangle[2])] crop_img = preprocess_image(crop_img, 24, 24) rnet_input.extend(crop_img) rnet_res = exec_rnet.infer(inputs={rnet_input_blob: rnet_input}) roi = rnet_res[rnet_roi_name] cls = rnet_res[rnet_cls_name] rectangles = utils.filter_face_24net(cls, roi, rectangles, ow, oh, score_threshold[1], iou_threshold[2]) # Onet stage if len(rectangles) > 0: log.info("Loading Onet model to the plugin") o_net.reshape({onet_input_blob: [len(rectangles), 3, 48, 48]}) # Change batch size of input blob exec_onet = ie.load_network(network=o_net, device_name=args.device) onet_input = [] for rectangle in rectangles: crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]), int(rectangle[0]):int(rectangle[2])] crop_img = preprocess_image(crop_img, 48, 48) onet_input.extend(crop_img) onet_res = exec_onet.infer(inputs={onet_input_blob: onet_input}) roi = onet_res[onet_roi_name] cls = onet_res[onet_cls_name] pts = onet_res[onet_pts_name] rectangles = utils.filter_face_48net(cls, roi, pts, rectangles, ow, oh, score_threshold[2], iou_threshold[3]) # display results for rectangle in rectangles: # Draw detected boxes cv2.putText(origin_image, 'confidence: {:.2f}'.format(rectangle[4]), (int(rectangle[0]), int(rectangle[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0)) cv2.rectangle(origin_image, (int(rectangle[0]), int(rectangle[1])), (int(rectangle[2]), int(rectangle[3])), (255, 0, 0), 1) # Draw landmarks for i in range(5, 15, 2): cv2.circle(origin_image, (int(rectangle[i + 0]), int(rectangle[i + 1])), 2, (0, 255, 0)) infer_time = (cv2.getTickCount() - t0) / cv2.getTickFrequency() # Record infer time cv2.putText(origin_image, 'summary: {:.1f} FPS'.format(1.0 / infer_time), (5, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 200)) if video_writer.isOpened() and (args.output_limit <= 0 or next_frame_id <= args.output_limit - 1): video_writer.write(origin_image) if not args.no_show: cv2.imshow('MTCNN Results', origin_image) key = cv2.waitKey(1) if key in {ord('q'), ord('Q'), 27}: break presenter.handleKey(key) metrics.update(start_time, origin_image) metrics.print_total()
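# utils.calculate_scales() above builds the image pyramid for the P-Net stage. A hedged
# sketch of what such a helper typically computes for MTCNN (an assumption, not the demo's
# utils module): scale factors from the minimum face size down to the 12x12 P-Net window.
def calculate_scales_sketch(image, min_face_size=20.0, factor=0.709, pnet_size=12):
    scale = pnet_size / min_face_size
    min_side = min(image.shape[:2]) * scale
    scales = []
    while min_side >= pnet_size:
        scales.append(scale)
        scale *= factor
        min_side *= factor
    return scales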
def main(): args = build_argparser().parse_args() metrics = PerformanceMetrics() log.info('Initializing Inference Engine...') ie = IECore() plugin_config = get_plugin_configs(args.device, args.num_streams, args.num_threads) cap = open_images_capture(args.input, args.loop) start_time = perf_counter() frame = cap.read() if frame is None: raise RuntimeError("Can't read an image from the input") log.info('Loading network...') model = get_model(ie, args, frame.shape[1] / frame.shape[0]) hpe_pipeline = AsyncPipeline(ie, model, plugin_config, device=args.device, max_num_requests=args.num_infer_requests) log.info('Starting inference...') hpe_pipeline.submit_data(frame, 0, { 'frame': frame, 'start_time': start_time }) next_frame_id = 1 next_frame_id_to_show = 0 presenter = monitors.Presenter( args.utilization_monitors, 55, (round(frame.shape[1] / 4), round(frame.shape[0] / 8))) video_writer = cv2.VideoWriter() if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), (frame.shape[1], frame.shape[0])): raise RuntimeError("Can't open video writer") print( "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key" ) while True: if hpe_pipeline.callback_exceptions: raise hpe_pipeline.callback_exceptions[0] # Process all completed requests results = hpe_pipeline.get_result(next_frame_id_to_show) if results: (poses, scores), frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] if len(poses) and args.raw_output_message: print_raw_results(poses, scores) presenter.drawGraphs(frame) frame = draw_poses(frame, poses, args.prob_threshold) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) if not args.no_show: cv2.imshow('Pose estimation results', frame) key = cv2.waitKey(1) ESC_KEY = 27 # Quit. if key in {ord('q'), ord('Q'), ESC_KEY}: break presenter.handleKey(key) next_frame_id_to_show += 1 continue if hpe_pipeline.is_ready(): # Get new image/frame start_time = perf_counter() frame = cap.read() if frame is None: break # Submit for inference hpe_pipeline.submit_data(frame, next_frame_id, { 'frame': frame, 'start_time': start_time }) next_frame_id += 1 else: # Wait for empty request hpe_pipeline.await_any() hpe_pipeline.await_all() # Process completed requests while hpe_pipeline.has_completed_request(): results = hpe_pipeline.get_result(next_frame_id_to_show) if results: (poses, scores), frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] if len(poses) and args.raw_output_message: print_raw_results(poses, scores) presenter.drawGraphs(frame) frame = draw_poses(frame, poses, args.prob_threshold) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) if not args.no_show: cv2.imshow('Pose estimation results', frame) key = cv2.waitKey(1) ESC_KEY = 27 # Quit. if key in {ord('q'), ord('Q'), ESC_KEY}: break presenter.handleKey(key) next_frame_id_to_show += 1 else: break metrics.print_total() print(presenter.reportMeans())
input_source = int(args.input) except ValueError: input_source = args.input cap = cv.VideoCapture(input_source) if not cap.isOpened(): raise RuntimeError("Failed to open '{}'".format(input_source)) color_coeff = np.load(coeffs).astype(np.float32) assert color_coeff.shape == ( 313, 2), "Current shape of color coefficients does not match required shape" imshowSize = (640, 480) graphSize = (imshowSize[0] // 2, imshowSize[1] // 4) presenter = monitors.Presenter(args.utilization_monitors, imshowSize[1] * 2 - graphSize[1], graphSize) while True: log.debug("#############################") hasFrame, original_frame = cap.read() if not hasFrame: break (h_orig, w_orig) = original_frame.shape[:2] log.debug("Preprocessing frame") if original_frame.shape[2] > 1: frame = cv.cvtColor(cv.cvtColor(original_frame, cv.COLOR_BGR2GRAY), cv.COLOR_GRAY2RGB) else: frame = cv.cvtColor(original_frame, cv.COLOR_GRAY2RGB)
def run_demo(args): ie = IECore() detector_person = Detector(ie, path_to_model_xml=args.model_od, device=args.device, label_class=args.person_label) single_human_pose_estimator = HumanPoseEstimator( ie, path_to_model_xml=args.model_hpe, device=args.device) cap = open_images_capture(args.input, args.loop) frame = cap.read() if frame is None: raise RuntimeError("Can't read an image from the input") delay = int(cap.get_type() in ('VIDEO', 'CAMERA')) video_writer = cv2.VideoWriter() if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), (frame.shape[1], frame.shape[0])): raise RuntimeError("Can't open video writer") frames_processed = 0 presenter = monitors.Presenter(args.utilization_monitors, 25) while frame is not None: bboxes = detector_person.detect(frame) human_poses = [ single_human_pose_estimator.estimate(frame, bbox) for bbox in bboxes ] presenter.drawGraphs(frame) colors = [(0, 0, 255), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0)] for pose, bbox in zip(human_poses, bboxes): cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[0] + bbox[2], bbox[1] + bbox[3]), (255, 0, 0), 2) for id_kpt, kpt in enumerate(pose): cv2.circle(frame, (int(kpt[0]), int(kpt[1])), 3, colors[id_kpt], -1) cv2.putText( frame, 'summary: {:.1f} FPS (estimation: {:.1f} FPS / detection: {:.1f} FPS)' .format( float(1 / (detector_person.infer_time + single_human_pose_estimator.infer_time * len(human_poses))), float(1 / single_human_pose_estimator.infer_time), float(1 / detector_person.infer_time)), (5, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 200)) frames_processed += 1 if video_writer.isOpened() and (args.output_limit <= 0 or frames_processed <= args.output_limit): video_writer.write(frame) if not args.no_show: cv2.imshow('Human Pose Estimation Demo', frame) key = cv2.waitKey(delay) if key == 27: break presenter.handleKey(key) frame = cap.read() print(presenter.reportMeans())
def main(): args = build_argparser().parse_args() if args.labels: with open(args.labels) as f: labels = [line.strip() for line in f] else: labels = None log.info('OpenVINO Runtime') log.info('\tbuild: {}'.format(get_version())) core = Core() if 'MYRIAD' in args.device: myriad_config = {'MYRIAD_ENABLE_HW_ACCELERATION': 'YES'} core.set_property('MYRIAD', myriad_config) decoder_target_device = 'CPU' if args.device != 'CPU': encoder_target_device = args.device else: encoder_target_device = decoder_target_device models = [ IEModel(args.m_encoder, core, encoder_target_device, model_type='Action Recognition Encoder', num_requests=(3 if args.device == 'MYRIAD' else 1)) ] if args.architecture_type == 'en-de': if args.m_decoder is None: raise RuntimeError( 'No decoder for encoder-decoder model type (-m_de) provided') models.append( IEModel(args.m_decoder, core, decoder_target_device, model_type='Action Recognition Decoder', num_requests=2)) seq_size = models[1].input_shape[1] elif args.architecture_type == 'en-mean': models.append(DummyDecoder(num_requests=2)) seq_size = args.decoder_seq_size elif args.architecture_type == 'i3d-rgb': seq_size = models[0].input_shape[1] presenter = monitors.Presenter(args.utilization_monitors, 70) result_presenter = ResultRenderer( no_show=args.no_show, presenter=presenter, output=args.output, limit=args.output_limit, labels=labels, label_smoothing_window=args.label_smoothing) cap = open_images_capture(args.input, args.loop) run_pipeline(cap, args.architecture_type, models, result_presenter.render_frame, args.raw_output_message, seq_size=seq_size, fps=cap.fps()) for rep in presenter.reportMeans(): log.info(rep)
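# label_smoothing above suppresses flicker in the predicted action label. One common way to
# do that, shown here purely as an illustration (the ResultRenderer internals may differ),
# is a majority vote over a sliding window of recent predictions:
from collections import Counter, deque

class LabelSmootherSketch:
    def __init__(self, window_size=30):
        self.history = deque(maxlen=window_size)

    def update(self, label_id):
        self.history.append(label_id)
        return Counter(self.history).most_common(1)[0][0]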
def main(): metrics = PerformanceMetrics() args = build_argparser().parse_args() log.info('Initializing Inference Engine...') ie = IECore() plugin_config = get_plugin_configs(args.device, args.num_streams, args.num_threads) log.info('Loading network...') model = SegmentationModel(ie, args.model) pipeline = AsyncPipeline(ie, model, plugin_config, device=args.device, max_num_requests=args.num_infer_requests) cap = open_images_capture(args.input, args.loop) next_frame_id = 0 next_frame_id_to_show = 0 log.info('Starting inference...') print( "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key" ) visualizer = Visualizer(args.colors) presenter = None video_writer = cv2.VideoWriter() while True: if pipeline.is_ready(): # Get new image/frame start_time = perf_counter() frame = cap.read() if frame is None: if next_frame_id == 0: raise ValueError("Can't read an image from the input") break if next_frame_id == 0: presenter = monitors.Presenter( args.utilization_monitors, 55, (round(frame.shape[1] / 4), round(frame.shape[0] / 8))) if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), (frame.shape[1], frame.shape[0])): raise RuntimeError("Can't open video writer") # Submit for inference pipeline.submit_data(frame, next_frame_id, { 'frame': frame, 'start_time': start_time }) next_frame_id += 1 else: # Wait for empty request pipeline.await_any() if pipeline.callback_exceptions: raise pipeline.callback_exceptions[0] # Process all completed requests results = pipeline.get_result(next_frame_id_to_show) if results: objects, frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] frame = visualizer.overlay_masks(frame, objects) presenter.drawGraphs(frame) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) if not args.no_show: cv2.imshow('Segmentation Results', frame) key = cv2.waitKey(1) if key == 27 or key == 'q' or key == 'Q': break presenter.handleKey(key) next_frame_id_to_show += 1 pipeline.await_all() # Process completed requests while pipeline.has_completed_request(): results = pipeline.get_result(next_frame_id_to_show) if results: objects, frame_meta = results frame = frame_meta['frame'] start_time = frame_meta['start_time'] frame = visualizer.overlay_masks(frame, objects) presenter.drawGraphs(frame) metrics.update(start_time, frame) if video_writer.isOpened() and ( args.output_limit <= 0 or next_frame_id_to_show <= args.output_limit - 1): video_writer.write(frame) if not args.no_show: cv2.imshow('Segmentation Results', frame) key = cv2.waitKey(1) next_frame_id_to_show += 1 else: break metrics.print_total() print(presenter.reportMeans())
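# visualizer.overlay_masks() above blends the segmentation result into the frame. A rough
# sketch of that idea (an assumption about the Visualizer class): map per-pixel class ids
# through a color palette and alpha-blend the result over the image.
import cv2
import numpy as np

def overlay_class_map(frame, class_map, palette, alpha=0.5):
    colors = np.asarray(palette, dtype=np.uint8)   # (num_classes, 3) BGR entries
    color_mask = colors[class_map]                 # per-pixel colors, same HxW as the frame
    return cv2.addWeighted(frame, 1.0 - alpha, color_mask, alpha, 0.0)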
def main(): log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) args = build_argparser().parse_args() # Plugin initialization for specified device and load extensions library if specified. log.info('Creating Inference Engine...') ie = IECore() if args.cpu_extension and 'CPU' in args.device: ie.add_extension(args.cpu_extension, 'CPU') # Read IR log.info('Loading Mask-RCNN network') mask_rcnn_net = ie.read_network( args.mask_rcnn_model, os.path.splitext(args.mask_rcnn_model)[0] + '.bin') log.info('Loading encoder part of text recognition network') text_enc_net = ie.read_network( args.text_enc_model, os.path.splitext(args.text_enc_model)[0] + '.bin') log.info('Loading decoder part of text recognition network') text_dec_net = ie.read_network( args.text_dec_model, os.path.splitext(args.text_dec_model)[0] + '.bin') model_required_inputs = {'image'} old_model_required_inputs = {'im_data', 'im_info'} if set(mask_rcnn_net.input_info) == model_required_inputs: old_model = False required_output_keys = {'boxes', 'labels', 'masks', 'text_features.0'} n, c, h, w = mask_rcnn_net.input_info['image'].input_data.shape elif set(mask_rcnn_net.input_info) == old_model_required_inputs: old_model = True required_output_keys = { 'boxes', 'scores', 'classes', 'raw_masks', 'text_features' } n, c, h, w = mask_rcnn_net.input_info['im_data'].input_data.shape args.alphabet = ' 0123456789abcdefghijklmnopqrstuvwxyz' args.tr_threshold = 0 else: raise RuntimeError( 'Demo supports only topologies with the following input keys: ' f'{model_required_inputs} or {old_model_required_inputs}.') assert required_output_keys.issubset(mask_rcnn_net.outputs.keys()), \ f'Demo supports only topologies with the following output keys: {required_output_keys}' \ f'Found: {mask_rcnn_net.outputs.keys()}.' assert n == 1, 'Only batch 1 is supported by the demo application' log.info('Loading IR to the plugin...') mask_rcnn_exec_net = ie.load_network(network=mask_rcnn_net, device_name=args.device, num_requests=2) text_enc_exec_net = ie.load_network(network=text_enc_net, device_name=args.device) text_dec_exec_net = ie.load_network(network=text_dec_net, device_name=args.device) hidden_shape = text_dec_net.input_info[ args.trd_input_prev_hidden].input_data.shape del mask_rcnn_net del text_enc_net del text_dec_net input_source = args.input_source if os.path.isdir(input_source): cap = FolderCapture(input_source) else: try: input_source = int(args.input_source) cap = cv2.VideoCapture(input_source) cap.set(cv2.CAP_PROP_BUFFERSIZE, 1) except ValueError: cap = cv2.VideoCapture(input_source) if not cap.isOpened(): raise RuntimeError('Failed to open "{}"'.format(input_source)) ret, frame = cap.read() if not ret: raise RuntimeError("Can't read an image from the input") if args.no_track: tracker = None else: tracker = StaticIOUTracker() visualizer = Visualizer(['__background__', 'text'], show_boxes=args.show_boxes, show_scores=args.show_scores) render_time = 0 presenter = monitors.Presenter(args.utilization_monitors, 45, (frame.shape[1] // 4, frame.shape[0] // 8)) log.info('Starting inference...') print( "To close the application, press 'CTRL+C' here or switch to the output window and press ESC key" ) while ret: if not args.keep_aspect_ratio: # Resize the image to a target size. scale_x = w / frame.shape[1] scale_y = h / frame.shape[0] input_image = cv2.resize(frame, (w, h)) else: # Resize the image to keep the same aspect ratio and to fit it to a window of a target size. 
scale_x = scale_y = min(h / frame.shape[0], w / frame.shape[1]) input_image = cv2.resize(frame, None, fx=scale_x, fy=scale_y) input_image_size = input_image.shape[:2] input_image = np.pad(input_image, ((0, h - input_image_size[0]), (0, w - input_image_size[1]), (0, 0)), mode='constant', constant_values=0) # Change data layout from HWC to CHW. input_image = input_image.transpose((2, 0, 1)) input_image = input_image.reshape((n, c, h, w)).astype(np.float32) input_image_info = np.asarray( [[input_image_size[0], input_image_size[1], 1]], dtype=np.float32) # Run the net. inf_start = time.time() if old_model: outputs = mask_rcnn_exec_net.infer({ 'im_data': input_image, 'im_info': input_image_info }) else: outputs = mask_rcnn_exec_net.infer({'image': input_image}) # Parse detection results of the current request if old_model: boxes = outputs['boxes'] scores = outputs['scores'] classes = outputs['classes'].astype(np.uint32) raw_masks = outputs['raw_masks'] text_features = outputs['text_features'] else: boxes = outputs['boxes'][:, :4] scores = outputs['boxes'][:, 4] classes = outputs['labels'].astype(np.uint32) raw_masks = outputs['masks'] text_features = outputs['text_features.0'] # Filter out detections with low confidence. detections_filter = scores > args.prob_threshold scores = scores[detections_filter] classes = classes[detections_filter] boxes = boxes[detections_filter] raw_masks = raw_masks[detections_filter] text_features = text_features[detections_filter] boxes[:, 0::2] /= scale_x boxes[:, 1::2] /= scale_y masks = [] for box, cls, raw_mask in zip(boxes, classes, raw_masks): if old_model: raw_mask = raw_mask[cls, ...] mask = segm_postprocess(box, raw_mask, frame.shape[0], frame.shape[1]) masks.append(mask) texts = [] for feature in text_features: feature = text_enc_exec_net.infer({'input': feature})['output'] feature = np.reshape(feature, (feature.shape[0], feature.shape[1], -1)) feature = np.transpose(feature, (0, 2, 1)) hidden = np.zeros(hidden_shape) prev_symbol_index = np.ones((1, )) * SOS_INDEX text = '' text_confidence = 1.0 for i in range(MAX_SEQ_LEN): decoder_output = text_dec_exec_net.infer({ args.trd_input_prev_symbol: prev_symbol_index, args.trd_input_prev_hidden: hidden, args.trd_input_encoder_outputs: feature }) symbols_distr = decoder_output[args.trd_output_symbols_distr] symbols_distr_softmaxed = softmax(symbols_distr, axis=1)[0] prev_symbol_index = int(np.argmax(symbols_distr, axis=1)) text_confidence *= symbols_distr_softmaxed[prev_symbol_index] if prev_symbol_index == EOS_INDEX: break text += args.alphabet[prev_symbol_index] hidden = decoder_output[args.trd_output_cur_hidden] texts.append(text if text_confidence >= args.tr_threshold else '') inf_end = time.time() inf_time = inf_end - inf_start render_start = time.time() if len(boxes) and args.raw_output_message: log.info('Detected boxes:') log.info( ' Class ID | Confidence | XMIN | YMIN | XMAX | YMAX ' ) for box, cls, score, mask in zip(boxes, classes, scores, masks): log.info( '{:>10} | {:>10f} | {:>8.2f} | {:>8.2f} | {:>8.2f} | {:>8.2f} ' .format(cls, score, *box)) # Get instance track IDs. masks_tracks_ids = None if tracker is not None: masks_tracks_ids = tracker(masks, classes) presenter.drawGraphs(frame) # Visualize masks. frame = visualizer(frame, boxes, classes, scores, masks, texts, masks_tracks_ids) # Draw performance stats. 
inf_time_message = 'Inference and post-processing time: {:.3f} ms'.format( inf_time * 1000) render_time_message = 'OpenCV rendering time: {:.3f} ms'.format( render_time * 1000) cv2.putText(frame, inf_time_message, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1) cv2.putText(frame, render_time_message, (15, 30), cv2.FONT_HERSHEY_COMPLEX, 0.5, (10, 10, 200), 1) # Print performance counters. if args.perf_counts: perf_counts = mask_rcnn_exec_net.requests[0].get_perf_counts() log.info('Performance counters:') print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format( 'name', 'layer_type', 'exec_type', 'status', 'real_time, us')) for layer, stats in perf_counts.items(): print('{:<70} {:<15} {:<15} {:<15} {:<10}'.format( layer, stats['layer_type'], stats['exec_type'], stats['status'], stats['real_time'])) if not args.no_show: # Show resulting image. cv2.imshow('Results', frame) render_end = time.time() render_time = render_end - render_start if not args.no_show: key = cv2.waitKey(args.delay) esc_code = 27 if key == esc_code: break presenter.handleKey(key) ret, frame = cap.read() print(presenter.reportMeans()) cv2.destroyAllWindows() cap.release()
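# The decoder loop above calls softmax(symbols_distr, axis=1) to turn logits into symbol
# probabilities. The demo presumably imports an equivalent of this numerically stable version:
import numpy as np

def softmax(x, axis=None):
    shifted = x - np.max(x, axis=axis, keepdims=True)  # subtract the max to avoid overflow in exp
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)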
def run_demo(args): cap = open_images_capture(args.input, args.loop) log.info('OpenVINO Inference Engine') log.info('\tbuild: {}'.format(get_version())) ie = IECore() log.info('Reading Object Detection model {}'.format(args.model_od)) detector_person = Detector(ie, args.model_od, device=args.device, label_class=args.person_label) log.info('The Object Detection model {} is loaded to {}'.format( args.model_od, args.device)) log.info('Reading Human Pose Estimation model {}'.format(args.model_hpe)) single_human_pose_estimator = HumanPoseEstimator(ie, args.model_hpe, device=args.device) log.info('The Human Pose Estimation model {} is loaded to {}'.format( args.model_hpe, args.device)) delay = int(cap.get_type() in ('VIDEO', 'CAMERA')) video_writer = cv2.VideoWriter() frames_processed = 0 presenter = monitors.Presenter(args.utilization_monitors, 25) metrics = PerformanceMetrics() start_time = perf_counter() frame = cap.read() if frame is None: raise RuntimeError("Can't read an image from the input") if args.output and not video_writer.open( args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(), (frame.shape[1], frame.shape[0])): raise RuntimeError("Can't open video writer") while frame is not None: bboxes = detector_person.detect(frame) human_poses = [ single_human_pose_estimator.estimate(frame, bbox) for bbox in bboxes ] presenter.drawGraphs(frame) colors = [(0, 0, 255), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0), (255, 0, 0), (0, 255, 0)] for pose, bbox in zip(human_poses, bboxes): cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[0] + bbox[2], bbox[1] + bbox[3]), (255, 0, 0), 2) for id_kpt, kpt in enumerate(pose): cv2.circle(frame, (int(kpt[0]), int(kpt[1])), 3, colors[id_kpt], -1) metrics.update(start_time, frame) frames_processed += 1 if video_writer.isOpened() and (args.output_limit <= 0 or frames_processed <= args.output_limit): video_writer.write(frame) if not args.no_show: cv2.imshow('Human Pose Estimation Demo', frame) key = cv2.waitKey(delay) if key == 27: break presenter.handleKey(key) start_time = perf_counter() frame = cap.read() metrics.log_total() for rep in presenter.reportMeans(): log.info(rep)