def main():
    args = build_argparser().parse_args()

    cap = open_images_capture(args.input, args.loop)

    # Plugin initialization for the specified device and load extensions library if specified
    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # Read IR
    log.info('Reading Proposal model {}'.format(args.model_pnet))
    p_net = core.read_model(args.model_pnet)
    if len(p_net.inputs) != 1:
        raise RuntimeError("Pnet supports only single input topologies")
    if len(p_net.outputs) != 2:
        raise RuntimeError("Pnet supports only topologies with two outputs")

    log.info('Reading Refine model {}'.format(args.model_rnet))
    r_net = core.read_model(args.model_rnet)
    if len(r_net.inputs) != 1:
        raise RuntimeError("Rnet supports only single input topologies")
    if len(r_net.outputs) != 2:
        raise RuntimeError("Rnet supports only topologies with two outputs")

    log.info('Reading Output model {}'.format(args.model_onet))
    o_net = core.read_model(args.model_onet)
    if len(o_net.inputs) != 1:
        raise RuntimeError("Onet supports only single input topologies")
    if len(o_net.outputs) != 3:
        raise RuntimeError("Onet supports only topologies with three outputs")

    pnet_input_tensor_name = p_net.inputs[0].get_any_name()
    rnet_input_tensor_name = r_net.inputs[0].get_any_name()
    onet_input_tensor_name = o_net.inputs[0].get_any_name()

    # Identify outputs by channel count: 2 = classification scores,
    # 4 = bounding-box regression, 10 = facial landmarks (Onet only).
    for node in p_net.outputs:
        if node.shape[1] == 2:
            pnet_cls_name = node.get_any_name()
        elif node.shape[1] == 4:
            pnet_roi_name = node.get_any_name()
        else:
            raise RuntimeError("Unsupported output layer for Pnet")

    for node in r_net.outputs:
        if node.shape[1] == 2:
            rnet_cls_name = node.get_any_name()
        elif node.shape[1] == 4:
            rnet_roi_name = node.get_any_name()
        else:
            raise RuntimeError("Unsupported output layer for Rnet")

    for node in o_net.outputs:
        if node.shape[1] == 2:
            onet_cls_name = node.get_any_name()
        elif node.shape[1] == 4:
            onet_roi_name = node.get_any_name()
        elif node.shape[1] == 10:
            onet_pts_name = node.get_any_name()
        else:
            raise RuntimeError("Unsupported output layer for Onet")

    next_frame_id = 0

    metrics = PerformanceMetrics()
    presenter = None
    video_writer = cv2.VideoWriter()
    is_loaded_before = False

    while True:
        start_time = perf_counter()
        origin_image = cap.read()
        if origin_image is None:
            if next_frame_id == 0:
                raise ValueError("Can't read an image from the input")
            break
        if next_frame_id == 0:
            presenter = monitors.Presenter(args.utilization_monitors, 55,
                                           (round(origin_image.shape[1] / 4), round(origin_image.shape[0] / 8)))
            if args.output and not video_writer.open(args.output, cv2.VideoWriter_fourcc(*'MJPG'), cap.fps(),
                                                     (origin_image.shape[1], origin_image.shape[0])):
                raise RuntimeError("Can't open video writer")
        next_frame_id += 1

        rgb_image = cv2.cvtColor(origin_image, cv2.COLOR_BGR2RGB)
        oh, ow, _ = rgb_image.shape

        scales = utils.calculate_scales(rgb_image)

        # *************************************
        # Pnet stage
        # *************************************

        pnet_res = []
        for i, scale in enumerate(scales):
            hs = int(oh * scale)
            ws = int(ow * scale)
            image = preprocess_image(rgb_image, ws, hs)

            # Change width and height of the input blob to match this pyramid level
            p_net.reshape({pnet_input_tensor_name: PartialShape([1, 3, ws, hs])})
            compiled_pnet = core.compile_model(p_net, args.device)
            infer_request_pnet = compiled_pnet.create_infer_request()
            if i == 0 and not is_loaded_before:
                log.info("The Proposal model {} is loaded to {}".format(args.model_pnet, args.device))

            infer_request_pnet.infer(inputs={pnet_input_tensor_name: image})
            p_res = {name: infer_request_pnet.get_tensor(name).data[:] for name in {pnet_roi_name, pnet_cls_name}}
            pnet_res.append(p_res)
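
        # Each entry of pnet_res maps an output tensor name to a numpy array:
        # the classification output has shape [1, 2, H, W] (non-face/face
        # scores per sliding-window position) and the regression output has
        # shape [1, 4, H, W] (bounding-box refinement offsets). Candidates
        # found at each pyramid level are mapped back to original-image
        # coordinates below through the 1/scale factor.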
        image_num = len(scales)

        rectangles = []
        for i in range(image_num):
            roi = pnet_res[i][pnet_roi_name]
            cls = pnet_res[i][pnet_cls_name]
            _, _, out_h, out_w = cls.shape
            out_side = max(out_h, out_w)
            rectangle = utils.detect_face_12net(cls[0][1], roi[0], out_side, 1 / scales[i], ow, oh,
                                                score_threshold[0], iou_threshold[0])
            rectangles.extend(rectangle)
        rectangles = utils.NMS(rectangles, iou_threshold[1], 'iou')

        # Rnet stage
        if len(rectangles) > 0:
            # Change batch size of the input blob to the number of candidates
            r_net.reshape({rnet_input_tensor_name: PartialShape([len(rectangles), 3, 24, 24])})
            compiled_rnet = core.compile_model(r_net, args.device)
            infer_request_rnet = compiled_rnet.create_infer_request()
            if not is_loaded_before:
                log.info("The Refine model {} is loaded to {}".format(args.model_rnet, args.device))

            rnet_input = []
            for rectangle in rectangles:
                crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]), int(rectangle[0]):int(rectangle[2])]
                crop_img = preprocess_image(crop_img, 24, 24)
                rnet_input.extend(crop_img)

            infer_request_rnet.infer(inputs={rnet_input_tensor_name: rnet_input})
            rnet_res = {name: infer_request_rnet.get_tensor(name).data[:] for name in {rnet_roi_name, rnet_cls_name}}

            roi = rnet_res[rnet_roi_name]
            cls = rnet_res[rnet_cls_name]
            rectangles = utils.filter_face_24net(cls, roi, rectangles, ow, oh,
                                                 score_threshold[1], iou_threshold[2])

        # Onet stage
        if len(rectangles) > 0:
            # Change batch size of the input blob to the number of candidates
            o_net.reshape({onet_input_tensor_name: PartialShape([len(rectangles), 3, 48, 48])})
            compiled_onet = core.compile_model(o_net, args.device)
            infer_request_onet = compiled_onet.create_infer_request()
            if not is_loaded_before:
                log.info("The Output model {} is loaded to {}".format(args.model_onet, args.device))
            is_loaded_before = True

            onet_input = []
            for rectangle in rectangles:
                crop_img = rgb_image[int(rectangle[1]):int(rectangle[3]), int(rectangle[0]):int(rectangle[2])]
                crop_img = preprocess_image(crop_img, 48, 48)
                onet_input.extend(crop_img)

            infer_request_onet.infer(inputs={onet_input_tensor_name: onet_input})
            onet_res = {name: infer_request_onet.get_tensor(name).data[:]
                        for name in {onet_roi_name, onet_cls_name, onet_pts_name}}

            roi = onet_res[onet_roi_name]
            cls = onet_res[onet_cls_name]
            pts = onet_res[onet_pts_name]
            rectangles = utils.filter_face_48net(cls, roi, pts, rectangles, ow, oh,
                                                 score_threshold[2], iou_threshold[3])

        # Display results
        for rectangle in rectangles:
            # Draw detected boxes
            cv2.putText(origin_image, 'confidence: {:.2f}'.format(rectangle[4]),
                        (int(rectangle[0]), int(rectangle[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0))
            cv2.rectangle(origin_image, (int(rectangle[0]), int(rectangle[1])),
                          (int(rectangle[2]), int(rectangle[3])), (255, 0, 0), 1)
            # Draw landmarks
            for i in range(5, 15, 2):
                cv2.circle(origin_image, (int(rectangle[i + 0]), int(rectangle[i + 1])), 2, (0, 255, 0))

        metrics.update(start_time, origin_image)

        if video_writer.isOpened() and (args.output_limit <= 0 or next_frame_id <= args.output_limit):
            video_writer.write(origin_image)

        if not args.no_show:
            cv2.imshow('MTCNN Results', origin_image)
            key = cv2.waitKey(1)
            if key in {ord('q'), ord('Q'), 27}:
                break
            presenter.handleKey(key)

    metrics.log_total()
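

# preprocess_image is defined elsewhere in the demo. The sketch below shows
# what it is assumed to do for these Caffe-derived MTCNN models: resize to the
# requested size, apply the conventional MTCNN normalization
# (x - 127.5) * 0.0078125, and emit a [1, C, W, H] blob matching the transposed
# width/height input shapes used in the reshape() calls above. The helper name
# _preprocess_image_sketch and the exact constants are illustrative
# assumptions, not the demo's verified implementation.
def _preprocess_image_sketch(image, width, height):
    import numpy as np  # local import to keep the sketch self-contained

    resized = cv2.resize(image, (width, height))
    normalized = (resized.astype(np.float32) - 127.5) * 0.0078125
    blob = normalized.transpose((2, 1, 0))  # HWC -> CWH
    return np.expand_dims(blob, axis=0)     # add the batch dimension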
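

# utils.calculate_scales builds the image pyramid consumed by the Pnet stage.
# Below is a minimal sketch of a conventional MTCNN pyramid: the 0.709 scaling
# factor and the 12x12 Pnet window are standard MTCNN values, and the
# min_face_size parameter is an illustrative assumption, so the demo's actual
# utils implementation may differ in details.
def _calculate_scales_sketch(image, factor=0.709, min_face_size=20):
    h, w = image.shape[:2]
    # Largest scale first: shrink so the smallest face of interest maps onto
    # Pnet's 12x12 window, then keep multiplying by `factor` until the shorter
    # image side falls below that window.
    scale = 12.0 / min_face_size
    min_side = min(h, w) * scale
    scales = []
    while min_side >= 12:
        scales.append(scale)
        scale *= factor
        min_side *= factor
    return scales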
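

# utils.NMS(rectangles, threshold, 'iou') prunes overlapping candidates between
# the stages. The sketch below is a plain greedy IoU-based NMS, assuming each
# rectangle is laid out as [x1, y1, x2, y2, score, ...] (the layout implied by
# the drawing code above); the demo's actual utils implementation may differ.
def _nms_sketch(rectangles, threshold):
    import numpy as np  # local import to keep the sketch self-contained

    if not rectangles:
        return []
    boxes = np.array([r[:5] for r in rectangles], dtype=np.float32)
    order = boxes[:, 4].argsort()[::-1]  # indices sorted by descending score
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # Intersection of the highest-scoring box with the remaining ones
        x1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
        y1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
        x2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
        y2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
        inter = np.maximum(0.0, x2 - x1) * np.maximum(0.0, y2 - y1)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        areas = (boxes[order[1:], 2] - boxes[order[1:], 0]) * (boxes[order[1:], 3] - boxes[order[1:], 1])
        iou = inter / (area_i + areas - inter)
        order = order[1:][iou <= threshold]  # drop boxes that overlap too much
    return [rectangles[i] for i in keep]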