def main():
    args = build_argparser().parse_args()

    # Plugin initialization
    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()
    if 'GPU' in args.device:
        core.set_property("GPU", {"GPU_ENABLE_LOOP_UNROLLING": "NO", "CACHE_DIR": "./"})

    # Read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    if len(model.inputs) != 1:
        raise RuntimeError("Demo supports only single input topologies")
    input_tensor_name = model.inputs[0].get_any_name()

    if args.output_blob is not None:
        output_tensor_name = args.output_blob
    else:
        if len(model.outputs) != 1:
            raise RuntimeError("Demo supports only single output topologies")
        output_tensor_name = model.outputs[0].get_any_name()

    characters = get_characters(args)
    codec = CTCCodec(characters, args.designated_characters, args.top_k)
    if len(codec.characters) != model.output(output_tensor_name).shape[2]:
        raise RuntimeError("The text recognition model does not correspond to decoding character list")

    input_batch_size, input_channel, input_height, input_width = model.inputs[0].shape

    # Read and pre-process input image (NOTE: one image only)
    preprocessing_start_time = perf_counter()
    input_image = preprocess_input(args.input, height=input_height, width=input_width)[None, :, :, :]
    preprocessing_total_time = perf_counter() - preprocessing_start_time
    if input_batch_size != input_image.shape[0]:
        raise RuntimeError("The model's input batch size should equal the input image's batch size")
    if input_channel != input_image.shape[1]:
        raise RuntimeError("The model's input channel should equal the input image's channel")

    # Loading model to the plugin
    compiled_model = core.compile_model(model, args.device)
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    # Start sync inference
    start_time = perf_counter()
    for _ in range(args.number_iter):
        infer_request.infer(inputs={input_tensor_name: input_image})
        preds = infer_request.get_tensor(output_tensor_name).data[:]
        result = codec.decode(preds)
        print(result)
    total_latency = ((perf_counter() - start_time) / args.number_iter + preprocessing_total_time) * 1e3
    log.info("Metrics report:")
    log.info("\tLatency: {:.1f} ms".format(total_latency))

    sys.exit()
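# --- Illustrative sketch (not part of the demo) ---
# CTCCodec.decode above performs greedy (best-path) CTC decoding; the codec
# itself is defined elsewhere. A minimal greedy decoder could look like the
# sketch below. It assumes preds has shape (W, B, C) -- time steps, batch,
# character classes -- and that the CTC blank label sits at index 0; both are
# assumptions about this model's output layout, not guarantees.
def greedy_ctc_decode_sketch(preds, characters, blank=0):
    best = np.argmax(preds, axis=2)  # (W, B): best class per time step
    results = []
    for b in range(best.shape[1]):
        prev = blank
        chars = []
        for t in best[:, b]:
            # collapse repeated labels first, then drop blanks
            if t != prev and t != blank:
                chars.append(characters[t])
            prev = t
        results.append(''.join(chars))
    return results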
def main():
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    # Plugin initialization
    ie = IECore()

    # Read IR
    #log.info("Loading network")
    model = 'handwritten-japanese-recognition-0001'
    model = '/Users/imamura/model_2020.2/intel/' + model + '/FP32/' + model
    net = ie.read_network(model + '.xml', model + '.bin')
    #net = ie.read_network(args.model, os.path.splitext(args.model)[0] + ".bin")

    assert len(net.input_info) == 1, "Demo supports only single input topologies"
    assert len(net.outputs) == 1, "Demo supports only single output topologies"

    #log.info("Preparing input/output blobs")
    input_blob = next(iter(net.input_info))
    out_blob = next(iter(net.outputs))

    characters = get_characters(args)
    codec = CTCCodec(characters, args.designated_characters, args.top_k)
    assert len(codec.characters) == net.outputs[out_blob].shape[2], \
        "The text recognition model does not correspond to decoding character list"

    input_batch_size, input_channel, input_height, input_width = net.input_info[input_blob].input_data.shape

    # Read and pre-process input image (NOTE: one image only)
    input_image = preprocess_input(args.input, height=input_height, width=input_width)[None, :, :, :]
    assert input_batch_size == input_image.shape[0], \
        "The net's input batch size should equal the input image's batch size"
    assert input_channel == input_image.shape[1], \
        "The net's input channel should equal the input image's channel"

    # Loading model to the plugin
    #log.info("Loading model to the plugin")
    exec_net = ie.load_network(network=net, device_name=args.device)

    # Start sync inference
    #log.info("Starting inference ({} iterations)".format(args.number_iter))
    infer_time = []
    for i in range(args.number_iter):
        t0 = time.time()
        preds = exec_net.infer(inputs={input_blob: input_image})
        preds = preds[out_blob]
        result = codec.decode(preds)
        print(', '.join(map(str, result)))
        infer_time.append((time.time() - t0) * 1000)
    #log.info("Average throughput: {} ms".format(np.average(np.asarray(infer_time))))

    sys.exit()
def main():
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()
    model_xml = args.model
    model_bin = os.path.splitext(model_xml)[0] + ".bin"

    # Plugin initialization
    ie = IECore()

    # Read IR
    log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
    net = IENetwork(model=model_xml, weights=model_bin)

    assert len(net.inputs) == 1, "Demo supports only single input topologies"
    assert len(net.outputs) == 1, "Demo supports only single output topologies"

    log.info("Preparing input/output blobs")
    input_blob = next(iter(net.inputs))
    out_blob = next(iter(net.outputs))

    characters = get_characters(args)
    codec = CTCCodec(characters)
    assert len(codec.characters) == net.outputs[out_blob].shape[2], \
        "The text recognition model does not correspond to decoding character list"

    input_batch_size, input_channel, input_height, input_width = net.inputs[input_blob].shape

    # Read and pre-process input image (NOTE: one image only)
    input_image = preprocess_input(args.input, height=input_height, width=input_width)[None, :, :, :]
    assert input_batch_size == input_image.shape[0], \
        "The net's input batch size should equal the input image's batch size"
    assert input_channel == input_image.shape[1], \
        "The net's input channel should equal the input image's channel"

    # Loading model to the plugin
    log.info("Loading model to the plugin")
    exec_net = ie.load_network(network=net, device_name=args.device)

    # Start sync inference
    log.info("Starting inference ({} iterations)".format(args.number_iter))
    infer_time = []
    for i in range(args.number_iter):
        t0 = time.time()
        preds = exec_net.infer(inputs={input_blob: input_image})
        preds = preds[out_blob]
        result = codec.decode(preds)
        print(result)
        infer_time.append((time.time() - t0) * 1000)
    # Note: this is a per-iteration latency, not a throughput
    log.info("Average latency: {} ms".format(np.average(np.asarray(infer_time))))

    sys.exit()
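# --- Illustrative sketch (not part of the demo) ---
# preprocess_input is a helper defined outside this section. The usual recipe
# for this model family is: load the image as grayscale, resize it to the
# model's input height while preserving aspect ratio, then pad on the right
# up to the input width. Everything below (padding value, interpolation mode,
# the (1, H, W) return layout) is an assumption, not the demo's actual code.
def preprocess_input_sketch(image_path, height, width):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    scale = height / img.shape[0]
    new_w = min(width, int(img.shape[1] * scale))
    img = cv2.resize(img, (new_w, height), interpolation=cv2.INTER_AREA)
    canvas = np.full((height, width), 255, dtype=np.float32)  # white padding
    canvas[:, :new_w] = img
    return canvas[None, :, :]  # (1, H, W); caller adds the batch axis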
def main():
    _H = 0
    _W = 1
    _C = 2
    global g_canvas
    global g_threshold
    global g_UIState
    global g_recogFlag
    global g_clickedFlag

    # Plugin initialization
    core = Core()
    model_root = '.'

    # text-detection-0003  in: (1,768,1280,3)  out: model/link_logits_/add (1,192,320,16), model/segm_logits/add (1,192,320,2)
    model = 'text-detection-0003'
    model = os.path.join(model_root, 'intel', model, 'FP16', model)
    net_td = core.read_model(model + '.xml')
    ppp = PrePostProcessor(net_td)
    ppp.input().tensor().set_element_type(Type.u8).set_layout(Layout('NHWC'))
    ppp.input().preprocess().resize(ResizeAlgorithm.RESIZE_LINEAR)
    net_td = ppp.build()
    compiled_model_td = core.compile_model(net_td, 'CPU')
    ireq_td = compiled_model_td.create_infer_request()

    # handwritten-japanese-recognition
    model = 'handwritten-japanese-recognition-0001'
    model = os.path.join(model_root, 'intel', model, 'FP16', model)
    net = core.read_model(model + '.xml')
    input_batch_size, input_channel, input_height, input_width = list(net.input(0).get_shape())
    compiled_model = core.compile_model(net, 'CPU')
    ireq = compiled_model.create_infer_request()

    characters = get_characters('data/kondate_nakayosi_char_list.txt')
    codec = CTCCodec(characters)

    clearCanvas()
    cv2.namedWindow('canvas')
    cv2.setMouseCallback('canvas', onMouse)
    cv2.createTrackbar('Link Threshold', 'canvas', 50, 100, onTrackbarLnk)
    cv2.createTrackbar('Classification Threshold', 'canvas', 15, 100, onTrackbarCls)

    while True:
        g_UIState = 0
        while g_recogFlag == False:
            key = cv2.waitKey(100)
            dispCanvas()
            if key == 27:
                return
            if key == ord(' '):
                break
        cv2.waitKey(1)
        g_recogFlag = False
        g_UIState = 1

        print('text detection')
        tensor = np.expand_dims(g_canvas, 0)
        res_td = ireq_td.infer({0: tensor})
        # Either of the following ways of accessing the inference result works.
        link = ireq_td.get_tensor('model/link_logits_/add:0').data    # 'model/link_logits_/add' 1,192,320,16
        segm = ireq_td.get_tensor('model/segm_logits/add:0').data     # 'model/segm_logits/add'  1,192,320,2
        #link = ireq_td.get_tensor(compiled_model_td.output(1)).data  # 'model/link_logits_/add' 1,192,320,16
        #segm = ireq_td.get_tensor(compiled_model_td.output(0)).data  # 'model/segm_logits/add'  1,192,320,2
        rects = postprocess(link, segm, _canvas_x, _canvas_y, g_lnk_th / 100., g_cls_th / 100.)

        canvas2 = g_canvas.copy()
        for i, rect_ in enumerate(rects):
            rect = ((rect_[0], rect_[1]), (rect_[2], rect_[3]), rect_[4])
            box = cv2.boxPoints(rect).astype(np.int32)
            cv2.polylines(canvas2, [box], True, (255, 0, 0), 4)
            most_left_idx, most_left = topLeftPoint(box)
            crop = cropRotatedImage(g_canvas, box, most_left_idx)
            input_image = preprocess_input(crop, input_height, input_width)[None, :, :, :]
            res = ireq.infer({0: input_image})
            preds = ireq.get_tensor(compiled_model.output(0)).data
            result = codec.decode(preds)
            print('OCR result ({}): {}'.format(i, result))
            canvas2 = putJapaneseText(canvas2, most_left[0], most_left[1], result[0])
            cv2.imshow('canvas', canvas2)
            cv2.waitKey(1)

        cv2.putText(canvas2, 'Hit any key, tap screen or click L-button to continue', (0, 40),
                    cv2.FONT_HERSHEY_PLAIN, 2, (0, 0, 0), 2)
        cv2.imshow('canvas', canvas2)
        g_clickedFlag = False
        key = -1
        while g_clickedFlag == False and key == -1:
            key = cv2.waitKey(100)
    return
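# --- Illustrative sketch (not part of the demo) ---
# topLeftPoint and cropRotatedImage are helpers defined elsewhere. Rectifying
# a rotated text box usually comes down to a single perspective transform, as
# sketched below. The corner ordering produced by cv2.boxPoints and the exact
# meaning of top_left_idx are assumptions here; the real helper may differ.
def crop_rotated_image_sketch(image, box, top_left_idx):
    pts = np.roll(box, -top_left_idx, axis=0).astype(np.float32)
    w = int(np.linalg.norm(pts[0] - pts[1]))   # top edge length
    h = int(np.linalg.norm(pts[1] - pts[2]))   # side edge length
    dst = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
    m = cv2.getPerspectiveTransform(pts, dst)
    return cv2.warpPerspective(image, m, (w, h))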
def main():
    args = build_argparser().parse_args()

    # Plugin initialization
    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()
    ie.set_config(config={"GPU_ENABLE_LOOP_UNROLLING": "NO", "CACHE_DIR": "./"}, device_name="GPU")

    # Read IR
    log.info('Reading model {}'.format(args.model))
    net = ie.read_network(args.model, os.path.splitext(args.model)[0] + ".bin")

    assert len(net.input_info) == 1, "Demo supports only single input topologies"
    input_blob = next(iter(net.input_info))

    if args.output_blob is not None:
        out_blob = args.output_blob
    else:
        assert len(net.outputs) == 1, "Demo supports only single output topologies"
        out_blob = next(iter(net.outputs))

    characters = get_characters(args)
    codec = CTCCodec(characters, args.designated_characters, args.top_k)
    assert len(codec.characters) == net.outputs[out_blob].shape[2], \
        "The text recognition model does not correspond to decoding character list"

    input_batch_size, input_channel, input_height, input_width = net.input_info[input_blob].input_data.shape

    # Read and pre-process input image (NOTE: one image only)
    preprocessing_start_time = perf_counter()
    input_image = preprocess_input(args.input, height=input_height, width=input_width)[None, :, :, :]
    preprocessing_total_time = perf_counter() - preprocessing_start_time
    assert input_batch_size == input_image.shape[0], \
        "The net's input batch size should equal the input image's batch size"
    assert input_channel == input_image.shape[1], \
        "The net's input channel should equal the input image's channel"

    # Loading model to the plugin
    exec_net = ie.load_network(network=net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    # Start sync inference
    start_time = perf_counter()
    for i in range(args.number_iter):
        preds = exec_net.infer(inputs={input_blob: input_image})
        preds = preds[out_blob]
        result = codec.decode(preds)
        print(result)
    total_latency = ((perf_counter() - start_time) / args.number_iter + preprocessing_total_time) * 1e3
    log.info("Metrics report:")
    log.info("\tLatency: {:.1f} ms".format(total_latency))

    sys.exit()
def main():
    _H = 0
    _W = 1
    _C = 2
    global g_canvas
    global g_threshold
    global g_UIState
    global g_recogFlag
    global g_clickedFlag

    # Plugin initialization
    ie = IECore()

    # text-detection-0003  in: (1,3,768,1280)  out: model/link_logits_/add (1,16,192,320), model/segm_logits/add (1,2,192,320)
    model = 'text-detection-0003'
    model = './intel/' + model + '/FP16/' + model
    net_td = ie.read_network(model + '.xml', model + '.bin')
    input_blob_td = next(iter(net_td.inputs))
    out_blob_td = next(iter(net_td.outputs))
    exec_net_td = ie.load_network(net_td, 'CPU')

    # handwritten-japanese-recognition
    model = 'handwritten-japanese-recognition-0001'
    model = './intel/' + model + '/FP16/' + model
    net = ie.read_network(model + '.xml', model + '.bin')
    input_blob = next(iter(net.inputs))
    out_blob = next(iter(net.outputs))
    input_batch_size, input_channel, input_height, input_width = net.inputs[input_blob].shape
    exec_net = ie.load_network(net, 'CPU')

    characters = get_characters('data/kondate_nakayosi_char_list.txt')
    codec = CTCCodec(characters)

    clearCanvas()
    cv2.namedWindow('canvas')
    cv2.setMouseCallback('canvas', onMouse)
    cv2.createTrackbar('Threshold', 'canvas', 50, 100, onTrackbar)

    while True:
        g_UIState = 0
        while g_recogFlag == False:
            dispCanvas()
            key = cv2.waitKey(100)
            if key == 27:
                return
            if key == ord(' '):
                break
        g_recogFlag = False
        g_UIState = 1

        print('text detection')
        img = cv2.resize(g_canvas, (_canvas_x, _canvas_y))
        img = img.transpose((_C, _H, _W))
        img = img.reshape((1, 3, _canvas_y, _canvas_x))
        res_td = exec_net_td.infer(inputs={input_blob_td: img})
        link = res_td['model/link_logits_/add']  # 1,16,192,320
        segm = res_td['model/segm_logits/add']   # 1, 2,192,320
        rects = text_detection_postprocess(link, segm, (_canvas_x, _canvas_y), g_threshold / 100., g_threshold / 100.)
        print('text detection - completed')

        canvas2 = g_canvas.copy()
        for i, rect in enumerate(rects):
            box = cv2.boxPoints(rect).astype(np.int32)
            cv2.polylines(canvas2, [box], True, (255, 0, 0), 4)
            most_left_idx, most_left = topLeftPoint(box)
            crop = cropRotatedImage(g_canvas, box, most_left_idx)
            input_image = preprocess_input(crop, input_height, input_width)[None, :, :, :]
            preds = exec_net.infer(inputs={input_blob: input_image})
            preds = preds[out_blob]
            result = codec.decode(preds)
            print('OCR result ({}): {}'.format(i, result))
            canvas2 = putJapaneseText(canvas2, most_left[0], most_left[1], result[0])
            cv2.imshow('canvas', canvas2)
            cv2.waitKey(1)

        cv2.putText(canvas2, 'Hit any key, tap screen or click L-button to continue', (0, 40),
                    cv2.FONT_HERSHEY_PLAIN, 2, (0, 0, 0), 2)
        cv2.imshow('canvas', canvas2)
        g_clickedFlag = False
        key = -1
        while g_clickedFlag == False and key == -1:
            key = cv2.waitKey(100)
    return