def _list_input_images(input_path):
    """Return the list of image paths to process for ``input_path``.

    If ``input_path`` is a directory, every entry in it is returned (joined
    with the directory and sorted); otherwise the path itself is returned as
    a single-element list.
    """
    if os.path.isdir(input_path):
        return sorted(os.path.join(input_path, name) for name in os.listdir(input_path))
    return [input_path]


def _run_decoder(exec_net_decoder, enc_res, args):
    """Run the decoder step by step and return the list of per-step logits.

    The decoder state is seeded from the encoder outputs in ``enc_res`` and the
    first target token is ``START_TOKEN``. Decoding stops after
    ``args.max_formula_len`` steps or as soon as ``END_TOKEN`` is predicted
    (the logit of the terminating step is still included in the result).
    """
    row_enc_out = enc_res[args.row_enc_out_layer]
    dec_states_h = enc_res[args.hidden_layer]
    dec_states_c = enc_res[args.context_layer]
    output = enc_res[args.init_0_layer]

    tgt = np.array([[START_TOKEN]])
    logits = []
    for _ in range(args.max_formula_len):
        dec_res = exec_net_decoder.infer(inputs={
            args.row_enc_out_layer: row_enc_out,
            args.dec_st_c_layer: dec_states_c,
            args.dec_st_h_layer: dec_states_h,
            args.output_prev_layer: output,
            args.tgt_layer: tgt,
        })

        # Feed this step's states and output back in as the next step's inputs.
        dec_states_h = dec_res[args.dec_st_h_t_layer]
        dec_states_c = dec_res[args.dec_st_c_t_layer]
        output = dec_res[args.output_layer]
        logit = dec_res[args.logit_layer]
        logits.append(logit)

        # The most likely token becomes the next target input.
        tgt = np.array([[np.argmax(logit, axis=1)]])
        if tgt[0][0][0] == END_TOKEN:
            break
    return logits


def main():
    """Run the formula recognition demo end to end.

    Parses the command-line arguments, reads the encoder and decoder networks,
    preprocesses every input image, runs encoder + step-wise decoder inference
    for each image, and either appends the recognized formula to
    ``args.output_file`` or prints it to stdout.

    Raises:
        ValueError: if the encoder input does not have batch size 1 or does
            not have 1 or 3 channels.
        RuntimeError: if an input image cannot be read.
    """
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    log.info("Creating Inference Engine")
    ie = IECore()
    ie.set_config({"PERF_COUNT": "YES" if args.perf_counts else "NO"}, args.device)
    encoder = read_net(args.m_encoder, ie, args.device)
    dec_step = read_net(args.m_decoder, ie, args.device)

    # Look the image input up by the same name that is used to feed it during
    # inference, so a non-default ``imgs_layer`` works here as well.
    batch_dim, channels, height, width = encoder.input_info[args.imgs_layer].input_data.shape
    # Explicit raises instead of ``assert``: assertions are removed under -O.
    if batch_dim != 1:
        raise ValueError("Demo only works with batch size 1.")
    if channels not in (1, 3):
        raise ValueError("Input image is not 1 or 3 channeled image.")
    target_shape = (height, width)
    inputs = _list_input_images(args.input)

    log.info("Loading vocab file")
    vocab = Vocab(args.vocab_path)

    log.info("Loading and preprocessing images")
    images_list = []
    for filenm in tqdm(inputs):
        image_raw = cv.imread(filenm)
        if image_raw is None:
            raise RuntimeError("Error reading image {}".format(filenm))
        image = preprocess_image(PREPROCESSING[args.preprocessing_type], image_raw, target_shape)
        images_list.append((filenm, image))

    log.info("Loading networks")
    exec_net_encoder = ie.load_network(network=encoder, device_name=args.device)
    exec_net_decoder = ie.load_network(network=dec_step, device_name=args.device)

    log.info("Starting inference")
    for img_name, image in tqdm(images_list):
        enc_res = exec_net_encoder.infer(inputs={args.imgs_layer: image})
        logits = _run_decoder(exec_net_decoder, enc_res, args)

        if args.perf_counts:
            log.info("Encoder performance statistics")
            print_stats(exec_net_encoder)
            log.info("Decoder performance statistics")
            print_stats(exec_net_decoder)

        # Stack the per-step logits and drop the size-1 axis 1 before taking
        # the arg-max per step (presumably (steps, 1, vocab) -> (steps, vocab);
        # TODO confirm against the decoder's logit output shape).
        logits = np.array(logits).squeeze(axis=1)
        targets = np.argmax(logits, axis=1)
        phrase = vocab.construct_phrase(targets)

        if args.output_file:
            # Append mode: results of successive images (and runs) accumulate.
            with open(args.output_file, 'a') as output_file:
                output_file.write(img_name + '\t' + phrase + '\n')
        else:
            print("Image name: {}\nFormula: {}\n".format(img_name, phrase))

    log.info(
        "This demo is an API example, for any performance measurements please use the dedicated benchmark_app tool "
        "from the openVINO toolkit\n")