Пример #1
0
def _decode_formula(exec_net_encoder, exec_net_decoder, image, args):
    """Encode one preprocessed image and greedily decode its token sequence.

    The encoder is run once; its outputs seed the decoder state. The decoder
    is then run step by step, each time feeding back the previous step's
    states, output and most probable token, until END_TOKEN is produced or
    ``args.max_formula_len`` steps have been executed.

    Args:
        exec_net_encoder: Loaded encoder network exposing ``infer(inputs=...)``.
        exec_net_decoder: Loaded decoder-step network exposing
            ``infer(inputs=...)``.
        image: Preprocessed image fed to the encoder input ``args.imgs_layer``.
        args: Parsed command-line arguments; supplies the layer names and
            ``max_formula_len``.

    Returns:
        Array with the argmax token index of every executed decoding step.

    Note:
        At least one decoding step must run (``args.max_formula_len >= 1``);
        with zero steps the collected logits are empty and the squeeze below
        fails.
    """
    enc_res = exec_net_encoder.infer(inputs={args.imgs_layer: image})
    # Encoder outputs: image features plus the initial decoder state.
    row_enc_out = enc_res[args.row_enc_out_layer]
    dec_states_h = enc_res[args.hidden_layer]
    dec_states_c = enc_res[args.context_layer]
    output = enc_res[args.init_0_layer]

    tgt = np.array([[START_TOKEN]])
    logits = []
    for _ in range(args.max_formula_len):
        dec_res = exec_net_decoder.infer(
            inputs={
                args.row_enc_out_layer: row_enc_out,
                args.dec_st_c_layer: dec_states_c,
                args.dec_st_h_layer: dec_states_h,
                args.output_prev_layer: output,
                args.tgt_layer: tgt
            })

        dec_states_h = dec_res[args.dec_st_h_t_layer]
        dec_states_c = dec_res[args.dec_st_c_t_layer]
        output = dec_res[args.output_layer]
        logit = dec_res[args.logit_layer]
        logits.append(logit)

        # Batch size is 1, so the first argmax entry is the predicted token.
        # Rebuild ``tgt`` with the same (1, 1) shape as the initial
        # start-token input instead of letting it grow an extra dimension.
        next_token = np.argmax(logit, axis=1)[0]
        tgt = np.array([[next_token]])

        if next_token == END_TOKEN:
            break

    # Stack per-step logits and drop the singleton batch axis before taking
    # the most probable token of each step.
    logits = np.array(logits).squeeze(axis=1)
    return np.argmax(logits, axis=1)


def main():
    """Run the formula recognition demo end to end.

    Steps: configure logging, parse the command line, read the encoder and
    decoder networks, load and preprocess the input image(s), run
    encoder/decoder inference per image and report the recognized formula
    either to ``args.output_file`` (appended, tab-separated) or to stdout.

    Raises:
        ValueError: If the encoder input is not batch size 1 with 1 or 3
            channels, or if an input image cannot be read.
    """
    log.basicConfig(format="[ %(levelname)s ] %(message)s",
                    level=log.INFO,
                    stream=sys.stdout)
    args = build_argparser().parse_args()
    log.info("Creating Inference Engine")
    ie = IECore()
    ie.set_config({"PERF_COUNT": "YES" if args.perf_counts else "NO"},
                  args.device)

    encoder = read_net(args.m_encoder, ie, args.device)
    dec_step = read_net(args.m_decoder, ie, args.device)

    batch_dim, channels, height, width = encoder.input_info[
        'imgs'].input_data.shape
    # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
    if batch_dim != 1:
        raise ValueError("Demo only works with batch size 1.")
    if channels not in (1, 3):
        raise ValueError("Input image is not 1 or 3 channeled image.")
    target_shape = (height, width)

    # A directory input expands to all of its entries in sorted order;
    # anything else is treated as a single image path.
    if os.path.isdir(args.input):
        inputs = sorted(
            os.path.join(args.input, inp) for inp in os.listdir(args.input))
    else:
        inputs = [args.input]
    log.info("Loading vocab file")
    vocab = Vocab(args.vocab_path)

    log.info("Loading and preprocessing images")
    images_list = []
    for filenm in tqdm(inputs):
        image_raw = cv.imread(filenm)
        if image_raw is None:
            raise ValueError("Error reading image {}".format(filenm))
        image = preprocess_image(PREPROCESSING[args.preprocessing_type],
                                 image_raw, target_shape)
        images_list.append(dict(img_name=filenm, img=image, formula=None))

    log.info("Loading networks")
    exec_net_encoder = ie.load_network(network=encoder,
                                       device_name=args.device)
    exec_net_decoder = ie.load_network(network=dec_step,
                                       device_name=args.device)

    log.info("Starting inference")
    for rec in tqdm(images_list):
        targets = _decode_formula(exec_net_encoder, exec_net_decoder,
                                  rec['img'], args)

        if args.perf_counts:
            log.info("Encoder performance statistics")
            print_stats(exec_net_encoder)
            log.info("Decoder performance statistics")
            print_stats(exec_net_decoder)

        formula = vocab.construct_phrase(targets)
        if args.output_file:
            # Append so results of all images (and earlier runs) accumulate.
            with open(args.output_file, 'a') as output_file:
                output_file.write(rec['img_name'] + '\t' + formula + '\n')
        else:
            print("Image name: {}\nFormula: {}\n".format(
                rec['img_name'], formula))

    log.info(
        "This demo is an API example, for any performance measurements please use the dedicated benchmark_app tool "
        "from the openVINO toolkit\n")