Example No. 1
    def load_model(self, argv=None):
        """Build the detection and recognition graphs, create their sessions,
        and restore both checkpoints."""
        os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        try:
            os.makedirs(FLAGS.words_path)
        except OSError as e:
            if e.errno != 17:  # errno 17 == EEXIST: words_path already exists
                raise

        # recognition Graph
        with self.recognition_graph.as_default():
            model_config, _, _ = self.get_configs_from_exp_dir()
            atser_model = model_builder.build(model_config, is_training=False)
            self.input_image_str_tensor = tf.placeholder(
                dtype=tf.string,
                shape=[])
            input_image_tensor = tf.image.decode_jpeg(
                self.input_image_str_tensor,
                channels=3,
            )
            resized_image_tensor = tf.image.resize_images(
                tf.to_float(input_image_tensor),
                [64, 256])
            predictions_dict = atser_model.predict(tf.expand_dims(resized_image_tensor, 0))
            recognitions = atser_model.postprocess(predictions_dict)
            recognition_text = recognitions['text'][0]

            self.recognition_saver = tf.train.Saver(tf.global_variables())
            recognition_checkpoint = os.path.join(FLAGS.exp_dir, 'log/model.ckpt')

            self.fetches = {
                'original_image': input_image_tensor,
                'recognition_text': recognition_text,
                'control_points': predictions_dict['control_points'],
                'rectified_images': predictions_dict['rectified_images'],
            }
            self.init1 = tf.global_variables_initializer()
            self.init2 = tf.local_variables_initializer()
            self.init3 = tf.tables_initializer()
        # detection Graph
        with self.detection_graph.as_default():
            self.input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images')
            self.global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0),
                                               trainable=False)

            self.f_score, self.f_geometry = model.detection.model_dc.model(self.input_images, is_training=False)
            self.variable_averages = tf.train.ExponentialMovingAverage(0.997, self.global_step)
            self.detection_saver = tf.train.Saver(self.variable_averages.variables_to_restore())

        self.detection_sess = tf.Session(graph=self.detection_graph, config=tf.ConfigProto(allow_soft_placement=True))
        detection_ckpt_state = tf.train.get_checkpoint_state(checkpoint_dir=FLAGS.checkpoint_path)
        detection_model_path = os.path.join(FLAGS.checkpoint_path,
                                            os.path.basename(detection_ckpt_state.model_checkpoint_path))
        print('Restore from {}'.format(detection_model_path))
        self.detection_saver.restore(self.detection_sess, detection_model_path)
        self.recognition_sess = tf.Session(config=config, graph=self.recognition_graph)

        self.recognition_sess.run([self.init1, self.init2, self.init3])
        self.recognition_saver.restore(self.recognition_sess, recognition_checkpoint)
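A hypothetical helper method (not part of the original class; the method name and image path are illustrative) shows how the session, placeholder, and fetches stored by load_model would be used to recognize a single cropped word:

    def recognize_word(self, image_path):
        """Sketch only: run the recognition graph on one cropped word image,
        assuming load_model() has already been called."""
        with open(image_path, 'rb') as f:
            image_bytes = f.read()
        outputs = self.recognition_sess.run(
            self.fetches,
            feed_dict={self.input_image_str_tensor: image_bytes})
        return outputs['recognition_text'].decode('utf-8')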
Example No. 2
 def test_stn_multi_predictor_model_inference(self):
     model_proto = model_pb2.Model()
     text_format.Merge(STN_MULTIPLE_PREDICTOR_MODEL_TEXT_PROTO, model_proto)
     model_object = model_builder.build(model_proto, False)
     test_groundtruth_text_list = [
         tf.constant(b'hello', dtype=tf.string),
         tf.constant(b'world', dtype=tf.string)
     ]
     model_object.provide_groundtruth(
         {'groundtruth_text': test_groundtruth_text_list})
     test_input_image = tf.random_uniform(shape=[2, 32, 100, 3],
                                          minval=0,
                                          maxval=255,
                                          dtype=tf.float32,
                                          seed=1)
     prediction_dict = model_object.predict(
         model_object.preprocess(test_input_image))
     recognition_dict = model_object.postprocess(prediction_dict)
     with self.test_session() as sess:
         sess.run(
             [tf.global_variables_initializer(),
              tf.tables_initializer()])
         outputs = sess.run(recognition_dict)
         print(outputs)
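As a follow-up sketch, the decoded strings can be pulled out of the postprocessed dictionary; the 'text' key is taken from the inference code in Examples 1 and 5, not from this test, so treat it as an assumption:

decoded = [t.decode('utf-8') for t in outputs['text']]  # assumed key: 'text'
print(decoded)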
Example No. 3
 def test_single_predictor_model_training(self):
     model_proto = model_pb2.Model()
     text_format.Merge(SINGLE_PREDICTOR_MODEL_TEXT_PROTO, model_proto)
     model_object = model_builder.build(model_proto, True)
     test_groundtruth_text_list = [
         tf.constant(b'hello', dtype=tf.string),
         tf.constant(b'world', dtype=tf.string)
     ]
     model_object.provide_groundtruth(
         {'groundtruth_text': test_groundtruth_text_list})
     test_input_image = tf.random_uniform(shape=[2, 32, 100, 3],
                                          minval=0,
                                          maxval=255,
                                          dtype=tf.float32,
                                          seed=1)
     prediction_dict = model_object.predict(
         model_object.preprocess(test_input_image))
     loss = model_object.loss(prediction_dict)
     with self.test_session() as sess:
         sess.run(
             [tf.global_variables_initializer(),
              tf.tables_initializer()])
         outputs = sess.run({'loss': loss})
         print(outputs['loss'])
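The test stops at evaluating the loss. A minimal continuation sketch (standard TF 1.x, not part of the original test) would hand that scalar to an optimizer; the learning rate and step count are illustrative:

optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
train_op = optimizer.minimize(loss)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    for step in range(10):
        _, loss_value = sess.run([train_op, loss])  # one optimization step
        print('step {}: loss = {}'.format(step, loss_value))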
Example No. 4
  def test_build_attention_model_single_branch(self):
    model_text_proto = """
    attention_recognition_model {
      feature_extractor {
        convnet {
          crnn_net {
            net_type: SINGLE_BRANCH
            conv_hyperparams {
              op: CONV
              regularizer { l2_regularizer { weight: 1e-4 } }
              initializer { variance_scaling_initializer { } }
              batch_norm { }
            }
            summarize_activations: false
          }
        }
        bidirectional_rnn {
          fw_bw_rnn_cell {
            lstm_cell {
              num_units: 256
              forget_bias: 1.0
              initializer { orthogonal_initializer {} }
            }
          }
          rnn_regularizer { l2_regularizer { weight: 1e-4 } }
          num_output_units: 256
          fc_hyperparams {
            op: FC
            activation: RELU
            initializer { variance_scaling_initializer { } }
            regularizer { l2_regularizer { weight: 1e-4 } }
          }
        }
        summarize_activations: true
      }

      predictor {
        name: "ForwardPredictor"
        bahdanau_attention_predictor {
          reverse: false
          rnn_cell {
            lstm_cell {
              num_units: 256
              forget_bias: 1.0
              initializer { orthogonal_initializer { } }
            }
          }
          rnn_regularizer { l2_regularizer { weight: 1e-4 } }
          num_attention_units: 128
          max_num_steps: 10
          multi_attention: false
          beam_width: 1
          label_map {
            character_set {
              text_string: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
              delimiter: ""
            }
            label_offset: 2
          }
          loss {
            sequence_cross_entropy_loss {
              sequence_normalize: false
              sample_normalize: true
            }
          }
        }
      }
    }
    """
    model_proto = model_pb2.Model()
    text_format.Merge(model_text_proto, model_proto)
    model_object = model_builder.build(model_proto, True)

    test_groundtruth_text_list = [
      tf.constant(b'hello', dtype=tf.string),
      tf.constant(b'world', dtype=tf.string)]
    model_object.provide_groundtruth(test_groundtruth_text_list)
    test_input_image = tf.random_uniform(
      shape=[2, 32, 100, 3], minval=0, maxval=255,
      dtype=tf.float32, seed=1)
    prediction_dict = model_object.predict(model_object.preprocess(test_input_image))
    loss = model_object.loss(prediction_dict)

    with self.test_session() as sess:
      sess.run([
        tf.global_variables_initializer(),
        tf.tables_initializer()])
      outputs = sess.run({'loss': loss})
      print(outputs['loss'])
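The same text-proto format can also live in a config file on disk. A small sketch (the helper name and path argument are illustrative, not part of the original test suite):

def build_model_from_config(config_path, is_training=False):
    """Sketch: read a Model text proto from disk and build the model."""
    model_proto = model_pb2.Model()
    with open(config_path, 'r') as f:
        text_format.Merge(f.read(), model_proto)
    return model_builder.build(model_proto, is_training=is_training)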
Example No. 5
def main(argv=None):
    """Run multi-scale text detection on each input image, crop and mask every
    detected word, recognize it, and write the results to a JSON file."""
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    try:
        os.makedirs(FLAGS.words_path)
    except OSError as e:
        if e.errno != 17:  # errno 17 == EEXIST: words_path already exists
            raise

    recognition_graph = tf.Graph()
    detection_graph = tf.Graph()
    # recognition Graph
    with recognition_graph.as_default():
        model_config, _, _ = get_configs_from_exp_dir()
        atser_model = model_builder.build(model_config, is_training=False)
        input_image_str_tensor = tf.placeholder(dtype=tf.string, shape=[])
        input_image_tensor = tf.image.decode_jpeg(
            input_image_str_tensor,
            channels=3,
        )
        resized_image_tensor = tf.image.resize_images(
            tf.to_float(input_image_tensor), [64, 256])
        predictions_dict = atser_model.predict(
            tf.expand_dims(resized_image_tensor, 0))
        recognitions = atser_model.postprocess(predictions_dict)
        recognition_text = recognitions['text'][0]

        recognition_saver = tf.train.Saver(tf.global_variables())
        recognition_checkpoint = os.path.join(FLAGS.exp_dir, 'log/model.ckpt')

        fetches = {
            'original_image': input_image_tensor,
            'recognition_text': recognition_text,
            'control_points': predictions_dict['control_points'],
            'rectified_images': predictions_dict['rectified_images'],
        }
    # detection Graph
    with detection_graph.as_default():
        input_images = tf.placeholder(tf.float32,
                                      shape=[None, None, None, 3],
                                      name='input_images')
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        f_score, f_geometry = model.detection.model_dc.model(input_images,
                                                             is_training=False)
        variable_averages = tf.train.ExponentialMovingAverage(
            0.997, global_step)
        detection_saver = tf.train.Saver(
            variable_averages.variables_to_restore())

    with tf.Session(graph=detection_graph,
                    config=tf.ConfigProto(
                        allow_soft_placement=True)) as detection_sess:
        detection_ckpt_state = tf.train.get_checkpoint_state(
            checkpoint_dir=FLAGS.checkpoint_path)
        detection_model_path = os.path.join(
            FLAGS.checkpoint_path,
            os.path.basename(detection_ckpt_state.model_checkpoint_path))
        print('Restore from {}'.format(detection_model_path))
        detection_saver.restore(detection_sess, detection_model_path)

        with tf.Session(config=config,
                        graph=recognition_graph) as recognition_sess:
            recognition_sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            recognition_saver.restore(recognition_sess, recognition_checkpoint)

            json_dict = {}
            im_fn_list = get_images()
            for im_fn in im_fn_list:
                print(im_fn)

                image_name = im_fn.split("/")[-1]
                im = cv2.imread(im_fn)[:, :, ::-1]

                im_resized, (ratio_h,
                             ratio_w) = resize_image(im, max_side_len=1280)
                boxes = None
                boxes_scores = None
                timer = {'net': 0, 'restore': 0, 'nms': 0}
                # start = time.time()
                score, geometry = detection_sess.run(
                    [f_score, f_geometry],
                    feed_dict={input_images: [im_resized]})

                boxes_0 = detect(score_map=score, geo_map=geometry)

                if boxes_0 is not None:
                    boxes_scores = boxes_0[:, 8]
                    boxes = boxes_0[:, :8].reshape((-1, 4, 2))
                    boxes[:, :, 0] /= ratio_w
                    boxes[:, :, 1] /= ratio_h

                im_resized, (ratio_h, ratio_w) = resize_image(im,
                                                              max_side_len=896)

                score, geometry = detection_sess.run(
                    [f_score, f_geometry],
                    feed_dict={input_images: [im_resized]})

                boxes_1 = detect(score_map=score, geo_map=geometry)

                if boxes_1 is not None:

                    boxes_scores_1 = boxes_1[:, 8] * 2
                    boxes_1 = boxes_1[:, :8].reshape((-1, 4, 2))
                    boxes_1[:, :, 0] /= ratio_w
                    boxes_1[:, :, 1] /= ratio_h
                    if boxes is not None:
                        boxes_scores = np.concatenate(
                            (boxes_scores, boxes_scores_1), axis=0)
                        boxes = np.concatenate((boxes, boxes_1), axis=0)
                    else:
                        boxes_scores = boxes_scores_1
                        boxes = boxes_1

                im_resized, (ratio_h,
                             ratio_w) = resize_image(im, max_side_len=1024)

                score, geometry = detection_sess.run(
                    [f_score, f_geometry],
                    feed_dict={input_images: [im_resized]})

                boxes_2 = detect(score_map=score, geo_map=geometry)

                if boxes_2 is not None:
                    boxes_scores_2 = boxes_2[:, 8] * 4
                    boxes_2 = boxes_2[:, :8].reshape((-1, 4, 2))
                    boxes_2[:, :, 0] /= ratio_w
                    boxes_2[:, :, 1] /= ratio_h
                    if boxes is not None:
                        boxes_scores = np.concatenate(
                            (boxes_scores, boxes_scores_2), axis=0)
                        boxes = np.concatenate((boxes, boxes_2), axis=0)
                    else:
                        boxes_scores = boxes_scores_2
                        boxes = boxes_2

                if boxes is not None:
                    boxes = ms_standard_nms(boxes, boxes_scores, 0.5)

                if boxes is None:
                    json_dict[image_name] = {"box_num": 0, "annotations": []}
                    print(json_dict)
                    continue

                annotations_list = []
                word_count = 1
                for box in boxes:
                    # to avoid submitting errors
                    box = sort_poly(box.astype(np.int32))
                    if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(
                            box[3] - box[0]) < 5:
                        continue
                    # clamp coordinates to the image (negative values become 0)
                    x1, y1 = max(box[0, 0], 0), max(box[0, 1], 0)
                    x2, y2 = max(box[1, 0], 0), max(box[1, 1], 0)
                    x3, y3 = max(box[2, 0], 0), max(box[2, 1], 0)
                    x4, y4 = max(box[3, 0], 0), max(box[3, 1], 0)

                    x_min = min(x1, x2, x3, x4)
                    x_max = max(x1, x2, x3, x4)
                    y_min = min(y1, y2, y3, y4)
                    y_max = max(y1, y2, y3, y4)
                    word_background = np.zeros(
                        (np.int32(y_max - y_min), np.int32(x_max - x_min)),
                        dtype=np.int32)
                    poly_area = np.array([[x1 - x_min, y1 - y_min],
                                          [x2 - x_min, y2 - y_min],
                                          [x3 - x_min, y3 - y_min],
                                          [x4 - x_min, y4 - y_min]])
                    cv2.fillPoly(word_background, np.int32([poly_area]), 1)
                    word_area = np.copy(im[y_min:y_max, x_min:x_max])
                    word_name = re.sub(".jpg",
                                       "_word_" + str(word_count) + ".jpg",
                                       im_fn)
                    word_name = FLAGS.words_path + word_name.split("/")[-1]
                    try:
                        word_area[:, :, 0] *= np.uint8(word_background)
                        word_area[:, :, 1] *= np.uint8(word_background)
                        word_area[:, :, 2] *= np.uint8(word_background)

                    except Exception as e:
                        print(
                            '\033[0;31m', word_name,
                            "masking failed; saving the unmasked crop instead.",
                            "\033[0m", e)
                        print(word_area.shape, box)
                    cv2.imwrite(filename=word_name, img=word_area)
                    word_count += 1
                    with open(word_name, "rb") as f:
                        input_image_str = f.read()
                    sess_outputs = recognition_sess.run(
                        fetches,
                        feed_dict={input_image_str_tensor: input_image_str})
                    annotations_list.append({
                        "text":
                        sess_outputs['recognition_text'].decode('utf-8'),
                        "bbox": [
                            int(x4),
                            int(y4),
                            int(x1),
                            int(y1),
                            int(x2),
                            int(y2),
                            int(x3),
                            int(y3)
                        ]
                    })
                json_dict[image_name] = {
                    "box_num": boxes.shape[0],
                    "annotations": annotations_list
                }
            # print(json_dict)
            with open("scenetext_result/scenetext_result.json", "w") as f:
                json.dump(json_dict, f)
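The three per-scale detection passes above repeat the same resize / run / rescale pattern with score weights 1, 2, and 4. A refactoring sketch (the helper name is illustrative; it assumes numpy as np plus the script's resize_image, detect, and ms_standard_nms are available, and takes the detection tensors as arguments):

def multi_scale_detect(sess, im, input_images, f_score, f_geometry,
                       scales=((1280, 1), (896, 2), (1024, 4))):
    """Sketch: run detection at several max side lengths and fuse the results
    with the same weighted NMS the script uses."""
    all_boxes, all_scores = [], []
    for max_side, weight in scales:
        im_resized, (ratio_h, ratio_w) = resize_image(im, max_side_len=max_side)
        score, geometry = sess.run([f_score, f_geometry],
                                   feed_dict={input_images: [im_resized]})
        detected = detect(score_map=score, geo_map=geometry)
        if detected is None:
            continue
        quads = detected[:, :8].reshape((-1, 4, 2))
        quads[:, :, 0] /= ratio_w  # map back to original image coordinates
        quads[:, :, 1] /= ratio_h
        all_boxes.append(quads)
        all_scores.append(detected[:, 8] * weight)
    if not all_boxes:
        return None
    return ms_standard_nms(np.concatenate(all_boxes, axis=0),
                           np.concatenate(all_scores, axis=0), 0.5)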