def load_model(self, argv=None):
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    try:
        os.makedirs(FLAGS.words_path)
    except OSError as e:
        if e.errno != 17:  # 17 == EEXIST: the directory already exists
            raise

    # Recognition graph.
    with self.recognition_graph.as_default():
        model_config, _, _ = self.get_configs_from_exp_dir()
        atser_model = model_builder.build(model_config, is_training=False)

        self.input_image_str_tensor = tf.placeholder(dtype=tf.string, shape=[])
        input_image_tensor = tf.image.decode_jpeg(
            self.input_image_str_tensor, channels=3)
        resized_image_tensor = tf.image.resize_images(
            tf.to_float(input_image_tensor), [64, 256])

        predictions_dict = atser_model.predict(
            tf.expand_dims(resized_image_tensor, 0))
        recognitions = atser_model.postprocess(predictions_dict)
        recognition_text = recognitions['text'][0]

        self.recognition_saver = tf.train.Saver(tf.global_variables())
        recognition_checkpoint = os.path.join(FLAGS.exp_dir, 'log/model.ckpt')

        self.fetches = {
            'original_image': input_image_tensor,
            'recognition_text': recognition_text,
            'control_points': predictions_dict['control_points'],
            'rectified_images': predictions_dict['rectified_images'],
        }
        # The initializer ops must be created inside the recognition graph so
        # they can be run by the recognition session below.
        self.init1 = tf.global_variables_initializer()
        self.init2 = tf.local_variables_initializer()
        self.init3 = tf.tables_initializer()

    # Detection graph.
    with self.detection_graph.as_default():
        self.input_images = tf.placeholder(
            tf.float32, shape=[None, None, None, 3], name='input_images')
        self.global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)
        self.f_score, self.f_geometry = model.detection.model_dc.model(
            self.input_images, is_training=False)
        self.variable_averages = tf.train.ExponentialMovingAverage(
            0.997, self.global_step)
        self.detection_saver = tf.train.Saver(
            self.variable_averages.variables_to_restore())

    # Restore the detection checkpoint.
    self.detection_sess = tf.Session(
        graph=self.detection_graph,
        config=tf.ConfigProto(allow_soft_placement=True))
    detection_ckpt_state = tf.train.get_checkpoint_state(
        checkpoint_dir=FLAGS.checkpoint_path)
    detection_model_path = os.path.join(
        FLAGS.checkpoint_path,
        os.path.basename(detection_ckpt_state.model_checkpoint_path))
    print('Restore from {}'.format(detection_model_path))
    self.detection_saver.restore(self.detection_sess, detection_model_path)

    # Initialize and restore the recognition checkpoint.
    self.recognition_sess = tf.Session(config=config, graph=self.recognition_graph)
    self.recognition_sess.run([self.init1, self.init2, self.init3])
    self.recognition_saver.restore(self.recognition_sess, recognition_checkpoint)
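
# A minimal sketch of how the sessions restored in load_model might be driven
# for a single cropped word image. The helper name run_recognition and its
# file-path argument are hypothetical; the tensors and fetches are exactly the
# attributes created above.
def run_recognition(self, word_image_path):
    # Feed one JPEG-encoded crop through the restored recognition graph and
    # return the decoded text together with the rectified image.
    with open(word_image_path, 'rb') as f:
        input_image_str = f.read()
    sess_outputs = self.recognition_sess.run(
        self.fetches,
        feed_dict={self.input_image_str_tensor: input_image_str})
    return (sess_outputs['recognition_text'].decode('utf-8'),
            sess_outputs['rectified_images'])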
def test_stn_multi_predictor_model_inference(self):
    model_proto = model_pb2.Model()
    text_format.Merge(STN_MULTIPLE_PREDICTOR_MODEL_TEXT_PROTO, model_proto)
    model_object = model_builder.build(model_proto, False)

    test_groundtruth_text_list = [
        tf.constant(b'hello', dtype=tf.string),
        tf.constant(b'world', dtype=tf.string)
    ]
    model_object.provide_groundtruth(
        {'groundtruth_text': test_groundtruth_text_list})
    test_input_image = tf.random_uniform(
        shape=[2, 32, 100, 3], minval=0, maxval=255, dtype=tf.float32, seed=1)
    prediction_dict = model_object.predict(
        model_object.preprocess(test_input_image))
    recognition_dict = model_object.postprocess(prediction_dict)

    with self.test_session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        outputs = sess.run(recognition_dict)
        print(outputs)
def test_single_predictor_model_training(self):
    model_proto = model_pb2.Model()
    text_format.Merge(SINGLE_PREDICTOR_MODEL_TEXT_PROTO, model_proto)
    model_object = model_builder.build(model_proto, True)

    test_groundtruth_text_list = [
        tf.constant(b'hello', dtype=tf.string),
        tf.constant(b'world', dtype=tf.string)
    ]
    model_object.provide_groundtruth(
        {'groundtruth_text': test_groundtruth_text_list})
    test_input_image = tf.random_uniform(
        shape=[2, 32, 100, 3], minval=0, maxval=255, dtype=tf.float32, seed=1)
    prediction_dict = model_object.predict(
        model_object.preprocess(test_input_image))
    loss = model_object.loss(prediction_dict)

    with self.test_session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        outputs = sess.run({'loss': loss})
        print(outputs['loss'])
def test_build_attention_model_single_branch(self):
    model_text_proto = """
    attention_recognition_model {
      feature_extractor {
        convnet {
          crnn_net {
            net_type: SINGLE_BRANCH
            conv_hyperparams {
              op: CONV
              regularizer { l2_regularizer { weight: 1e-4 } }
              initializer { variance_scaling_initializer { } }
              batch_norm { }
            }
            summarize_activations: false
          }
        }
        bidirectional_rnn {
          fw_bw_rnn_cell {
            lstm_cell {
              num_units: 256
              forget_bias: 1.0
              initializer { orthogonal_initializer {} }
            }
          }
          rnn_regularizer { l2_regularizer { weight: 1e-4 } }
          num_output_units: 256
          fc_hyperparams {
            op: FC
            activation: RELU
            initializer { variance_scaling_initializer { } }
            regularizer { l2_regularizer { weight: 1e-4 } }
          }
        }
        summarize_activations: true
      }
      predictor {
        name: "ForwardPredictor"
        bahdanau_attention_predictor {
          reverse: false
          rnn_cell {
            lstm_cell {
              num_units: 256
              forget_bias: 1.0
              initializer { orthogonal_initializer { } }
            }
          }
          rnn_regularizer { l2_regularizer { weight: 1e-4 } }
          num_attention_units: 128
          max_num_steps: 10
          multi_attention: false
          beam_width: 1
          label_map {
            character_set {
              text_string: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
              delimiter: ""
            }
            label_offset: 2
          }
          loss {
            sequence_cross_entropy_loss {
              sequence_normalize: false
              sample_normalize: true
            }
          }
        }
      }
    }
    """
    model_proto = model_pb2.Model()
    text_format.Merge(model_text_proto, model_proto)
    model_object = model_builder.build(model_proto, True)

    test_groundtruth_text_list = [
        tf.constant(b'hello', dtype=tf.string),
        tf.constant(b'world', dtype=tf.string)]
    model_object.provide_groundtruth(test_groundtruth_text_list)
    test_input_image = tf.random_uniform(
        shape=[2, 32, 100, 3], minval=0, maxval=255, dtype=tf.float32, seed=1)
    prediction_dict = model_object.predict(
        model_object.preprocess(test_input_image))
    loss = model_object.loss(prediction_dict)

    with self.test_session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        outputs = sess.run({'loss': loss})
        print(outputs['loss'])
def main(argv=None):
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    try:
        os.makedirs(FLAGS.words_path)
    except OSError as e:
        if e.errno != 17:  # 17 == EEXIST: the directory already exists
            raise

    recognition_graph = tf.Graph()
    detection_graph = tf.Graph()

    # Recognition graph.
    with recognition_graph.as_default():
        model_config, _, _ = get_configs_from_exp_dir()
        atser_model = model_builder.build(model_config, is_training=False)
        input_image_str_tensor = tf.placeholder(dtype=tf.string, shape=[])
        input_image_tensor = tf.image.decode_jpeg(
            input_image_str_tensor, channels=3)
        resized_image_tensor = tf.image.resize_images(
            tf.to_float(input_image_tensor), [64, 256])
        predictions_dict = atser_model.predict(
            tf.expand_dims(resized_image_tensor, 0))
        recognitions = atser_model.postprocess(predictions_dict)
        recognition_text = recognitions['text'][0]
        recognition_saver = tf.train.Saver(tf.global_variables())
        recognition_checkpoint = os.path.join(FLAGS.exp_dir, 'log/model.ckpt')
        fetches = {
            'original_image': input_image_tensor,
            'recognition_text': recognition_text,
            'control_points': predictions_dict['control_points'],
            'rectified_images': predictions_dict['rectified_images'],
        }

    # Detection graph.
    with detection_graph.as_default():
        input_images = tf.placeholder(
            tf.float32, shape=[None, None, None, 3], name='input_images')
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)
        f_score, f_geometry = model.detection.model_dc.model(
            input_images, is_training=False)
        variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
        detection_saver = tf.train.Saver(variable_averages.variables_to_restore())

    with tf.Session(graph=detection_graph,
                    config=tf.ConfigProto(allow_soft_placement=True)) as detection_sess:
        detection_ckpt_state = tf.train.get_checkpoint_state(
            checkpoint_dir=FLAGS.checkpoint_path)
        detection_model_path = os.path.join(
            FLAGS.checkpoint_path,
            os.path.basename(detection_ckpt_state.model_checkpoint_path))
        print('Restore from {}'.format(detection_model_path))
        detection_saver.restore(detection_sess, detection_model_path)

        with tf.Session(config=config, graph=recognition_graph) as recognition_sess:
            recognition_sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            recognition_saver.restore(recognition_sess, recognition_checkpoint)

            json_dict = {}
            im_fn_list = get_images()
            for im_fn in im_fn_list:
                print(im_fn)
                image_name = im_fn.split("/")[-1]
                im = cv2.imread(im_fn)[:, :, ::-1]
                if im.shape[0] > 1280 or im.shape[1] > 1280:
                    # No special handling: resize_image below caps the side length.
                    pass

                # Multi-scale detection: run the detector at three max side
                # lengths, weight each scale's scores, then merge with NMS.
                im_resized, (ratio_h, ratio_w) = resize_image(im, max_side_len=1280)
                boxes = None
                boxes_scores = None
                timer = {'net': 0, 'restore': 0, 'nms': 0}
                # start = time.time()
                score, geometry = detection_sess.run(
                    [f_score, f_geometry], feed_dict={input_images: [im_resized]})
                boxes_0 = detect(score_map=score, geo_map=geometry)
                if boxes_0 is not None:
                    boxes_scores = boxes_0[:, 8]
                    boxes = boxes_0[:, :8].reshape((-1, 4, 2))
                    boxes[:, :, 0] /= ratio_w
                    boxes[:, :, 1] /= ratio_h

                im_resized, (ratio_h, ratio_w) = resize_image(im, max_side_len=896)
                score, geometry = detection_sess.run(
                    [f_score, f_geometry], feed_dict={input_images: [im_resized]})
                boxes_1 = detect(score_map=score, geo_map=geometry)
                if boxes_1 is not None:
                    boxes_scores_1 = boxes_1[:, 8] * 2
                    boxes_1 = boxes_1[:, :8].reshape((-1, 4, 2))
                    boxes_1[:, :, 0] /= ratio_w
                    boxes_1[:, :, 1] /= ratio_h
                    if boxes is not None:
                        boxes_scores = np.concatenate((boxes_scores, boxes_scores_1), axis=0)
                        boxes = np.concatenate((boxes, boxes_1), axis=0)
                    else:
                        boxes_scores = boxes_scores_1
                        boxes = boxes_1

                im_resized, (ratio_h, ratio_w) = resize_image(im, max_side_len=1024)
                score, geometry = detection_sess.run(
                    [f_score, f_geometry], feed_dict={input_images: [im_resized]})
                boxes_2 = detect(score_map=score, geo_map=geometry)
                if boxes_2 is not None:
                    boxes_scores_2 = boxes_2[:, 8] * 4
                    boxes_2 = boxes_2[:, :8].reshape((-1, 4, 2))
                    boxes_2[:, :, 0] /= ratio_w
                    boxes_2[:, :, 1] /= ratio_h
                    if boxes is not None:
                        boxes_scores = np.concatenate((boxes_scores, boxes_scores_2), axis=0)
                        boxes = np.concatenate((boxes, boxes_2), axis=0)
                    else:
                        boxes_scores = boxes_scores_2
                        boxes = boxes_2

                if boxes is not None:
                    boxes = ms_standard_nms(boxes, boxes_scores, 0.5)

                if boxes is None:
                    json_dict[image_name] = {"box_num": 0, "annotations": []}
                    print(json_dict)
                    continue

                annotations_list = []
                word_count = 1
                for box in boxes:
                    # Skip degenerate boxes to avoid submission errors.
                    box = sort_poly(box.astype(np.int32))
                    if (np.linalg.norm(box[0] - box[1]) < 5
                            or np.linalg.norm(box[3] - box[0]) < 5):
                        continue

                    # Clip the four corners to non-negative coordinates.
                    x1 = box[0, 0] if box[0, 0] > 0 else 0
                    y1 = box[0, 1] if box[0, 1] > 0 else 0
                    x2 = box[1, 0] if box[1, 0] > 0 else 0
                    y2 = box[1, 1] if box[1, 1] > 0 else 0
                    x3 = box[2, 0] if box[2, 0] > 0 else 0
                    y3 = box[2, 1] if box[2, 1] > 0 else 0
                    x4 = box[3, 0] if box[3, 0] > 0 else 0
                    y4 = box[3, 1] if box[3, 1] > 0 else 0
                    x_min = min(x1, x2, x3, x4)
                    x_max = max(x1, x2, x3, x4)
                    y_min = min(y1, y2, y3, y4)
                    y_max = max(y1, y2, y3, y4)

                    # Binary mask of the quadrilateral inside its bounding rectangle.
                    word_background = np.zeros(
                        (np.int32(y_max - y_min), np.int32(x_max - x_min)),
                        dtype=np.int32)
                    poly_area = np.array([[x1 - x_min, y1 - y_min],
                                          [x2 - x_min, y2 - y_min],
                                          [x3 - x_min, y3 - y_min],
                                          [x4 - x_min, y4 - y_min]])
                    cv2.fillPoly(word_background, np.int32([poly_area]), 1)

                    word_area = np.copy(im[y_min:y_max, x_min:x_max])
                    word_name = re.sub(".jpg", "_word_" + str(word_count) + ".jpg", im_fn)
                    word_name = FLAGS.words_path + word_name.split("/")[-1]
                    try:
                        # Zero out pixels outside the quadrilateral.
                        word_area[:, :, 0] *= np.uint8(word_background)
                        word_area[:, :, 1] *= np.uint8(word_background)
                        word_area[:, :, 2] *= np.uint8(word_background)
                    except Exception as e:
                        print('\033[0;31m', word_name,
                              "Masking failed, storing the original cropped patch.",
                              '\033[0m', e)
                        print(word_area.shape, box)
                    cv2.imwrite(filename=word_name, img=word_area)
                    word_count += 1

                    # Recognize the cropped word image.
                    with open(word_name, "rb") as f:
                        input_image_str = f.read()
                    sess_outputs = recognition_sess.run(
                        fetches, feed_dict={input_image_str_tensor: input_image_str})
                    annotations_list.append({
                        "text": sess_outputs['recognition_text'].decode('utf-8'),
                        "bbox": [int(x4), int(y4), int(x1), int(y1),
                                 int(x2), int(y2), int(x3), int(y3)]
                    })

                json_dict[image_name] = {
                    "box_num": boxes.shape[0],
                    "annotations": annotations_list
                }
                # print(json_dict)

            with open("scenetext_result/scenetext_result.json", "w") as f:
                json.dump(json_dict, f)
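
# A minimal sketch of the multi-scale detection pattern used in main(): one
# detector pass per max side length, per-scale score weighting, and a final
# ms_standard_nms merge. The helper name multi_scale_detect and its argument
# list are hypothetical; resize_image, detect and ms_standard_nms are the
# module helpers already used above.
def multi_scale_detect(detection_sess, im, input_images, f_score, f_geometry,
                       scales=((1280, 1.0), (896, 2.0), (1024, 4.0)),
                       nms_thresh=0.5):
    all_boxes, all_scores = None, None
    for max_side_len, weight in scales:
        im_resized, (ratio_h, ratio_w) = resize_image(im, max_side_len=max_side_len)
        score, geometry = detection_sess.run(
            [f_score, f_geometry], feed_dict={input_images: [im_resized]})
        raw = detect(score_map=score, geo_map=geometry)
        if raw is None:
            continue
        scores = raw[:, 8] * weight            # heavier weight for later scales
        boxes = raw[:, :8].reshape((-1, 4, 2))
        boxes[:, :, 0] /= ratio_w              # map back to original image coords
        boxes[:, :, 1] /= ratio_h
        if all_boxes is None:
            all_boxes, all_scores = boxes, scores
        else:
            all_boxes = np.concatenate((all_boxes, boxes), axis=0)
            all_scores = np.concatenate((all_scores, scores), axis=0)
    if all_boxes is None:
        return None
    return ms_standard_nms(all_boxes, all_scores, nms_thresh)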
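
# The per-word masking inside the box loop (bounding-rectangle crop plus a
# cv2.fillPoly mask) is easier to see in isolation. A self-contained sketch of
# the same idea with the hypothetical helper name crop_quad; clipping the
# quadrilateral to the image bounds up front avoids the shape mismatch that
# the try/except in main() guards against.
def crop_quad(image, quad):
    """Crop the axis-aligned patch around a (4, 2) quadrilateral of (x, y)
    points and zero out the pixels that fall outside it."""
    h, w = image.shape[:2]
    quad = np.int32(quad)
    x_min, y_min = np.clip(quad.min(axis=0), 0, [w - 1, h - 1])
    x_max, y_max = np.clip(quad.max(axis=0), 0, [w - 1, h - 1])
    patch = np.copy(image[y_min:y_max, x_min:x_max])
    mask = np.zeros(patch.shape[:2], dtype=np.uint8)
    cv2.fillPoly(mask, [np.int32(quad - [x_min, y_min])], 1)
    return patch * mask[:, :, None]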