def generate_book_page_imgs(obj_num=10, text_type="horizontal", page_shape=None):
    """Generate synthetic book-page JPEG images and a matching tags file.

    Every page comes from ``create_book_page`` and is saved as
    ``book_page_<i>.jpg`` in the orientation-specific image directory. For
    each image one line is written to the tags file: the image name, a tab,
    and a JSON object with the keys ``text_bbox_list`` and ``split_pos_list``.
    The tags file is opened in write mode, so an existing file is replaced.

    Args:
        obj_num: Number of pages to generate.
        text_type: Text orientation, normalised through ``check_text_type``.
        page_shape: Shape handed to ``create_book_page`` for every page. When
            ``None``, a random shape is drawn for each page.

    Raises:
        ValueError: If the normalised text type is neither ``"h"`` nor
            ``"v"``.
    """
    text_type = check_text_type(text_type)
    if text_type == "h":
        book_page_imgs_dir, book_page_tags_file = BOOK_PAGE_IMGS_H, BOOK_PAGE_TAGS_FILE_H
    elif text_type == "v":
        book_page_imgs_dir, book_page_tags_file = BOOK_PAGE_IMGS_V, BOOK_PAGE_TAGS_FILE_V
    else:
        # Without this branch the directory variables stayed unbound and the
        # function failed later with an unrelated UnboundLocalError.
        raise ValueError("text_type should be horizontal or vertical")
    check_or_makedirs(book_page_imgs_dir)

    _shape = page_shape
    with open(book_page_tags_file, "w", encoding="utf-8") as fw:
        for i in range(obj_num):
            if page_shape is None:
                # The two sampling ranges swap places between orientations.
                if text_type == "h":
                    _shape = (random.randint(480, 720), random.randint(640, 960))
                else:
                    _shape = (random.randint(640, 960), random.randint(480, 720))

            PIL_page, text_bbox_list, split_pos_list = create_book_page(
                _shape, text_type=text_type)
            image_tags = {
                "text_bbox_list": text_bbox_list,
                "split_pos_list": split_pos_list,
            }

            img_name = "book_page_%d.jpg" % i
            save_path = os.path.join(book_page_imgs_dir, img_name)
            PIL_page.save(save_path, format="jpeg")
            fw.write(img_name + "\t" + json.dumps(image_tags) + "\n")

            if i % 50 == 0:
                print("Process bar: %.2f%%" % (i * 100 / obj_num))
                sys.stdout.flush()
def generate_one_text_line_imgs(obj_num=100, text_type="horizontal", text_shape=None):
    """Render single text-line images and record their annotations.

    ``obj_num`` images are produced with ``create_one_text_line`` and stored
    as ``text_line_<i>.jpg`` in the directory matching the orientation. The
    tags file (rewritten on every call) receives one line per image: image
    name, a tab, then a JSON object holding ``char_and_box_list`` and
    ``split_pos_list``.

    Args:
        obj_num: How many images to create.
        text_type: Orientation; normalised by ``check_text_type``.
        text_shape: Shape used for every image, or ``None`` to draw a random
            shape per image.

    Raises:
        ValueError: If the normalised orientation is not ``"h"`` or ``"v"``.
    """
    text_type = check_text_type(text_type)
    if text_type not in ("h", "v"):
        raise ValueError
    horizontal = text_type == "h"

    if horizontal:
        out_dir = ONE_TEXT_LINE_IMGS_H
        tags_path = ONE_TEXT_LINE_TAGS_FILE_H
    else:
        out_dir = ONE_TEXT_LINE_IMGS_V
        tags_path = ONE_TEXT_LINE_TAGS_FILE_V
    check_or_makedirs(out_dir)

    def _next_shape():
        # A caller-supplied shape always wins; otherwise sample a new one.
        if text_shape is not None:
            return text_shape
        if horizontal:
            return (random.randint(38, 72), random.randint(540, 1280))
        return (random.randint(540, 1280), random.randint(38, 72))

    with open(tags_path, "w", encoding="utf-8") as tags_out:
        for idx in range(obj_num):
            line_img, char_and_box_list, split_pos_list = create_one_text_line(
                _next_shape(), text_type=text_type)
            tags = {
                "char_and_box_list": char_and_box_list,
                "split_pos_list": split_pos_list,
            }

            file_name = "text_line_%d.jpg" % idx
            line_img.save(os.path.join(out_dir, file_name), format="jpeg")
            tags_out.write(file_name + "\t" + json.dumps(tags) + "\n")

            # Report progress on every 50th image only.
            if idx % 50 != 0:
                continue
            print("Process bar: %.2f%%" % (idx * 100 / obj_num))
            sys.stdout.flush()
def generate_two_text_line_imgs(obj_num=100, text_type="horizontal", text_shape=None):
    """Render double-line text images together with their split positions.

    Images come from ``create_two_text_line`` and are written as
    ``text_line_<i>.jpg``. The tags file is rewritten and gets one line per
    image: image name, a tab, and a JSON object with ``split_pos_list``.

    Args:
        obj_num: How many images to create.
        text_type: Orientation; normalised by ``check_text_type``.
        text_shape: Shape used for every image, or ``None`` to sample a
            random shape per image.

    Raises:
        ValueError: If the normalised orientation is not ``"h"`` or ``"v"``.
    """
    text_type = check_text_type(text_type)
    if text_type == "h":
        out_dir, tags_path = TWO_TEXT_LINE_IMGS_H, TWO_TEXT_LINE_TAGS_FILE_H
    elif text_type == "v":
        out_dir, tags_path = TWO_TEXT_LINE_IMGS_V, TWO_TEXT_LINE_TAGS_FILE_V
    else:
        raise ValueError
    check_or_makedirs(out_dir)

    sample_shape = text_shape is None
    shape = text_shape
    with open(tags_path, "w", encoding="utf-8") as tags_out:
        for idx in range(obj_num):
            # Double-line samples do not need to be very long.
            if sample_shape and text_type == "h":
                shape = (random.randint(64, 108), random.randint(108, 1024))
            elif sample_shape:
                shape = (random.randint(108, 1024), random.randint(64, 108))

            # Training double-line segmentation needs both double-line
            # samples and single-line samples (the "no split" case).
            line_img, split_pos_list = create_two_text_line(shape, text_type=text_type)
            tags = {"split_pos_list": split_pos_list}

            file_name = "text_line_%d.jpg" % idx
            target = os.path.join(out_dir, file_name)
            line_img.save(target, format="jpeg")
            tags_out.write(file_name + "\t" + json.dumps(tags) + "\n")

            if idx % 50 != 0:
                continue
            print("Process bar: %.2f%%" % (idx * 100 / obj_num))
            sys.stdout.flush()
def detect_image(self, img_path, dest_dir, background="white"):
    """Run detection on one image file and save the annotated result.

    The image is loaded as 8-bit grayscale, padded at the bottom/right so
    that both sides become multiples of 32, and passed to
    ``self.predict_model.predict`` as a float32 batch of one image with a
    trailing channel axis. The returned boxes are drawn with ``draw_boxes``
    and the result is saved as JPEG into ``dest_dir`` under the original
    file name.

    Args:
        img_path: Path of the image to process. If it does not exist the
            method returns without doing anything.
        dest_dir: Output directory; created when missing.
        background: Padding colour, ``"white"`` (255) or ``"black"`` (0).

    Raises:
        ValueError: If ``background`` is not one of the supported values.
    """
    # Validate first. The original built the ValueError without raising it,
    # so an unknown background silently padded with uninitialised memory.
    if background == "white":
        pad_value = 255
    elif background == "black":
        pad_value = 0
    else:
        raise ValueError("Optional image background: 'white', 'black'.")

    if not os.path.exists(img_path):
        return
    img_name = os.path.basename(img_path)
    check_or_makedirs(dest_dir)

    PIL_img = Image.open(img_path)
    if PIL_img.mode != "L":
        PIL_img = PIL_img.convert("L")
    np_img = np.array(PIL_img, dtype=np.uint8)

    # Round each side up to the next multiple of 32.
    h, w = np_img.shape[:2]
    new_h = -h % 32 + h
    new_w = -w % 32 + w
    batch_imgs = np.full(shape=(1, new_h, new_w), fill_value=pad_value, dtype=np.float32)
    batch_imgs[0, :h, :w] = np_img
    batch_imgs = np.expand_dims(batch_imgs, axis=-1)

    start = timer()  # start time
    out_boxes, out_scores, out_classes = self.predict_model.predict(x=batch_imgs)
    print('Time {:.2f}s, found {} boxes in {}'.format(
        timer() - start, len(out_boxes), img_name))

    np_img_rgb = draw_boxes(np_img, out_boxes, out_scores, out_classes)
    PIL_img = Image.fromarray(np_img_rgb)
    PIL_img.save(os.path.join(dest_dir, img_name), format="jpeg")
def generate_mix_text_line_tfrecords(obj_num=100, text_type="horizontal", text_shape=None):
    """Generate mixed text-line samples straight into TFRecord shards.

    Images from ``create_mix_text_line`` are serialised as
    ``tf.train.Example`` records and spread at random over 20 shard files
    named ``text_lines_<k>.tfrecords``. Writing records directly avoids
    saving images to disk and reading them back. Each record stores the raw
    image bytes (``bytes_image``), ``img_height``, ``img_width`` and the
    split positions as int32 bytes (``split_positions``).

    Args:
        obj_num: Number of samples to generate.
        text_type: Orientation; normalised by ``check_text_type``.
        text_shape: Shape used for every sample, or ``None`` to sample a
            random shape per image.

    Raises:
        ValueError: If the normalised orientation is not ``"h"`` or ``"v"``.
    """
    text_type = check_text_type(text_type)
    if text_type == "h":
        text_line_tfrecords_dir = MIX_TEXT_LINE_TFRECORDS_H
    elif text_type == "v":
        text_line_tfrecords_dir = MIX_TEXT_LINE_TFRECORDS_V
    else:
        # Previously fell through and failed with UnboundLocalError.
        raise ValueError("text_type should be horizontal or vertical")
    check_or_makedirs(text_line_tfrecords_dir)

    writers_list = [
        tf.io.TFRecordWriter(
            os.path.join(text_line_tfrecords_dir, "text_lines_%d.tfrecords" % i))
        for i in range(20)
    ]

    try:
        _shape = text_shape
        for i in range(obj_num):
            writer = random.choice(writers_list)
            if text_shape is None:
                if text_type == "h":
                    _shape = (random.randint(54, 108), random.randint(720, 1280))
                else:
                    _shape = (random.randint(720, 1280), random.randint(54, 108))

            PIL_text, _, split_pos_list = create_mix_text_line(_shape, text_type=text_type)

            bytes_image = PIL_text.tobytes()  # raw image bytes
            split_positions = np.array(split_pos_list, dtype=np.int32).tobytes()

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'bytes_image': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[bytes_image])),
                    'img_height': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[PIL_text.height])),
                    'img_width': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[PIL_text.width])),
                    'split_positions': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[split_positions])),
                }))
            writer.write(example.SerializeToString())

            if i % 50 == 0:
                print("Process bar: %.2f%%" % (i * 100 / obj_num))
                sys.stdout.flush()
    finally:
        # Close every shard even when generation fails part-way.
        for writer in writers_list:
            writer.close()
def main(img_path, dest_dir, segment_task="book_page", text_type="horizontal",
         model_struc="densenet_gru", weights_path=""):
    """Segment every image under ``img_path`` and save the drawn results.

    A prediction network is built with ``work_net`` and its weights are
    loaded from ``weights_path``. Each image yielded by ``load_images`` is
    adjusted to the fixed shape of the task, run through the model, and the
    predicted split positions are mapped back to the original image scale
    and drawn with ``visualize.draw_split_lines``. Results are written as
    ``<image stem>.jpg`` into ``dest_dir``.

    Args:
        img_path: Image source passed to ``load_images``.
        dest_dir: Output directory; created when missing.
        segment_task: Segmentation task name.
        text_type: Text orientation. Only its first letter (lower-cased) is
            used to decide whether predictions must be rotated back.
        model_struc: Model structure name; also part of the default weights
            file name.
        weights_path: Weights file. When it does not exist, the file
            ``<model_struc>_ctpn_finished.h5`` in the task's checkpoint
            directory is used instead.

    Raises:
        AssertionError: If no weights file can be found.
    """
    check_or_makedirs(dest_dir)
    K.set_learning_phase(False)

    _, fixed_shape, feat_stride = get_segment_task_params(segment_task)
    _, ckpt_dir, _ = get_segment_task_path(segment_task)
    if not os.path.exists(weights_path):
        # Fall back to the default finished checkpoint of this task.
        weights_path = os.path.join(ckpt_dir, model_struc + "_ctpn_finished.h5")
    assert os.path.exists(weights_path)

    # Load the model.
    segment_model = work_net(stage="predict", segment_task=segment_task,
                             text_type=text_type, model_struc=model_struc)
    segment_model.load_weights(weights_path, by_name=True)
    print("\nLoad model weights from %s\n" % weights_path)

    # Computed once under its own name. The original overwrote ``text_type``
    # inside the loop, so every image after the first was adjusted with the
    # one-letter form instead of the caller's value.
    short_type = text_type[0].lower()
    needs_restore = (segment_task, short_type) in (
        ("book_page", "h"), ("double_line", "h"),
        ("text_line", "v"), ("mix_line", "v"))

    for count, (raw_np_img, img_name) in enumerate(load_images(img_path), start=1):
        np_img, _, scale_ratio = adjust_img_to_fixed_shape(
            raw_np_img, fixed_shape=fixed_shape, feat_stride=feat_stride,
            segment_task=segment_task, text_type=text_type)
        batch_images = np_img[np.newaxis, :, :, :]

        # Model prediction.
        split_positions, scores = segment_model.predict(x=batch_images)
        if needs_restore:
            _, split_positions = restore_original_angle(
                np_img=None, pred_split_positions=split_positions)
        # Undo the resize so positions refer to the original image.
        split_positions = split_positions / scale_ratio

        # Visualisation.
        image = visualize.draw_split_lines(raw_np_img, split_positions, scores)
        dest_path = os.path.join(dest_dir, os.path.splitext(img_name)[0] + ".jpg")
        Image.fromarray(image).save(dest_path, format="jpeg")
        print(count, "Finished: " + dest_path)
def main(img_path, dest_dir, text_type="vertical", weights_path=TRAIN_FINISHED_WEIGHTS):
    """Detect text lines with the CTPN model and save annotated images.

    For every image yielded by ``load_images`` the picture is adapted with
    ``adjust_img_into_model``, predicted by the network, and the resulting
    boxes are merged into text lines by ``TextDetector``. The drawn image is
    stored as ``<image stem>.jpg`` in ``dest_dir``; for vertical text it is
    first passed through ``restore_text_horizontal_to_vertical``.

    Args:
        img_path: Image source passed to ``load_images``.
        dest_dir: Output directory; created when missing.
        text_type: Text orientation. It must occur as a substring of
            ``weights_path``.
        weights_path: Path of the trained weights file.

    Raises:
        AssertionError: If the weights file is missing or its path does not
            contain ``text_type``.
    """
    check_or_makedirs(dest_dir)
    K.set_learning_phase(False)
    assert os.path.exists(weights_path) and text_type in weights_path

    # Build the prediction network and restore its weights.
    ctpn_model = work_net("predict", batch_size=1, text_type=text_type,
                          model_struc="densenet_gru")
    ctpn_model.load_weights(weights_path, by_name=True)
    print("\nLoad model weights from %s\n" % weights_path)

    is_vertical = text_type.lower() in ("v", "vertical")

    for count, (loaded_img, img_name) in enumerate(load_images(img_path), start=1):
        model_input = adjust_img_into_model(
            loaded_img, text_type=text_type, fixed_size=BOOK_PAGE_FIXED_SIZE)

        # Model prediction on a batch of one.
        boxes, scores = ctpn_model.predict(x=model_input[np.newaxis, :, :, :])
        boxes = np_utils.remove_pad(boxes[0])
        scores = np_utils.remove_pad(scores[0])[:, 0]

        # Text-line detector.
        detector = TextDetector(DETECT_MODE='H')
        text_lines = detector.detect(boxes, scores, model_input.shape[:2])

        # Visualisation.
        drawn = visualize.draw_text_lines(model_input, text_lines)
        if is_vertical:
            drawn = restore_text_horizontal_to_vertical(drawn)

        dest_path = os.path.join(dest_dir, os.path.splitext(img_name)[0] + ".jpg")
        Image.fromarray(drawn).save(dest_path, format="jpeg")
        print(count, "Finished: " + dest_path)
def check_tags(tags_file, segment_task, text_type):
    """Draw the annotated split lines of a tags file for visual inspection.

    Every line of ``tags_file`` is decoded with ``get_image_and_split_pos``,
    the split lines are drawn onto the image and the result is saved as
    ``<line index>.jpg`` in the ``samples`` folder under
    ``SEGMENT_BOOK_PAGE_ROOT_DIR``.

    Args:
        tags_file: Path of the annotation file (UTF-8 text, one sample per
            line).
        segment_task: Segmentation task name; together with the orientation
            it decides whether the sample is rotated by 90 degrees first.
        text_type: Text orientation; only its first letter (lower-cased) is
            used.
    """
    with open(tags_file, "r", encoding="utf8") as tags_in:
        annotations = [raw.strip() for raw in tags_in]

    samples_dir = os.path.join(SEGMENT_BOOK_PAGE_ROOT_DIR, "samples")
    check_or_makedirs(samples_dir)

    rotated_cases = (("book_page", "h"), ("double_line", "h"),
                     ("text_line", "v"), ("mix_line", "v"))

    for idx, annotation in enumerate(annotations):
        # NOTE(review): the task is fixed to "book_page" here regardless of
        # ``segment_task`` -- confirm whether that is intended.
        np_img, split_pos = get_image_and_split_pos(annotation, segment_task="book_page")

        text_type = text_type[0].lower()
        if (segment_task, text_type) in rotated_cases:
            np_img, split_pos = rotate_90_degrees(np_img, split_pos)

        drawn = draw_split_lines(np_img, split_pos)
        target = os.path.join(samples_dir, str(idx) + ".jpg")
        Image.fromarray(drawn).save(target)
def get_callbacks(model_struc="densenet_gru", text_type="horizontal"):
    """Assemble the Keras callbacks used for CTPN training.

    Creates the checkpoint and log directories when needed.

    Args:
        model_struc: Model structure name, used in the checkpoint file name.
        text_type: Text orientation, used in the checkpoint file name.

    Returns:
        A list ``[ModelCheckpoint, ReduceLROnPlateau, TensorBoard]``.
    """
    check_or_makedirs(dir_name=CTPN_CKPT_DIR)

    # Weights-only checkpoints, written when ``val_loss`` improves.
    weights_name = model_struc + "_" + text_type + "_ctpn_{epoch:04d}.h5"
    checkpoint_cb = ModelCheckpoint(
        filepath=os.path.join(CTPN_CKPT_DIR, weights_name),
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
    )

    # Learning-rate schedule driven by the training loss.
    plateau_cb = ReduceLROnPlateau(
        monitor="loss",
        factor=0.1,
        cooldown=0,
        patience=10,
        min_lr=1e-4,
    )

    check_or_makedirs(CTPN_LOGS_DIR)
    tensorboard_cb = TensorBoard(log_dir=CTPN_LOGS_DIR)

    return [checkpoint_cb, plateau_cb, tensorboard_cb]
def train(num_epochs, start_epoch=0, model_type="horizontal", model_struc="resnet_lstm"):
    """Train the CRNN model, optionally resuming from an earlier epoch.

    The training model is compiled with Adagrad and a pass-through loss for
    the ``ctc_loss`` output (the model output already is the loss value).
    Checkpoints with the best ``val_loss`` are written to ``CRNN_CKPT_DIR``.

    Args:
        num_epochs: Number of epochs to run in this call.
        start_epoch: Epoch to resume from. When greater than 0, weights are
            loaded from the checkpoint prefix of that epoch first.
        model_type: Text orientation of the model; also selects the data
            generators and is part of the checkpoint name.
        model_struc: Model structure name; part of the checkpoint name.
    """
    backend.set_learning_phase(True)

    crnn = CRNN(model_type=model_type, model_struc=model_struc)
    model = crnn.model_for_training()
    model.compile(
        optimizer=optimizers.Adagrad(learning_rate=0.01),
        loss={"ctc_loss": lambda y_true, out_loss: out_loss},
    )

    name_prefix = model_type + "_" + model_struc

    if start_epoch > 0:
        resume_from = os.path.join(
            CRNN_CKPT_DIR, name_prefix + "_crnn_weights_%05d_" % start_epoch)
        model.load_weights(filepath=resume_from)

    check_or_makedirs(CRNN_CKPT_DIR)
    ckpt_template = os.path.join(
        CRNN_CKPT_DIR,
        name_prefix + "_crnn_weights_{epoch:05d}_{val_loss:.2f}.tf")
    checkpoint_cb = callbacks.ModelCheckpoint(
        filepath=ckpt_template,
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode="min",
    )

    train_batches = create_text_lines_batch(type=model_type,
                                            batch_size=BATCH_SIZE_TEXT_LINE)
    val_batches = load_text_lines_batch(type=model_type,
                                        batch_size=BATCH_SIZE_TEXT_LINE)

    model.fit_generator(
        generator=train_batches,
        steps_per_epoch=100,
        epochs=start_epoch + num_epochs,
        verbose=1,
        callbacks=[checkpoint_cb],
        validation_data=val_batches,
        validation_steps=50,
        max_queue_size=50,
        workers=2,
        use_multiprocessing=True,
        initial_epoch=start_epoch,
    )
def convert_annotation(img_sources=None, tfrecords_dir=None, dest_file=None):
    """Write a combined annotation file from tag files or TFRecord shards.

    Exactly one of ``img_sources`` and ``tfrecords_dir`` must be given.

    * With ``img_sources``, every ``<img_name>\\t<tags>`` line of each source
      file is rewritten as ``<root_dir>/<img_name>\\t<tags>``.
    * With ``tfrecords_dir``, the path of every ``*.tfrecords`` file in that
      directory is written, one per line.

    Args:
        img_sources: Iterable of ``(src_file, root_dir)`` pairs.
        tfrecords_dir: Directory containing ``.tfrecords`` files.
        dest_file: Output file; its parent directory is created when
            missing and an existing file is overwritten.

    Raises:
        AssertionError: If not exactly one source is given, or if
            ``tfrecords_dir`` does not exist.
        ValueError: If a non-blank source line is not ``name<TAB>tags``.
    """
    assert [img_sources, tfrecords_dir].count(None) == 1

    check_or_makedirs(os.path.dirname(dest_file))

    with open(dest_file, "w", encoding="utf-8") as fw:
        if img_sources is not None:
            for src_file, root_dir in img_sources:
                with open(src_file, "r", encoding="utf-8") as fr:
                    for line in fr:
                        line = line.strip()
                        if not line:
                            # Blank lines (e.g. a trailing newline) used to
                            # crash the unpacking below; skip them.
                            continue
                        img_name, tags_str = line.split("\t")
                        img_path = os.path.join(root_dir, img_name)
                        fw.write(img_path + "\t" + tags_str + "\n")
        elif tfrecords_dir is not None:
            assert os.path.exists(tfrecords_dir)
            for file in os.listdir(tfrecords_dir):
                if file.endswith(".tfrecords"):
                    file_path = os.path.join(tfrecords_dir, file)
                    fw.write(file_path + "\n")
def generate_book_page_imgs_with_img(obj_num=10, text_type="horizontal", init_num=0, page_shape=None):
    """Generate book-page images via ``create_book_page_with_img``.

    Images are numbered from ``init_num`` and saved as
    ``book_page_<i>.jpg``. The tags file is rewritten and receives one line
    per image: image name, a tab, and a JSON object with ``text_bbox_list``
    and ``split_pos_list``.

    Args:
        obj_num: Number of pages to generate.
        text_type: Orientation; normalised by ``check_text_type``.
        init_num: Index of the first generated image.
        page_shape: Shape used for every page, or ``None`` to pick a random
            entry of ``BOOK_PAGE_SHAPE_LIST`` per page.

    Raises:
        ValueError: If the normalised orientation is not ``"h"`` or ``"v"``.
    """
    text_type = check_text_type(text_type)
    if text_type not in ("h", "v"):
        raise ValueError('text_type should be horizontal or vertical')

    if text_type == "h":
        pages_dir = BOOK_PAGE_IMGS_H
        tags_path = BOOK_PAGE_TAGS_FILE_H
    else:
        pages_dir = BOOK_PAGE_IMGS_V
        tags_path = BOOK_PAGE_TAGS_FILE_V
    check_or_makedirs(pages_dir)

    shape = page_shape
    last_index = init_num + obj_num
    with open(tags_path, "w", encoding="utf-8") as tags_out:
        for idx in range(init_num, last_index):
            # Shapes are drawn from the predefined list rather than sampled
            # from numeric ranges.
            if page_shape is None:
                shape = random.choice(BOOK_PAGE_SHAPE_LIST)

            page_img, text_bbox_list, split_pos_list = create_book_page_with_img(
                shape, text_type=text_type)
            tags = {
                "text_bbox_list": text_bbox_list,
                "split_pos_list": split_pos_list,
            }

            file_name = "book_page_%d.jpg" % idx
            page_img.save(os.path.join(pages_dir, file_name), format="jpeg")
            tags_out.write(file_name + "\t" + json.dumps(tags) + "\n")

            if idx % 50 != 0:
                continue
            print(" %d / %d Done" % (idx, obj_num))
            sys.stdout.flush()
def extract_annotation(imgs_dir=None, tfrecords_dir=None, dest_file=None):
    """List image files or TFRecord shards into an annotation file.

    Exactly one of ``imgs_dir`` and ``tfrecords_dir`` must be given.

    * With ``imgs_dir``, the directory tree is walked and the path of every
      file ending in ``.gif``, ``.jpg`` or ``.png`` (case-insensitive) is
      written, one per line.
    * With ``tfrecords_dir``, the path of every ``*.tfrecords`` file directly
      inside that directory is written, one per line.

    Args:
        imgs_dir: Root directory of the images.
        tfrecords_dir: Directory containing ``.tfrecords`` files.
        dest_file: Output file; its parent directory is created when
            missing and an existing file is overwritten.

    Raises:
        AssertionError: If not exactly one source is given, or if
            ``tfrecords_dir`` does not exist.
    """
    assert sum(arg is None for arg in (imgs_dir, tfrecords_dir)) == 1

    check_or_makedirs(os.path.dirname(dest_file))

    image_suffixes = (".gif", ".jpg", ".png")

    with open(dest_file, "w", encoding="utf-8") as out:
        if imgs_dir is not None:
            for folder, _subdirs, names in os.walk(imgs_dir):
                for name in names:
                    if name.lower().endswith(image_suffixes):
                        out.write(os.path.join(folder, name) + "\n")
            return

        if tfrecords_dir is None:
            return

        assert os.path.exists(tfrecords_dir)
        out.writelines(
            os.path.join(tfrecords_dir, entry) + "\n"
            for entry in os.listdir(tfrecords_dir)
            if entry.endswith(".tfrecords")
        )
def get_callbacks(model_struc="densenet_gru"):
    """Assemble the Keras callbacks used for character-recognition training.

    Creates the checkpoint and log directories when needed.

    Args:
        model_struc: Model structure name, used in the checkpoint file name.

    Returns:
        A list ``[ModelCheckpoint, ReduceLROnPlateau, TensorBoard]``.
    """
    check_or_makedirs(dir_name=CHAR_RECOG_CKPT_DIR)

    # Weights-only checkpoints, written when ``val_loss`` improves.
    weights_name = "char_recog_with_compo_" + model_struc + "_{epoch:04d}.h5"
    checkpoint_cb = ModelCheckpoint(
        filepath=os.path.join(CHAR_RECOG_CKPT_DIR, weights_name),
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
    )

    # Learning-rate schedule driven by the training loss; ``patience`` is
    # counted in epochs.
    plateau_cb = ReduceLROnPlateau(
        monitor="loss",
        factor=0.5,
        patience=10,
        cooldown=0,
        min_lr=0,
    )

    check_or_makedirs(CHAR_RECOG_LOGS_DIR)
    tensorboard_cb = TensorBoard(log_dir=CHAR_RECOG_LOGS_DIR)

    return [checkpoint_cb, plateau_cb, tensorboard_cb]
def get_callbacks(segment_task, model_struc="densenet_gru"):
    """Assemble the Keras callbacks used for segmentation training.

    The checkpoint and log directories come from
    ``get_segment_task_path(segment_task)`` and are created when needed.

    Args:
        segment_task: Segmentation task name; part of the checkpoint name.
        model_struc: Model structure name; part of the checkpoint name.

    Returns:
        A list ``[ModelCheckpoint, ReduceLROnPlateau, TensorBoard]``.
    """
    _, ckpt_dir, logs_dir = get_segment_task_path(segment_task)
    check_or_makedirs(dir_name=ckpt_dir)

    # Weights-only checkpoints, written when ``val_loss`` improves.
    weights_name = segment_task + "_segment_" + model_struc + "_{epoch:04d}.h5"
    checkpoint_cb = ModelCheckpoint(
        filepath=os.path.join(ckpt_dir, weights_name),
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
    )

    # Learning-rate schedule driven by ``val_loss``; ``patience`` is counted
    # in epochs.
    plateau_cb = ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.6,
        cooldown=0,
        patience=4,
        min_lr=0,
    )

    check_or_makedirs(logs_dir)
    tensorboard_cb = TensorBoard(log_dir=logs_dir)

    return [checkpoint_cb, plateau_cb, tensorboard_cb]
def generate_tfrecords(obj_size=CHAR_IMG_SIZE, num_imgs_per_font=NUM_IMAGES_PER_FONT):
    """Render characters from font files and store augmented images as TFRecords.

    For every font file in ``FONT_FILE_DIR`` (``.otf``, ``.ttf``, ``.ttc``,
    ``.fon``) each character image yielded by
    ``generate_all_chinese_images_bigger`` is augmented
    ``num_imgs_per_font`` times and written to one of 20 shard files named
    ``chinese_imgs_<k>_from_font.tfrecords`` in ``CHAR_TFRECORDS_DIR``.
    Records are written directly, without saving images to disk first.

    Each record stores ``bytes_image`` (raw image bytes), ``img_height``,
    ``img_width`` and ``bytes_char`` (the UTF-8 encoded character).

    Args:
        obj_size: Side length passed to ``get_augmented_image``; glyphs are
            rendered at ``int(obj_size * 1.2)``.
        num_imgs_per_font: Number of augmented images per character and font.
    """
    print("Get font_file_list ...")
    font_file_list = [
        os.path.join(FONT_FILE_DIR, font_name)
        for font_name in os.listdir(FONT_FILE_DIR)
        if font_name.lower()[-4:] in (".otf", ".ttf", ".ttc", ".fon")
    ]

    # Directory that receives the tfrecords files.
    check_or_makedirs(CHAR_TFRECORDS_DIR)

    # The augmented images of one character in one font are spread over the
    # shards at random: storing them consecutively in the same file would
    # reduce the diversity of training batches.
    writers_list = [
        tf.io.TFRecordWriter(
            os.path.join(CHAR_TFRECORDS_DIR, "chinese_imgs_%d_from_font.tfrecords" % i))
        for i in range(20)
    ]

    try:
        print("Begin to generate images ...")
        chinese_char_num = len(CHAR2ID_DICT)
        total_num = len(font_file_list) * chinese_char_num
        count = 0
        for font_file in font_file_list:  # outer loop: fonts
            for chinese_char, bigger_PIL_img in generate_all_chinese_images_bigger(
                    font_file, image_size=int(obj_size * 1.2)):  # inner loop: characters
                # Skip glyph images that are (almost) entirely black; the
                # original note describes them as white text on black.
                image_data = list(bigger_PIL_img.getdata())
                if sum(image_data) < 10:
                    continue

                # All augmentations are created before any shard is chosen,
                # which keeps the order of random draws unchanged.
                PIL_img_list = [
                    get_augmented_image(bigger_PIL_img, obj_size, rotation=True,
                                        dilate=False, erode=True, reverse_color=True)
                    for _ in range(num_imgs_per_font)
                ]

                # Store the generated character images.
                for PIL_img in PIL_img_list:
                    writer = random.choice(writers_list)

                    bytes_image = PIL_img.tobytes()  # raw image bytes
                    bytes_char = chinese_char.encode('utf-8')

                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'bytes_image': tf.train.Feature(
                                bytes_list=tf.train.BytesList(value=[bytes_image])),
                            'img_height': tf.train.Feature(
                                int64_list=tf.train.Int64List(value=[PIL_img.height])),
                            'img_width': tf.train.Feature(
                                int64_list=tf.train.Int64List(value=[PIL_img.width])),
                            'bytes_char': tf.train.Feature(
                                bytes_list=tf.train.BytesList(value=[bytes_char])),
                        }))
                    writer.write(example.SerializeToString())

                # Progress, counted per rendered character.
                count += 1
                if count % 200 == 0:
                    print("Progress bar: %.2f%%" % (count * 100 / total_num))
                    sys.stdout.flush()
    finally:
        # Close every shard even when generation fails part-way.
        for writer in writers_list:
            writer.close()
def convert_tfrecords(obj_size=CHAR_IMG_SIZE, num_imgs_per_font=NUM_IMAGES_PER_FONT):
    """Augment external character images and store them as TFRecords.

    Image paths come from ``get_external_image_paths`` for
    ``EXTERNEL_IMAGES_DIR``. The first character of each file name is used
    as the label. Every image is augmented ``num_imgs_per_font`` times and
    written to one of 20 shard files named
    ``chinese_imgs_<k>_from_img.tfrecords`` in ``CHAR_TFRECORDS_DIR``.
    Records are written directly, without saving images to disk first.

    Each record stores ``bytes_image`` (raw image bytes) and ``bytes_char``
    (the UTF-8 encoded character). Images that raise ``OSError`` while
    loading are reported and skipped.

    Args:
        obj_size: Side length passed to ``get_augmented_image``.
        num_imgs_per_font: Number of augmented images per source image.
    """
    print("Get total images num ...")
    font_images_num_list = [
        len(os.listdir(os.path.join(EXTERNEL_IMAGES_DIR, content)))
        for content in os.listdir(EXTERNEL_IMAGES_DIR)
        if os.path.isdir(os.path.join(EXTERNEL_IMAGES_DIR, content))
    ]

    # Directory that receives the tfrecords files.
    check_or_makedirs(CHAR_TFRECORDS_DIR)

    # The augmented variants of one source image are spread over the shards
    # at random: storing them consecutively in the same file would reduce
    # the diversity of every training batch.
    writers_list = [
        tf.io.TFRecordWriter(
            os.path.join(CHAR_TFRECORDS_DIR, "chinese_imgs_%d_from_img.tfrecords" % i))
        for i in range(20)
    ]

    try:
        print("Begin to convert images ...")
        total_num = sum(font_images_num_list)
        count = 0
        for font_type, image_paths_list in get_external_image_paths(
                root_dir=EXTERNEL_IMAGES_DIR):
            for image_path in image_paths_list:
                chinese_char = os.path.basename(image_path)[0]

                # Per the original notes: the loader squares the image and
                # makes it slightly bigger so rotation loses no information,
                # and later processing expects white text on a black
                # background, which reverse_color adjusts.
                try:
                    bigger_PIL_img = load_external_image_bigger(
                        image_path, white_background=True, reverse_color=True)
                except OSError:
                    print("The image %s result in OSError !" % image_path)
                    continue

                # All augmentations are created before any shard is chosen,
                # which keeps the order of random draws unchanged.
                PIL_img_list = [
                    get_augmented_image(bigger_PIL_img, obj_size, rotation=True,
                                        dilate=False, erode=True, reverse_color=True)
                    for _ in range(num_imgs_per_font)
                ]

                # Store the generated character images.
                for PIL_img in PIL_img_list:
                    writer = random.choice(writers_list)

                    bytes_image = PIL_img.tobytes()  # raw image bytes
                    bytes_char = chinese_char.encode('utf-8')

                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'bytes_image': tf.train.Feature(
                                bytes_list=tf.train.BytesList(value=[bytes_image])),
                            'bytes_char': tf.train.Feature(
                                bytes_list=tf.train.BytesList(value=[bytes_char])),
                        }))
                    writer.write(example.SerializeToString())

                # Progress, counted per source image.
                count += 1
                if count % 200 == 0:
                    print("Progress bar: %.2f%%" % (count * 100 / total_num))
                    sys.stdout.flush()
    finally:
        # Close every shard even when conversion fails part-way.
        for writer in writers_list:
            writer.close()
def generate_book_page_tfrecords(obj_num=10, text_type="horizontal", init_num=0, page_shape=None):
    """Generate synthetic book pages straight into TFRecord shards.

    Pages from ``create_book_page`` are serialised as ``tf.train.Example``
    records and spread at random over 20 shard files named
    ``book_pages_<k>.tfrecords`` with ``k`` in
    ``[init_num, init_num + 20)``. Writing records directly avoids saving
    images to disk and reading them back.

    Each record stores ``bytes_image`` (raw image bytes), ``img_height``,
    ``img_width``, and ``text_boxes`` / ``split_positions`` as int32 bytes.

    Args:
        obj_num: Number of pages to generate.
        text_type: Orientation; normalised by ``check_text_type``.
        init_num: Index of the first shard file.
        page_shape: Shape used for every page, or ``None`` to sample a
            random shape per page.

    Raises:
        ValueError: If the normalised orientation is not ``"h"`` or ``"v"``.
    """
    text_type = check_text_type(text_type)
    if text_type == "h":
        book_page_tfrecords_dir = BOOK_PAGE_TFRECORDS_H
    elif text_type == "v":
        book_page_tfrecords_dir = BOOK_PAGE_TFRECORDS_V
    else:
        # Previously fell through and failed with UnboundLocalError.
        raise ValueError("text_type should be horizontal or vertical")
    check_or_makedirs(book_page_tfrecords_dir)

    writers_list = [
        tf.io.TFRecordWriter(
            os.path.join(book_page_tfrecords_dir, "book_pages_%d.tfrecords" % i))
        for i in range(init_num, init_num + 20)
    ]

    try:
        _shape = page_shape
        for i in range(obj_num):
            writer = random.choice(writers_list)
            if page_shape is None:
                if text_type == "h":
                    _shape = (random.randint(480, 720), random.randint(640, 960))
                else:
                    _shape = (random.randint(640, 960), random.randint(480, 720))

            PIL_page, text_bbox_list, split_pos_list = create_book_page(
                _shape, text_type=text_type)

            bytes_image = PIL_page.tobytes()  # raw image bytes
            text_boxes = np.array(text_bbox_list, dtype=np.int32).tobytes()
            split_positions = np.array(split_pos_list, dtype=np.int32).tobytes()

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'bytes_image': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[bytes_image])),
                    'img_height': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[PIL_page.height])),
                    'img_width': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[PIL_page.width])),
                    'text_boxes': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[text_boxes])),
                    'split_positions': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[split_positions])),
                }))
            writer.write(example.SerializeToString())

            if i % 50 == 0:
                print("Process bar: %.2f%%" % (i * 100 / obj_num))
                sys.stdout.flush()
    finally:
        # Close every shard even when generation fails part-way.
        for writer in writers_list:
            writer.close()
def main(book_page_dir, dest_dir=None, is_mix_line=False, text_type="vertical",
         model_struc="densenet_gru"):
    """Segment book pages down to characters and recognise the text.

    Pages are first cut into lines by the ``book_page`` segmentation model.
    Two modes follow:

    * ``is_mix_line=False``: each line is cut into characters by the
      ``text_line`` model and the characters are recognised. The first
      candidate of every prediction (``"?"`` when there is none) forms the
      text of the line.
    * ``is_mix_line=True`` (vertical text only): lines are further cut with
      the ``mix_line``, ``double_line`` and ``text_line`` models. Character
      recognition is currently disabled in this mode, so only split lines
      are produced and the text output stays empty.

    When ``dest_dir`` is given, a JPEG with the drawn split lines and a
    ``.txt`` file with the recognised text are written per page. The text is
    also printed. A page whose processing fails is reported and skipped.

    Args:
        book_page_dir: Image source passed to the book-page model.
        dest_dir: Output directory, or ``None`` to only print results.
        is_mix_line: Whether pages contain mixed single/double lines.
        text_type: ``"v"``/``"vertical"`` or ``"h"``/``"horizontal"``.
        model_struc: Model structure name for all models.

    Raises:
        ValueError: If ``text_type`` is not recognised, or if
            ``is_mix_line`` is requested for horizontal text.
    """
    # Validate up front. The original only *constructed* a ValueError deep
    # inside the page loop (never raised it) and carried on silently.
    is_vertical = text_type in ("v", "vertical")
    is_horizontal = text_type in ("h", "horizontal")
    if not (is_vertical or is_horizontal):
        raise ValueError("text_type should be horizontal or vertical")
    if is_mix_line and not is_vertical:
        raise ValueError(
            "Horizontal book page should not exist single-double text line.")

    if dest_dir is not None:
        check_or_makedirs(dest_dir)

    K.set_learning_phase(False)

    # Load the models.
    segment_book_page_model = SegmentModel("book_page", text_type, model_struc, weights=99)
    segment_mix_line_model = SegmentModel("mix_line", text_type, model_struc, weights=65)
    segment_double_line_model = SegmentModel("double_line", text_type, model_struc, weights=72)
    segment_text_line_model = SegmentModel("text_line", text_type, model_struc, weights=42)
    recog_model = CharRecogModel(model_struc, weights=121)

    # Split the book pages into lines.
    np_page_list, page_name_list, page_split_pos_list, _ = \
        segment_book_page_model.segment_predict(img_paths=book_page_dir)

    for i, np_page in enumerate(np_page_list):
        try:
            split_line_dict = {"page": [], "mix": [], "double": [], "single": []}

            start_coord = np.array([0, 0], dtype=np.int32)
            np_line_list, page_split_lines = extract_slices(
                np_page, page_split_pos_list[i], start_coord,
                segment_task="book_page", text_type=text_type)
            split_line_dict["page"].append(page_split_lines)

            text_list = []
            if is_mix_line:
                np_mix_line_list = np_line_list

                # Split mixed single/double lines.
                _, _, mix_split_pos_list, _ = segment_mix_line_model.segment_predict(
                    images=np_mix_line_list)

                for j, np_mix_line in enumerate(np_mix_line_list):
                    np_double_line_list, mix_split_lines = extract_slices(
                        np_mix_line, mix_split_pos_list[j],
                        page_split_lines[j + 1, :2],
                        segment_task="mix_line", text_type=text_type)
                    split_line_dict["mix"].append(mix_split_lines)

                    # Split double lines.
                    _, _, double_split_pos_list, double_scores_list = \
                        segment_double_line_model.segment_predict(images=np_double_line_list)
                    img_w = np_mix_line.shape[1]
                    double_split_pos_list, _ = check_and_correct_double_split(
                        double_split_pos_list, double_scores_list, img_w)

                    for k, np_double_line in enumerate(np_double_line_list):
                        np_text_line_list, double_split_lines = extract_slices(
                            np_double_line, double_split_pos_list[k],
                            mix_split_lines[k, :2],
                            segment_task="double_line", text_type=text_type)
                        split_line_dict["double"].append(double_split_lines)

                        # Split single text lines into characters.
                        _, _, char_split_pos_list, _ = \
                            segment_text_line_model.segment_predict(images=np_text_line_list)
                        assert len(np_text_line_list) in (1, 2)

                        for t, np_text_line in enumerate(np_text_line_list):
                            _, single_split_lines = extract_slices(
                                np_text_line, char_split_pos_list[t],
                                double_split_lines[t + 1, :2],
                                segment_task="text_line", text_type=text_type)
                            split_line_dict["single"].append(single_split_lines)

                # NOTE: character recognition is disabled for mixed lines.
                # The disabled flow was: recognise each character slice with
                # recog_model.char_predict, join the first candidates into
                # sub_text1 (t == 0) or sub_text2, pad both with spaces to
                # equal length and append them to text1/text2 for every
                # double-line slice, then
                # text_list.extend([text1, text2, "\n"]) per mixed line.
            else:
                np_text_line_list = np_line_list

                # Split single text lines into characters.
                _, _, char_split_pos_list, _ = \
                    segment_text_line_model.segment_predict(images=np_text_line_list)

                for t, np_text_line in enumerate(np_text_line_list):
                    # The row of page_split_lines holding the start
                    # coordinate is offset by one for vertical text.
                    _t = t + 1 if is_vertical else t
                    np_char_list, single_split_lines = extract_slices(
                        np_text_line, char_split_pos_list[t],
                        page_split_lines[_t, :2],
                        segment_task="text_line", text_type=text_type)
                    split_line_dict["single"].append(single_split_lines)

                    # Single-character recognition.
                    _, _, pred_topk_chars_list = recog_model.char_predict(
                        images=np_char_list)

                    # Recognition result: first candidate, "?" if none.
                    text_str = "".join(
                        chars[0] if len(chars) > 0 else "?"
                        for chars in pred_topk_chars_list)
                    text_list.extend([text_str, "\n"])

            # Save.
            if dest_dir is not None:
                PIL_page_drawn = draw_split_lines(
                    np_page=np_page, split_line_dict=split_line_dict)
                page_name = os.path.splitext(page_name_list[i])[0]
                PIL_page_drawn.save(os.path.join(dest_dir, page_name + ".jpg"),
                                    format="jpeg")
                with open(os.path.join(dest_dir, page_name + ".txt"), "w",
                          encoding="utf8") as fw:
                    fw.write("\n".join(text_list))

            # Print.
            print("\n*******************", page_name_list[i],
                  "*******************\n")
            print("\n".join(text_list))
        except Exception as err:
            # Best effort per page, but no longer silent; KeyboardInterrupt
            # and SystemExit now propagate instead of being swallowed.
            print("Failed to process page %d: %r" % (i, err))
            continue
def segment_predict(images=None, img_paths=None, dest_dir=None, segment_model=None,
                    segment_task="book_page", text_type="horizontal",
                    model_struc="densenet_gru", weights=""):
    """Predict split positions for a set of images, optionally drawing them.

    Images are taken from ``images`` (converted with ``convert_images`` and
    named ``0.jpg``, ``1.jpg``, ...) or loaded from ``img_paths``. For the
    ``book_page`` task they are pre-processed first. When no model is
    passed, one is built with ``work_net`` and its weights are loaded.
    Prediction runs in batches; the padded outputs are trimmed, divided by
    the per-image scale ratio and, for certain task/orientation pairs,
    mapped back with ``restore_original_angle``.

    Args:
        images: In-memory images, or ``None`` to use ``img_paths``.
        img_paths: Image source for ``load_images``; required when
            ``images`` is ``None``.
        dest_dir: When given, drawn results are saved there as JPEG.
        segment_model: Ready-to-use model, or ``None`` to build one.
        segment_task: Segmentation task name.
        text_type: Text orientation; after the model is built only its
            first letter (lower-cased) is used.
        model_struc: Model structure name used when building the model.
        weights: Weights selector passed to ``model_weights_path``.

    Returns:
        Tuple ``(np_img_list, img_name_list, split_positions_list,
        scores_list)``.
    """
    # Input images.
    if images is None:
        assert img_paths is not None
        np_img_list, img_name_list = load_images(img_paths)
    else:
        np_img_list = convert_images(images)
        img_name_list = [str(idx) + ".jpg" for idx in range(len(np_img_list))]

    # Book-page pre-processing.
    if segment_task == "book_page":
        np_img_list = book_page_pre_processing(np_img_list)

    # Model.
    if segment_model is None:
        K.set_learning_phase(False)
        weights_path = model_weights_path(weights, segment_task, model_struc)
        segment_model = work_net(stage="predict", segment_task=segment_task,
                                 text_type=text_type, model_struc=model_struc)
        segment_model.load_weights(weights_path, by_name=True)
        print("\nLoad model weights from %s\n" % weights_path)

    # Prediction.
    batch_size, fixed_h, feat_stride = get_segment_task_params(segment_task)
    text_type = text_type[0].lower()
    rotated = (segment_task, text_type) in (
        ("book_page", "h"), ("double_line", "h"),
        ("text_line", "v"), ("mix_line", "v"))

    split_positions_list = []
    scores_list = []
    for begin in range(0, len(np_img_list), batch_size):
        adjusted = [
            adjust_img_to_fixed_height(img, None, fixed_h, segment_task, text_type)
            for img in np_img_list[begin:begin + batch_size]
        ]
        resized_imgs = [item[0] for item in adjusted]
        scale_ratios = [item[2] for item in adjusted]

        batch_images, real_images_width, _ = pack_a_batch(
            resized_imgs, None, feat_stride, background="white")
        nms_split_positions, nms_scores = segment_model.predict(
            x=[batch_images, real_images_width])

        for pos in range(len(batch_images)):
            img_scores = remove_pad_np(nms_scores[pos])[:, 0]
            img_splits = remove_pad_np(nms_split_positions[pos]) / scale_ratios[pos]
            if rotated:
                _, img_splits = restore_original_angle(
                    np_img=None, pred_split_positions=img_splits)
            split_positions_list.append(img_splits)
            scores_list.append(img_scores)

    # Drawing.
    if dest_dir is not None:
        check_or_makedirs(dest_dir)
        for idx, source_img in enumerate(np_img_list):
            canvas, splits = source_img, split_positions_list[idx]
            if rotated:
                canvas, splits = rotate_90_degrees(canvas, splits)

            canvas = visualize.draw_split_lines(canvas, splits, scores_list[idx])
            if rotated:
                canvas, _ = restore_original_angle(canvas)

            dest_path = os.path.join(
                dest_dir, os.path.splitext(img_name_list[idx])[0] + ".jpg")
            Image.fromarray(canvas).save(dest_path, format="jpeg")
            print(idx, "Finished: " + dest_path)

    return np_img_list, img_name_list, split_positions_list, scores_list
def generate_one_text_line_tfrecords(obj_num=100, text_type="horizontal", init_num=0,
                                     text_shape=None, edges=False):
    """Generate single text-line samples straight into TFRecord shards.

    Images from ``create_one_text_line`` are serialised as
    ``tf.train.Example`` records and spread at random over 20 shard files
    named ``text_lines_<k>.tfrecords`` with ``k`` in
    ``[init_num, init_num + 20)``. Writing records directly avoids saving
    images to disk and reading them back.

    Each record stores ``bytes_image`` (raw image bytes), ``img_height``,
    ``img_width``, ``bytes_chars`` (the UTF-8 encoded text), and ``labels``
    (ids from ``CHAR2ID_DICT``), ``gt_boxes`` and ``split_positions`` as
    int32 bytes.

    Args:
        obj_num: Number of samples to generate.
        text_type: Orientation; normalised by ``check_text_type``.
        init_num: Index of the first shard file.
        text_shape: Shape used for every sample, or ``None`` to sample a
            random shape per image.
        edges: Forwarded to ``create_one_text_line``.

    Raises:
        ValueError: If the normalised orientation is not ``"h"`` or ``"v"``.
    """
    text_type = check_text_type(text_type)
    if text_type == "h":
        text_line_tfrecords_dir = ONE_TEXT_LINE_TFRECORDS_H
    elif text_type == "v":
        text_line_tfrecords_dir = ONE_TEXT_LINE_TFRECORDS_V
    else:
        # Previously fell through and failed with UnboundLocalError.
        raise ValueError("text_type should be horizontal or vertical")
    check_or_makedirs(text_line_tfrecords_dir)

    writers_list = [
        tf.io.TFRecordWriter(
            os.path.join(text_line_tfrecords_dir, "text_lines_%d.tfrecords" % i))
        for i in range(init_num, init_num + 20)
    ]

    try:
        _shape = text_shape
        for i in range(obj_num):
            writer = random.choice(writers_list)
            if text_shape is None:
                if text_type == "h":
                    _shape = (random.randint(38, 72), random.randint(540, 1280))
                else:
                    _shape = (random.randint(540, 1280), random.randint(38, 72))

            PIL_text, char_and_box_list, split_pos_list = create_one_text_line(
                _shape, text_type=text_type, edges=edges)

            bytes_image = PIL_text.tobytes()  # raw image bytes
            bytes_chars = "".join(
                [chinese_char for chinese_char, _ in char_and_box_list]).encode("utf-8")
            labels = np.array(
                [CHAR2ID_DICT[char] for char, _ in char_and_box_list],
                dtype=np.int32).tobytes()
            gt_boxes = np.array(
                [gt_box for _, gt_box in char_and_box_list],
                dtype=np.int32).tobytes()
            split_positions = np.array(split_pos_list, dtype=np.int32).tobytes()

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'bytes_image': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[bytes_image])),
                    'img_height': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[PIL_text.height])),
                    'img_width': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[PIL_text.width])),
                    'bytes_chars': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[bytes_chars])),
                    'labels': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[labels])),
                    'gt_boxes': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[gt_boxes])),
                    'split_positions': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[split_positions])),
                }))
            writer.write(example.SerializeToString())

            if i % 50 == 0:
                print("Process bar: %.2f%%" % (i * 100 / obj_num))
                sys.stdout.flush()
    finally:
        # Close every shard even when generation fails part-way.
        for writer in writers_list:
            writer.close()
img = img + noise img = tf.where(img < 0, 0, img) img = tf.where(img > 255, 255, img) img = tf.cast(img, tf.uint8) for i in range(100): print(i, img.dtype) # **************************** delta = -1 + i * 2 / 100 im = tf.image.adjust_brightness(img, delta=delta) print(im.dtype) np_im = im.numpy().astype(np.uint8) p_im = Image.fromarray(np_im) check_or_makedirs(os.path.join("..", "tf_image", "brightness")) im_path = os.path.join("..", "tf_image", "brightness", "delta_" + str(delta) + ".jpg") p_im.save(im_path, format="jpeg") # **************************** contrast_factor = 0.3 + i * 1.5 / 100 im = tf.image.adjust_contrast(img, contrast_factor=contrast_factor) print(im.dtype) np_im = im.numpy().astype(np.uint8) p_im = Image.fromarray(np_im) check_or_makedirs(os.path.join("..", "tf_image", "contrast")) im_path = os.path.join("..", "tf_image", "contrast", "contrast_factor_" + str(contrast_factor) + ".jpg") p_im.save(im_path, format="jpeg")