def scan(file):
    img = Image.open(file.stream)
    image = np.array(img)
    image = utils.img2gray(image)
    utils.save(image * 255, os.path.join(curr_dir, "test", "p0.png"))
    # image = utils.clearImgGray(image)
    # utils.save(image * 255, os.path.join(curr_dir, "test", "p1.png"))
    split_images = utils.splitImg(image)
    ocr_texts = []
    for i, split_image in enumerate(split_images):
        inv_image = utils.img2bwinv(split_image)
        inv_image = utils.clearImg(inv_image)
        image = 255. - split_image
        image = utils.dropZeroEdges(inv_image, image)
        image = utils.resize(image, ocr.image_height)
        image = image / 255.
        ocr_inputs = np.zeros([1, ocr.image_size, ocr.image_size])
        ocr_inputs[0, :] = utils.square_img(image, np.zeros([ocr.image_size, ocr.image_size]))
        ocr_seq_len = np.ones(1) * (ocr.image_size * ocr.image_size) // (ocr.POOL_SIZE * ocr.POOL_SIZE)
        start = time.time()
        p_net_g = session.run(net_g, {inputs: ocr_inputs})
        p_net_g = np.squeeze(p_net_g, axis=3)
        debug_net_g = np.copy(p_net_g)
        for j in range(1):
            _t_img = utils.unsquare_img(p_net_g[j], ocr.image_height)
            _t_img_bin = np.copy(_t_img)
            _t_img_bin[_t_img_bin <= 0.2] = 0
            _t_img = utils.dropZeroEdges(_t_img_bin, _t_img, min_rate=0.1)
            _t_img = utils.resize(_t_img, ocr.image_height)
            if _t_img.shape[0] * _t_img.shape[1] <= ocr.image_size * ocr.image_size:
                p_net_g[j] = utils.square_img(_t_img, np.zeros([ocr.image_size, ocr.image_size]), ocr.image_height)
        _img = np.vstack((ocr_inputs[0], debug_net_g[0], p_net_g[0]))
        utils.save(_img * 255, os.path.join(curr_dir, "test", "%s.png" % i))
        decoded_list = session.run(res_decoded[0], {inputs: p_net_g, seq_len: ocr_seq_len})
        seconds = round(time.time() - start, 2)
        print("finished ocr %s, took %s seconds" % (i, seconds))
        detected_list = utils.decode_sparse_tensor(decoded_list)
        for detect_number in detected_list:
            ocr_texts.append(ocr.list_to_chars(detect_number))
    return ocr_texts
def scan(file):
    img_array = np.asarray(bytearray(file.stream.read()), dtype=np.uint8)
    image = cv2.imdecode(img_array, 0)
    split_images = utils.splitImg(image)
    ocr_texts = []
    for i, split_image in enumerate(split_images):
        image = utils.img2bwinv(split_image)
        image = utils.dropZeroEdges(image)
        image = utils.resize(image, ocr.image_height)
        utils.save(image, os.path.join(curr_dir, "test", "%s.png" % i))
        maxImageWidth = image.shape[1] + 5
        image_vec = utils.img2vec(image, ocr.image_height, maxImageWidth)
        ocr_inputs = np.zeros([1, maxImageWidth, ocr.image_height])
        ocr_inputs[0, :] = np.transpose(image_vec.reshape((ocr.image_height, maxImageWidth)))
        ocr_seq_len = np.ones(ocr_inputs.shape[0]) * maxImageWidth
        feed = {inputs: ocr_inputs, seq_len: ocr_seq_len, input_keep_prob: 1.0}
        start = time.time()
        decoded_list = session.run(decoded[0], feed)
        seconds = round(time.time() - start, 2)
        print("finished ocr %s, took %s seconds" % (i, seconds))
        detected_list = ocr.decode_sparse_tensor(decoded_list)
        for detect_number in detected_list:
            ocr_texts.append(ocr.list_to_chars(detect_number))
    return ocr_texts
def scan(file):
    img = Image.open(file.stream)
    image = np.array(img)
    image = utils.img2gray(image)
    image = utils.clearImgGray(image)
    utils.save(image, os.path.join(curr_dir, "test", "src.png"))
    split_images = utils.splitImg(image)
    ocr_texts = []
    for i, split_image in enumerate(split_images):
        # image = utils.img2bwinv(split_image)
        image = utils.clearImgGray(split_image)
        # image = utils.clearBackgroundColor(image, 255)
        image = 255. - image
        image = utils.dropZeroEdges(image)
        image = utils.resize(image, ocr.image_height)
        utils.save(image, os.path.join(curr_dir, "test", "%s.png" % i))
        image = image / 255.
        maxImageWidth = image.shape[1]
        maxImageWidth = maxImageWidth + (ocr.POOL_SIZE - maxImageWidth % ocr.POOL_SIZE)
        image_vec = utils.img2vec(image, ocr.image_height, maxImageWidth)
        ocr_inputs = np.zeros([1, maxImageWidth, ocr.image_height])
        ocr_inputs[0, :] = np.transpose(image_vec.reshape((ocr.image_height, maxImageWidth)))
        ocr_seq_len = np.ones(ocr_inputs.shape[0]) * (maxImageWidth * ocr.image_height) // (ocr.POOL_SIZE * ocr.POOL_SIZE)
        feed = {inputs: ocr_inputs, seq_len: ocr_seq_len, input_keep_prob: 1.0}
        start = time.time()
        decoded_list = session.run(decoded[0], feed)
        seconds = round(time.time() - start, 2)
        print("finished ocr %s, took %s seconds" % (i, seconds))
        detected_list = ocr.decode_sparse_tensor(decoded_list)
        for detect_number in detected_list:
            ocr_texts.append(ocr.list_to_chars(detect_number))
    return ocr_texts
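# Since scan() reads file.stream, it is presumably called from an HTTP upload
# handler. A minimal sketch of such an endpoint, assuming Flask/werkzeug are
# used; the route name and field name "file" are hypothetical, not taken from
# the source, and the module-level session/graph must already be initialized.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/ocr", methods=["POST"])
def ocr_endpoint():
    upload = request.files["file"]       # werkzeug FileStorage; exposes .stream
    texts = scan(upload)                  # the scan() defined above
    return jsonify({"texts": texts})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8080)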
def get_next_batch(batch_size=128):
    inputs = np.zeros([batch_size, image_size[1] * image_size[0]])
    labels = np.zeros([batch_size, label_size], dtype=int)
    batch = random.sample(train_files, batch_size)
    for i, line in enumerate(batch):
        lines = line.split(" ")
        imageFileName = lines[0] + ".png"
        text = line[line.index(' '):].strip()
        # In 9pt SimSun, "O" and "0" render identically, so they could all be treated as "0".
        # text = text.replace('O', '0')
        # Pad the text with spaces up to label_size.
        text = text + "".join([' ' for x in range(label_size - len(text))])
        if imageFileName in images:
            imgvec = images[imageFileName]
        else:
            # The output image is inverted black-and-white.
            image = readImgFile(os.path.join(curr_dir, DATA_DIR, imageFileName))
            image = img2bwinv(image)
            image = dropZeroEdges(image)
            image = resize(image, image_size[0])
            imgvec = img2vec(image, image_size[0], image_size[1])
            # images[imageFileName] = imgvec
        inputs[i, :] = imgvec
        label_list = []
        for c in text:
            if c in CHARS:
                label_list.append(CHARS.index(c))
            else:
                label_list.append(CHARS.index(UNKOWN_CHAR))
        labels[i, :] = label_list
    return inputs, labels
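# Other parts of this project feed labels through a tf.sparse_placeholder for
# ctc_loss, so dense label lists like the ones built above must be converted
# into the (indices, values, dense_shape) triple that a SparseTensor feed
# expects. A minimal sketch of that conversion; the helper name is
# hypothetical and not taken from the source.
import numpy as np

def sparse_tuple_from(sequences, dtype=np.int32):
    """Build a tf.SparseTensorValue-compatible triple from a list of label lists."""
    indices, values = [], []
    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    dense_shape = np.asarray(
        [len(sequences), max(len(seq) for seq in sequences)], dtype=np.int64)
    return indices, values, dense_shape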
def getImage(text, font_name, font_length, font_size, noise=False, fontmode=None, fonthint=None):
    params = {}
    params['text'] = text
    params['fontname'] = font_name
    params['fontsize'] = font_size
    # params['fontmode'] = random.choice([0, 1, 2, 4, 8])
    if fontmode is None:
        params['fontmode'] = random.choice([0, 1, 2, 4])
    else:
        params['fontmode'] = fontmode
    if fonthint is None:
        params['fonthint'] = random.choice([0, 1, 2, 3, 4, 5])
    else:
        params['fonthint'] = fonthint
    r = http('http://192.168.2.113:8888/', params)
    _img = Image.open(io.BytesIO(r))
    img = Image.new("RGB", _img.size, (255, 255, 255))
    img.paste(_img, (0, 0), _img)
    img = utils.trim(img)
    if noise:
        w, h = img.size
        _h = random.randint(9, image_height)
        _w = round(w * _h / h)
        img = img.resize((_w, _h), Image.ANTIALIAS)
        img = np.asarray(img)
        img = 1 - utils.img2gray(img) / 255.
        img = utils.dropZeroEdges(img)
        filter = np.random.random(img.shape) - 0.9
        filter = np.maximum(filter, 0)
        img = img + filter * 5
        imin, imax = img.min(), img.max()
        img = (img - imin) / (imax - imin)
    else:
        img = np.asarray(img)
        img = utils.img2gray(img)
        img = utils.img2bwinv(img)
        img = img / 255.
        img = utils.dropZeroEdges(img)
    return img
def getImage(CHARS, font_name, image_height, font_length, font_size, word_dict):
    text = ''
    n = random.random()
    if n < 0.1:
        for i in range(font_length):
            text += random.choice("123456789012345678901234567890-./$,:()+-*=><")
    elif n < 0.5 and n >= 0.1:
        for i in range(font_length):
            text += random.choice(CHARS)
    else:
        while len(text) < font_length:
            word = random.choice(word_dict)
            _word = ""
            for c in word:
                if c in CHARS:
                    _word += c
            text = text + " " + _word.strip()
        text = text.strip()
    params = {}
    params['text'] = text
    params['fontname'] = font_name
    params['fontsize'] = font_size
    # params['fontmode'] = random.choice([0, 1, 2, 4, 8])
    params['fontmode'] = random.choice([0, 1, 2, 4])
    params['fonthint'] = random.choice([0, 1, 2, 3, 4, 5])
    r = http('http://192.168.2.113:8888/', params)
    _img = Image.open(io.BytesIO(r))
    img = Image.new("RGB", _img.size, (255, 255, 255))
    img.paste(_img, (0, 0), _img)
    img = utils.trim(img)
    w, h = img.size
    _h = random.randint(9, 64)
    _w = round(w * _h / h)
    img = img.resize((_w, _h), Image.ANTIALIAS)
    img = np.asarray(img)
    img = utils.clearBackgroundColor(img)
    img = 1 - utils.img2gray(img) / 255.
    img = utils.dropZeroEdges(img)
    filter = np.random.random(img.shape) - 0.9
    filter = np.maximum(filter, 0)
    img = img + filter * 5
    imin, imax = img.min(), img.max()
    img = (img - imin) / (imax - imin)
    img = utils.resize(img, image_height)
    return text, img
def scan2():
    session, inputs, seq_len, input_keep_prob, decoded, log_prob = init()
    need_ocr_images = utils.loadImage(os.path.join(curr_dir, 'test', '0.jpg'), 0)
    ocr_text_groups = []
    for idx, images_group in enumerate(need_ocr_images):
        # if idx != 1: continue
        ocr_texts = []
        for i, image in enumerate(images_group):
            image = utils.dropZeroEdges(image)
            image = utils.resize(image, ocr.image_height)
            utils.save(image, os.path.join(curr_dir, "test", "%s-%s.png" % (idx, i)))
            maxImageWidth = image.shape[1] + 5
            image_vec = utils.img2vec(image, ocr.image_height, maxImageWidth)
            ocr_inputs = np.zeros([1, maxImageWidth, ocr.image_height])
            ocr_inputs[0, :] = np.transpose(image_vec.reshape((ocr.image_height, maxImageWidth)))
            ocr_seq_len = np.ones(ocr_inputs.shape[0]) * maxImageWidth
            feed = {inputs: ocr_inputs, seq_len: ocr_seq_len, input_keep_prob: 1.0}
            print("starting ocr inputs %s:%s ..." % (idx, i))
            start = time.time()
            decoded_list = session.run(decoded[0], feed)
            seconds = round(time.time() - start, 2)
            print("finished ocr inputs %s, took %s seconds" % (idx, seconds))
            detected_list = ocr.decode_sparse_tensor(decoded_list)
            for detect_number in detected_list:
                ocr_texts.append(ocr.list_to_chars(detect_number))
        ocr_text_groups.append(ocr_texts)
        # break
    return ocr_text_groups
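# decode_sparse_tensor() is called throughout but its body is not shown here.
# A minimal sketch of what such a helper typically does, assuming the argument
# is the tf.SparseTensorValue returned by session.run on a CTC decoder output:
# group the flat values by batch index so each element becomes one list of
# label ids. This is an illustrative reconstruction, not the project's
# actual implementation.
def decode_sparse_tensor_sketch(sparse_tensor):
    results = [[] for _ in range(int(sparse_tensor.dense_shape[0]))]
    for (batch_idx, _time_idx), value in zip(sparse_tensor.indices, sparse_tensor.values):
        results[batch_idx].append(value)
    return results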
def getImage(CHARS, font_name, image_height, font_length, font_size, word_dict):
    text = utils_font.get_random_text(CHARS, word_dict, font_length)
    img = utils_font.get_font_image_from_url(text, font_name, font_size)
    img = utils_font.add_noise(img)
    img = utils_pil.convert_to_gray(img)
    w, h = img.size
    _h = random.randint(9, image_height)
    _w = round(w * _h / h)
    img = img.resize((_w, _h), Image.ANTIALIAS)
    img = np.asarray(img)
    # img = utils.clearBackgroundColor(img)
    img = 1 - img / 255.
    img = utils.dropZeroEdges(img)
    filter = np.random.random(img.shape) - 0.9
    filter = np.maximum(filter, 0)
    img = img + filter * 5
    imin, imax = img.min(), img.max()
    img = (img - imin) / (imax - imin)
    img = utils.resize(img, image_height)
    return text, img
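# The earlier getImage() variants call an http() helper that sends the render
# parameters to a local font-rendering service (http://192.168.2.113:8888/)
# and returns raw PNG bytes. The helper is not shown; a minimal sketch using
# only the standard library, under the assumption that the service accepts
# URL-encoded query parameters and replies with the image body:
import urllib.parse
import urllib.request

def http(url, params, timeout=10):
    query = urllib.parse.urlencode(params)
    with urllib.request.urlopen(url + "?" + query, timeout=timeout) as resp:
        return resp.read()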
with open(os.path.join(curr_dir, "data", "index.txt")) as index_file:
    for i, line in enumerate(index_file.readlines()):
        if i % 10000 == 0:
            print("resizing image no: ", i)
        lines = line.split(" ")
        image_name = lines[0] + ".png"
        dst_image_name = os.path.join(curr_dir, "data", "dataset", image_name)
        if os.path.exists(dst_image_name):
            train_files.append(line)
            continue
        if not os.path.exists(os.path.dirname(dst_image_name)):
            os.mkdir(os.path.dirname(dst_image_name))
        src_image_name = os.path.join(curr_dir, "data", image_name)
        try:
            image = readImgFile(src_image_name)
            image = img2bwinv(image)
            image = dropZeroEdges(image)
        except:
            print(dst_image_name, "error")
            continue
        resized_image = resize(image, image_height)
        save(resized_image, dst_image_name)
        train_files.append(line)

def neural_networks():
    # Input: [batch_size, image_width, image_height], i.e. shape [-1, -1, 16]
    inputs = tf.placeholder(tf.float32, [None, None, image_height], name="inputs")
    # ctc_loss requires the labels as a sparse tensor
    labels = tf.sparse_placeholder(tf.int32, name="labels")
    # 1-D vector of size [batch_size], equal to np.ones(batch_size) * image_width
    seq_len = tf.placeholder(tf.int32, [None], name="seq_len")
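# neural_networks() is truncated above. As a rough sketch of how such
# placeholders are usually wired to a CTC loss and decoder in TensorFlow 1.x
# (an assumption about the missing part, not the project's code): the
# convolutional/recurrent stack produces per-timestep logits, which feed
# tf.nn.ctc_loss and a beam-search decoder.
import tensorflow as tf

def ctc_head_sketch(logits, labels, seq_len):
    # logits: [batch, time, num_classes + 1]; labels: tf.SparseTensor (int32);
    # seq_len: [batch] int32 vector of valid timestep counts per example.
    logits_tm = tf.transpose(logits, [1, 0, 2])  # CTC ops expect time-major input
    loss = tf.reduce_mean(tf.nn.ctc_loss(labels, logits_tm, seq_len))
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits_tm, seq_len, merge_repeated=False)
    # Mean edit distance between the decoding and the ground truth (lower is better).
    distance = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), labels))
    return loss, decoded, log_prob, distance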
def train():
    inputs, labels, global_step, \
        res_loss, res_optim, seq_len, res_acc, res_decoded, \
        net_g = neural_networks()

    curr_dir = os.path.dirname(__file__)
    model_dir = os.path.join(curr_dir, MODEL_SAVE_NAME)
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    model_G_dir = os.path.join(model_dir, "TG")
    model_R_dir = os.path.join(model_dir, "R16")
    if not os.path.exists(model_R_dir):
        os.mkdir(model_R_dir)
    if not os.path.exists(model_G_dir):
        os.mkdir(model_G_dir)

    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        r_saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='RES'), sharded=True)
        g_saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='TRIM_G'), sharded=False)

        ckpt = tf.train.get_checkpoint_state(model_G_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Restore Model G...")
            g_saver.restore(session, ckpt.model_checkpoint_path)

        ckpt = tf.train.get_checkpoint_state(model_R_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Restore Model R...")
            r_saver.restore(session, ckpt.model_checkpoint_path)

        AllLosts = {}
        while True:
            errA = errD1 = errD2 = 1
            batch_size = 4
            for batch in range(BATCHES):
                if len(AllLosts) > 10 and random.random() > 0.7:
                    sorted_font = sorted(AllLosts.items(), key=operator.itemgetter(1), reverse=True)
                    font_info = sorted_font[random.randint(0, 10)]
                    font_info = font_info[0].split(",")
                    train_inputs, train_labels, train_seq_len, train_info = get_next_batch_for_res(
                        batch_size, False, font_info[0], int(font_info[1]), int(font_info[2]), int(font_info[3]))
                else:
                    # train_inputs, train_labels, train_seq_len, train_info = get_next_batch_for_res(batch_size, False, _font_size=36)
                    train_inputs, train_labels, train_seq_len, train_info = get_next_batch_for_res(batch_size)
                # feed = {inputs: train_inputs, labels: train_labels, seq_len: train_seq_len}
                start = time.time()

                p_net_g = session.run(net_g, {inputs: train_inputs})
                p_net_g = np.squeeze(p_net_g, axis=3)

                for i in range(batch_size):
                    _t_img = utils.unsquare_img(p_net_g[i], image_height)
                    _t_img = utils.cvTrimImage(_t_img)
                    _t_img[_t_img < 0] = 0
                    _t_img = utils.resize(_t_img, image_height)
                    if _t_img.shape[0] * _t_img.shape[1] <= image_size * image_size:
                        p_net_g[i] = utils.square_img(_t_img, np.zeros([image_size, image_size]), image_height)

                feed = {inputs: p_net_g, labels: train_labels, seq_len: train_seq_len}
                errR, acc, _, steps = session.run([res_loss, res_acc, res_optim, global_step], feed)
                font_info = train_info[0][0] + "/" + train_info[0][1] + " " + train_info[1][0] + "/" + train_info[1][1]
                print("%d time: %4.4fs, res_acc: %.4f, res_loss: %.4f, info: %s " %
                      (steps, time.time() - start, acc, errR, font_info))
                if np.isnan(errR) or np.isinf(errR):
                    print("Error: cost is nan or inf")
                    return

                # If accuracy drops below 90%, dump the samples for inspection.
                if acc < 0.9:
                    for i in range(batch_size):
                        _img = np.vstack((train_inputs[i] * 255, p_net_g[i] * 255))
                        cv2.imwrite(os.path.join(curr_dir, "test", "E%s_%s_%s.png" % (acc, steps, i)), _img)

                for info in train_info:
                    key = ",".join(info)
                    if key in AllLosts:
                        AllLosts[key] = AllLosts[key] * 0.95 + errR * 0.05
                    else:
                        AllLosts[key] = errR

                # Periodic report
                if steps > 0 and steps % REPORT_STEPS == 0:
                    train_inputs, train_labels, train_seq_len, train_info = get_next_batch_for_res(batch_size)
                    p_net_g = session.run(net_g, {inputs: train_inputs})
                    p_net_g = np.squeeze(p_net_g, axis=3)

                    for i in range(batch_size):
                        _t_img = utils.unsquare_img(p_net_g[i], image_height)
                        _t_img_bin = np.copy(_t_img)
                        _t_img_bin[_t_img_bin <= 0.3] = 0
                        _t_img = utils.dropZeroEdges(_t_img_bin, _t_img, min_rate=0.1)
                        _t_img = utils.resize(_t_img, image_height)
                        if _t_img.shape[0] * _t_img.shape[1] <= image_size * image_size:
                            p_net_g[i] = utils.square_img(_t_img, np.zeros([image_size, image_size]), image_height)

                    decoded_list = session.run(res_decoded[0], {inputs: p_net_g, seq_len: train_seq_len})

                    for i in range(batch_size):
                        _img = np.vstack((train_inputs[i], p_net_g[i]))
                        cv2.imwrite(os.path.join(curr_dir, "test", "%s_%s.png" % (steps, i)), _img * 255)

                    original_list = utils.decode_sparse_tensor(train_labels)
                    detected_list = utils.decode_sparse_tensor(decoded_list)
                    if len(original_list) != len(detected_list):
                        print("len(original_list)", len(original_list),
                              "len(detected_list)", len(detected_list),
                              " test and detect length doesn't match")
                    print("T/F: original(length) <-------> detected(length)")
                    acc = 0.
                    for idx in range(min(len(original_list), len(detected_list))):
                        number = original_list[idx]
                        detect_number = detected_list[idx]
                        hit = (number == detect_number)
                        print("%6s" % hit, list_to_chars(number), "(", len(number), ")")
                        print("%6s" % "", list_to_chars(detect_number), "(", len(detect_number), ")")
                        # Compute the Levenshtein ratio as a per-sample similarity score
                        import Levenshtein
                        acc += Levenshtein.ratio(list_to_chars(number), list_to_chars(detect_number))
                    print("Test Accuracy:", acc / len(original_list))
                    sorted_fonts = sorted(AllLosts.items(), key=operator.itemgetter(1), reverse=True)
                    for f in sorted_fonts[:20]:
                        print(f)

            print("Save Model R ...")
            r_saver.save(session, os.path.join(model_R_dir, "R.ckpt"), global_step=steps)
            try:
                ckpt = tf.train.get_checkpoint_state(model_G_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    print("Restore Model G...")
                    g_saver.restore(session, ckpt.model_checkpoint_path)
            except:
                pass