def extract(image):
    """Segment a page image into text lines and write the results out.

    The page is read as a binary image, inverted, segmented into lines, and
    the line labels are renumbered in reading order.  The page segmentation
    and one binary image per line are then written under ``outputdir``.

    Failures are reported on stdout instead of being raised (best-effort
    behaviour of the original), so the function always returns ``None``.

    :param image: image reference passed to ``ocrolib.read_image_binary``.
    """
    try:
        binary = ocrolib.read_image_binary(image)
        binary = 1 - binary  # invert
        scale = psegutils.estimate_scale(binary)
        segmentation = compute_segmentation(binary, scale)

        # compute the reading order
        lines = psegutils.compute_lines(segmentation, scale)
        order = psegutils.reading_order([l.bounds for l in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs
        # The lookup table is sized from the label image; the previous code
        # passed the compute_segmentation *function* to amax() by mistake.
        nlabels = amax(segmentation) + 1
        renumber = zeros(nlabels, 'i')
        for i, v in enumerate(lsort):
            renumber[lines[v].label] = 0x010000 + (i + 1)
        segmentation = renumber[segmentation]

        # NOTE(review): outputdir is an HTTP URL but is used as a file path
        # prefix by the writers below -- confirm this is intended.
        outputdir = "http://127.0.0.1:5000/uploads/"
        lines = [lines[i] for i in lsort]
        ocrolib.write_page_segmentation("%s.pseg.png" % outputdir,
                                        segmentation)
        cleaned = ocrolib.remove_noise(binary, args.noise)
        for i, l in enumerate(lines):
            binline = psegutils.extract_masked(1 - cleaned, l,
                                               pad=args.pad,
                                               expand=args.expand)
            ocrolib.write_image_binary(
                "%s/01%04x.bin.png" % (outputdir, i + 1), binline)
    except Exception as exc:
        # Still best-effort, but no longer swallows KeyboardInterrupt /
        # SystemExit, and the cause is reported.
        print('error: %s' % (exc,))
def caption_segment(binary):
    """Locate candidate text spans in a caption ("title bar") image.

    :param binary: binary image of the caption bar to analyse.
    :return: list of ``[start, end]`` spans taken from ``compute_index``.
        Spans that are too short are dropped and a span that starts close to
        the previously kept one is merged into it.  The list is empty when
        ``compute_index`` yields two or fewer candidates.
    """
    # remove interfering parts near the borders
    bina = ocrolib.remove_noise(binary, 8)
    scale = psegutils.estimate_scale(bina)
    lines = morph.select_regions(bina, sl.dim1, min=2 * scale)
    bina = bina - lines
    bina = morph.select_regions(bina, sl.dim0, min=scale / 3)
    # enlarge the text regions so that adjacent text gets connected
    textlines = filters.maximum_filter(bina, (scale, scale / 2))
    # start/end positions of the candidate text regions
    indexs_white = compute_index(textlines, th=scale / 2, n=1)

    indexs_lists = []
    if len(indexs_white) > 2:
        # Last span that was actually stored in indexs_lists.  The previous
        # code seeded this with indexs_white[0] even when that span was
        # rejected as too small, so the later list.remove() could raise
        # ValueError.
        index_fir = None
        for index in indexs_white:
            if not index[1] - index[0] > scale / 2:
                continue  # drop spans that are too small
            if index_fir is not None and index[0] - index_fir[1] < scale / 3:
                # close to the previously kept span: merge the two
                index_acc = [index_fir[0], index[1]]
                indexs_lists[-1] = index_acc
                index_fir = index_acc
            else:
                indexs_lists.append(index)
                index_fir = index
    return indexs_lists
def calc(self, objects, scale):
    """Split ``self.binpage`` into text lines and OCR every line twice.

    Each line is extracted, normalized with ``lineest.CenterNormalizer`` and
    passed to Tesseract with ``--oem 0`` and with ``--oem 1``.  The results
    are stored in ``self.lines`` as ``psegutils.record`` objects carrying the
    line bounds, both predictions and the normalized line image.

    Does nothing when ``self.binpage`` is ``None``.  ``self.binpage`` is
    replaced by its noise-removed version as a side effect.

    :param objects: not used by this method.
    :param scale: scale value forwarded to the segmentation helpers.
    """
    if self.binpage is None:
        return
    tt = time()
    # Line segmentation: seeds are labelled, propagated over the box map and
    # spread up to ``scale`` pixels to cover unlabelled foreground.
    bottom, top, boxmap = compute_gradmaps(self.binpage, scale)
    # DSHOW('hihi', [0.5*bottom+0.5*top,self.binpage, boxmap])
    seeds0 = compute_line_seeds(self.binpage, bottom, top, scale)
    seeds, _ = morph.label(seeds0)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = spread_labels(seeds, maxdist=scale)
    llabels = where(llabels > 0, llabels, spread * self.binpage)
    segmentation = llabels * self.binpage
    # Noise is removed only after the segmentation has been computed.
    self.binpage = ocrolib.remove_noise(self.binpage, args.noise)
    lines = psegutils.compute_lines(segmentation, scale)
    binpage_reversed = 1 - self.binpage
    # print 'pre line ', time() - tt
    tt = time()
    self.lines = []
    for i, l in enumerate(lines):
        tt = time()
        binline = psegutils.extract_masked(
            binpage_reversed, l, pad=args.pad, expand=args.expand)  # black text
        binline = (1 - binline)
        le = lineest.CenterNormalizer(binline.shape[0])  # white text
        binline = binline.astype(float)
        le.measure(binline)
        binline = le.normalize(binline)
        # Re-binarize the normalized line and invert it again.
        binline = where(binline > 0.5, 0, 1)  # black text
        # print 'line time ', time()-tt
        print '-----------------------'
        pilimg = Image.fromarray((binline * 255).astype(uint8))
        # The same line image is recognized with two Tesseract engine modes.
        pred_legacy = pytesseract.image_to_string(pilimg, lang='eng', config='--oem 0 --psm 7')
        print '00', pred_legacy
        pred_lstm = pytesseract.image_to_string(pilimg, lang='eng', config='--oem 1 --psm 7')
        print '11', pred_lstm
        # ASHOW('line',binline, scale=2.0)
        ## pred_both = pytesseract.image_to_string(pilimg,lang='vie', config='--oem 2 --psm 7')
        ## print '22', pred_both
        result = psegutils.record(bounds=l.bounds, text1=pred_legacy, text2=pred_lstm, img=binline)
        self.lines.append(result)
def printResult(self, outputfile):
    """OCR the text lines of ``self.patch`` and write the fields as JSON.

    The patch is sharpened, binarized with ``sauvola`` and masked, then split
    into lines.  Each line is routed to a field of ``self.linepos1`` based on
    where its lower edge (``l.bounds[0].stop``) falls, recognized with a
    field-specific Tesseract configuration, and the text is accumulated per
    field.  Empty fields become ``None``.  The resulting dict is written to
    ``outputfile`` with ``json.dumps``.

    :param outputfile: object with a ``write`` method receiving the JSON text.
    """
    # Some pre-process
    # print 'text area before'
    # cv2.imshow('patch', self.patch)
    # cv2.waitKey(-1)
    # A different Sauvola ``k`` is used for the 'CMND cu - 9 so' layout.
    if self.name == 'CMND cu - 9 so':
        k = 0.45
    else:
        k = 0.33
    patch = sharpen(self.patch)
    binary = sauvola(patch,
                     w=int(self.template.shape[1] / 24.5 * 2),
                     k=k,
                     scaledown=0.5,
                     reverse=True)
    binary = cv2.bitwise_and(binary, binary, mask=self.patch_mask)
    # print 'text area after'
    # cv2.imshow('patch', binary*255)
    # cv2.waitKey(-1)
    dotremoved = binary
    scale = self.scale
    # Line extraction copied from Ocropus source code
    bottom, top, boxmap = compute_gradmaps(dotremoved, scale)
    seeds0 = compute_line_seeds(dotremoved, bottom, top, scale)
    seeds, _ = morph.label(seeds0)
    llabels = morph.propagate_labels(boxmap, seeds, conflict=0)
    spread = spread_labels(seeds, maxdist=scale)
    llabels = where(llabels > 0, llabels, spread * dotremoved)
    segmentation = llabels * dotremoved
    dotremoved = ocrolib.remove_noise(dotremoved, 8)
    lines = psegutils.compute_lines(segmentation, scale / 2)
    binpage_reversed = 1 - dotremoved
    self.lines = []
    # One (initially empty) text accumulator per field known to linepos1.
    readrs = dict.fromkeys(self.linepos1.keys(), u'')
    # Process lines ordered by the start of their second bounds slice
    # (presumably left to right -- verify the axis order of l.bounds).
    lines = sorted(lines, key=lambda x: x.bounds[1].start)
    for i, l in enumerate(lines):
        # Line extraction copied from Ocropus source code
        binline = psegutils.extract_masked(binpage_reversed,
                                           l,
                                           pad=int(scale / 2),
                                           expand=0)  # black text
        binline = (1 - binline)
        le = lineest.CenterNormalizer(binline.shape[0])  # white text
        binline = binline.astype(float)
        le.measure(binline)
        binline = le.normalize(binline)
        # print 'normalized'
        # cv2.imshow('line', binline)
        # cv2.waitKey(-1)
        # Upscale the normalized line by 2x before thresholding.
        binline = cv2.resize(binline, None, fx=2.0, fy=2.0)
        # print 'resized'
        # cv2.imshow('line', binline)
        # cv2.waitKey(-1)
        binline = where(binline > 0.5, uint8(0), uint8(255))  # black text
        # print 'black text'
        # cv2.imshow('line', binline)
        # cv2.waitKey(-1)
        # pilimg = Image.fromarray(binline)
        pos = l.bounds[0].stop
        # True when the line starts in the first half of the template width.
        left = (l.bounds[1].start < self.template.shape[1] / 2)
        # Prediction using Tesseract 4.0
        if pos > self.linepos1['idNumber'][0] and pos < self.linepos1[
                'idNumber'][1]:  #ID, all numbers
            pred = ocr(
                binline,
                config=
                '--oem 0 --psm 7 -c tessedit_char_whitelist=0123456789')
            readrs['idNumber'] += pred + u' '
        elif pos > self.linepos1['dateOfBirth'][0] and pos < self.linepos1[
                'dateOfBirth'][1]:  # DOB, number, - , /
            pred = ocr(
                binline,
                config=
                '--oem 1 --psm 7 -c tessedit_char_whitelist=0123456789-/')
            readrs['dateOfBirth'] += pred + u' '
        elif left and pos > self.linepos1['Gender'][
                0] and pos < self.linepos1['Gender'][1]:
            pred = ocr(binline, config='--oem 1 --psm 7 -l vie')
            readrs['Gender'] += pred + u' '
        elif (not left) and pos > self.linepos1['Dantoc'][
                0] and pos < self.linepos1['Dantoc'][1]:
            pred = ocr(binline, config='--oem 1 --psm 7 -l vie')
            readrs['Dantoc'] += pred + u' '
        elif pos > self.linepos1['NguyenQuan'][0] and pos < self.linepos1[
                'NguyenQuan'][1]:
            pred = ocr(binline, config='--oem 1 --psm 7 -l vie')
            readrs['NguyenQuan'] += pred + u' '
        elif pos > self.linepos1['fullName'][0] and pos < self.linepos1[
                'fullName'][1]:
            pred = ocr(binline, config='--oem 1 --psm 7 -l vie')
            readrs['fullName'] += pred + u' '
        # else:
        #     pred = ocr(binline, config='--oem 1 --psm 7 -l vie')
        #     print 'unknown ', unicode2ascii(pred), 'y:', l.bounds[0], 'x:', l.bounds[1]
    # Normalize a few characters in the recognized text, trim whitespace and
    # turn empty results into None.  Note that ``k`` is reused here as the
    # loop variable (it held the Sauvola parameter above).
    for k in readrs:
        readrs[k] = (readrs[k].replace(u'²', u'2').replace(u'º', u'o').replace(
            u'»', u'-')).strip()
        if len(readrs[k]) == 0:
            readrs[k] = None
    if self.name == 'CMND moi - 12 so':
        readrs['type'] = 'CMND Mới - 12 Số'
    elif self.name == 'Can Cuoc Cong Dan':
        readrs['type'] = 'Căn Cước Công Dân'
    elif self.name == 'CMND cu - 9 so':
        readrs['type'] = 'CMND Cũ - 9 Số'
    # NOTE(review): the original indentation was lost; this assignment may
    # have belonged to the last ``elif`` branch only -- confirm.
    readrs['NgayHetHan'] = None
    outputfile.write(json.dumps(readrs))
def _process_segment(self, page_image, page, textregion, region_xywh, page_id, input_file, n): LOG = getLogger('OcrdAnybaseocrTextline') #check for existing text lines and whether to overwrite them if textregion.get_TextLine(): if self.parameter['overwrite']: LOG.info('removing existing TextLines in region "%s"', page_id) textregion.set_TextLine([]) else: LOG.warning('keeping existing TextLines in region "%s"', page_id) return binary = ocrolib.pil2array(page_image) if len(binary.shape) > 2: binary = np.mean(binary, 2) binary = np.array(1 - binary / np.amax(binary), 'B') if self.parameter['scale'] == 0: scale = psegutils.estimate_scale(binary) else: scale = self.parameter['scale'] if np.isnan( scale) or scale > 1000.0 or scale < self.parameter['minscale']: LOG.warning(str(scale) + ": bad scale; skipping!\n") return segmentation = self.compute_segmentation(binary, scale) if np.amax(segmentation) > self.parameter['maxlines']: LOG.warning("too many lines %i; skipping!\n", (np.amax(segmentation))) return lines = psegutils.compute_lines(segmentation, scale) order = psegutils.reading_order([l.bounds for l in lines]) lsort = psegutils.topsort(order) # renumber the labels so that they conform to the specs nlabels = np.amax(segmentation) + 1 renumber = np.zeros(nlabels, 'i') for i, v in enumerate(lsort): renumber[lines[v].label] = 0x010000 + (i + 1) segmentation = renumber[segmentation] lines = [lines[i] for i in lsort] cleaned = ocrolib.remove_noise(binary, self.parameter['noise']) for i, l in enumerate(lines): #LOG.info('check this: ') #LOG.info(type(l.bounds)) #LOG.info(l.bounds) #line_points = np.where(l.mask==1) #hull = MultiPoint([x for x in zip(line_points[0],line_points[1])]).convex_hull #x,y = hull.exterior.coords.xy #LOG.info('hull coords x: ',x) #LOG.info('hull coords y: ',y) min_x, max_x = (l.bounds[0].start, l.bounds[0].stop) min_y, max_y = (l.bounds[1].start, l.bounds[1].stop) line_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]] 
#line_polygon = [x for x in zip(y, x)] line_polygon = coordinates_for_segment(line_polygon, page_image, region_xywh) line_points = points_from_polygon(line_polygon) img = cleaned[l.bounds[0], l.bounds[1]] img = np.array(255 * (img > ocrolib.midrange(img)), 'B') img = 255 - img img = ocrolib.array2pil(img) file_id = make_file_id(input_file, self.output_file_grp) file_path = self.workspace.save_image_file( img, file_id + "_" + str(n) + "_" + str(i), page_id=page_id, file_grp=self.output_file_grp) ai = AlternativeImageType(filename=file_path, comments=region_xywh['features']) line_id = '%s_line%04d' % (page_id, i) line = TextLineType(custom='readingOrder {index:' + str(i) + ';}', id=line_id, Coords=CoordsType(line_points)) line.add_AlternativeImage(ai) textregion.add_TextLine(line)
def analyze_page_layout(binary, gray, rgb=None):
    """Run the line segmentation on a binary page and order the lines.

    The page is inverted, its scale estimated, the segmentation computed and
    the line labels renumbered in reading order.  The visualisation and file
    output sections are disabled (``if False``) and the function returns
    ``None``.

    :param binary: binary page image (inverted internally).
    :param gray: grayscale page image, only referenced by disabled code.
    :param rgb: optional colour image, only referenced by disabled code.
    """
    # --- Segmentation parameters -----------------------------------------
    hscale = 1.0        # Non-standard scaling of horizontal parameters.
    vscale = 1.0        # Non-standard scaling of vertical parameters.
    threshold = 0.2     # Baseline threshold.
    usegauss = True     # Use gaussian instead of uniform.
    maxseps = 0         # Maximum black column separators.
    sepwiden = 10       # Widen black separators (to account for warping).
    blackseps = True
    maxcolseps = 3      # Maximum # whitespace column separators.
    csminheight = 10    # Minimum column height (units=scale).

    # --- Line extraction parameters (used by the disabled output code) ---
    noise = 8           # Noise threshold for removing small components.
    gray_output = True  # Also output lines cut from the grayscale page.
    pad = 3             # Padding for extracted lines.
    expand = 3          # Expand mask for grayscale extraction.

    if False:
        bin_image_filepath = './ocropy_test.bin.png'
        gray_image_filepath = './ocropy_test.nrm.png'

        binary = ocrolib.read_image_binary(bin_image_filepath)
        gray = ocrolib.read_image_gray(gray_image_filepath)

    # Invert the page before measuring and segmenting it.
    binary = 1 - binary
    scale = psegutils.estimate_scale(binary)
    segmentation = compute_segmentation(
        binary, scale, blackseps, maxseps, maxcolseps, csminheight,
        sepwiden, usegauss, vscale, hscale, threshold, quiet=True)

    # Reading order of the detected lines.
    lines = psegutils.compute_lines(segmentation, scale)
    bounds = [l.bounds for l in lines]
    lsort = psegutils.topsort(psegutils.reading_order(bounds))

    # Renumber the labels so that they conform to the specs.
    renumber = np.zeros(np.amax(segmentation) + 1, 'i')
    for rank, v in enumerate(lsort, 1):
        renumber[lines[v].label] = 0x010000 + rank
    segmentation = renumber[segmentation]  # Image.

    lines = [lines[v] for v in lsort]

    # Visualize bounding boxes.
    if False:
        if rgb is not None:
            # REF [function] >> extract_masked() in ${OCROPY_HOME}/ocrolib/psegutils.py.
            for l in lines:
                y0, x0, y1, x1 = [
                    int(x) for x in [
                        l.bounds[0].start, l.bounds[1].start,
                        l.bounds[0].stop, l.bounds[1].stop
                    ]
                ]
                cv2.rectangle(rgb, (x0, y0), (x1, y1), (0, 0, 255), 1,
                              cv2.LINE_AA)
            cv2.imshow('Image', rgb)
            cv2.waitKey(0)

    # Output everything.
    if False:
        if not os.path.exists(outputdir):
            os.mkdir(outputdir)

        ocrolib.write_page_segmentation("%s.pseg.png" % outputdir,
                                        segmentation)
        cleaned = ocrolib.remove_noise(binary, noise)
        for i, l in enumerate(lines):
            binline = psegutils.extract_masked(1 - cleaned, l, pad=pad,
                                               expand=expand)  # Image.
            ocrolib.write_image_binary(
                "%s/01%04x.bin.png" % (outputdir, i + 1), binline)
            if gray_output:
                grayline = psegutils.extract_masked(gray, l, pad=pad,
                                                    expand=expand)  # Image.
                ocrolib.write_image_gray(
                    "%s/01%04x.nrm.png" % (outputdir, i + 1), grayline)
def processPngFile(outRoot, origFile, fileNum):
    """Binarize and segment one PNG page and save an annotated copy.

    The file is copied into ``<outRoot>/<name>.<fileNum>/``, binarized,
    segmented into text lines (written as ``01xxxx.bin.png`` files plus a
    ``.pseg.png`` page segmentation), and finally the line bounding boxes are
    drawn onto the page, which is saved as ``<base>.out.png`` and again under
    ``<outRoot>.out/``.

    :param outRoot: root directory for the per-file working directories.
    :param origFile: path of the source image.
    :param fileNum: number used in the working directory name.
    :return: ``True`` on success; ``False`` when binarization fails, the page
        check rejects the page, the scale is bad or there are too many lines.
    """
    baseName = os.path.basename(origFile)
    baseBase, _ = os.path.splitext(baseName)
    outDir = os.path.join(outRoot, "%s.%03d" % (baseBase, fileNum))
    inFile = os.path.join(outDir, baseName)
    os.makedirs(outDir, exist_ok=True)
    shutil.copy(origFile, inFile)

    inBase, _ = ocrolib.allsplitext(inFile)
    print("** inBase=%s" % inBase)
    # print("** binBase=%s" % binBase)

    fname = inFile
    outputdir = inBase
    binFile = inBase + ".bin.png"
    outFile = inBase + ".out.png"
    outRoot2, outDir2 = os.path.split(outRoot)
    outFile2 = os.path.join(outRoot2, "%s.out" % outDir2, baseName)
    print("outFile2=%s" % outFile2)
    grayFile = inBase + ".nrm.png"
    psegFile = inBase + ".pseg.png"
    print(" inFile=%s" % inFile)
    print(" binFile=%s" % binFile)
    print("grayFile=%s" % grayFile)
    print(" outFile=%s" % outFile)
    assert inFile and binFile
    assert outFile != inFile
    assert outFile != binFile

    if not binarize(inFile, binFile, grayFile):
        binExists = os.path.exists(binFile)
        print("Couldn't binarize inFile=%s binFile=%s exists=%s" %
              (inFile, binFile, binExists))
        return False

    binary = ocrolib.read_image_binary(binFile)
    print("$$ %s=%s" % (binFile, desc(binary)))
    checktype(binary, ABINARY2)

    check = check_page(np.amax(binary) - binary)
    if check is not None:
        print("%s SKIPPED %s (use -n to disable this check)" %
              (inFile, check))
        return False

    binary = 1 - binary  # invert

    scale = psegutils.estimate_scale(binary)
    print("scale %f" % scale)
    if np.isnan(scale) or scale > 1000.0:
        print("%s: bad scale (%g); skipping\n" % (fname, scale))
        return False

    # find columns and text lines
    print("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if np.amax(segmentation) > maxlines:
        print("%s: too many lines %g" % (fname, np.amax(segmentation)))
        return False
    print("segmentation=%s" % desc(segmentation))
    print("number of lines %g" % np.amax(segmentation))

    # compute the reading order
    print("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)
    print("$$ lsort = %d = %s...%s" % (len(lsort), lsort[:10], lsort[-10:]))

    # renumber the labels so that they conform to the specs
    nlabels = np.amax(segmentation) + 1
    renumber = np.zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    # finally, output everything
    print("writing lines")
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
    lines = [lines[i] for i in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, noise)
    for i, l in enumerate(lines):
        binline = psegutils.extract_masked(1 - cleaned, l, pad=pad,
                                           expand=expand)
        ocrolib.write_image_binary(
            "%s/01%04x.bin.png" % (outputdir, i + 1), binline)
    # Report the index of the last line.  The previous code read the loop
    # variable, which is unbound when the page has no lines.
    print("%6d %s %4.1f %d" % (len(lines) - 1, fname, scale, len(lines)))

    # to proceed, we need a pseg file and a subdirectory containing text lines
    assert os.path.exists(psegFile), "%s: no such file" % psegFile
    assert os.path.isdir(inBase), "%s: no such directory" % inBase

    # iterate through the text lines in reading order, based on the page
    # segmentation file
    pseg = ocrolib.read_page_segmentation(psegFile)
    print("$$ %s=%s" % (psegFile, desc(pseg)))

    regions = ocrolib.RegionExtractor()
    print("$$ regions=%s" % regions)
    regions.setPageLines(pseg)

    im = Image.open(inFile)
    print("~~%s %s" % (inFile, im.size))
    print("$$ regions=%s=%s" % (regions, sorted(regions.__dict__)))
    print("$$ regions.length=%s" % regions.length())

    n = regions.length()
    # One drawing context for all boxes instead of one per region.
    draw = ImageDraw.Draw(im)
    for region_index in range(1, n):
        y0, x0, y1, x1 = regions.bbox(region_index)
        draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=3)
        draw.rectangle((x0, y0, x1, y1), outline=(0, 0, 255), width=0)
    del draw

    # write output files
    print("outFile=%s" % outFile)
    im.save(outFile, "PNG")
    print("outFile2=%s" % outFile2)
    outDir2 = os.path.dirname(outFile2)
    os.makedirs(outDir2, exist_ok=True)
    im.save(outFile2, "PNG")
    assert os.path.exists(outFile2)
    return True
def Segment(fname, save_path):
    """Split a form image into cell images and save them under ``save_path``.

    ``save_path`` is wiped and recreated first.  The image is binarized,
    split into vertical blocks, each block is separated into a caption part
    and a main body, and the main body is cut into columns and text rows.
    The collected pieces are written with ``save_img_from_dic``.

    :param fname: path of the image to segment.
    :param save_path: output directory; any existing content is removed.
    :return: ``1`` on success; ``0`` when ``Binarization`` returns
        ``(None, None)``, in which case the input file is copied unchanged
        into ``save_path``.
    """
    # clear the cached results of the previous run
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    os.makedirs(save_path)

    # read the image data
    raw = read_image_gray(fname)
    # binarize (robust against rotation and brightness changes)
    gray_o, bina_o = Binarization(raw)
    # binarization failed: keep the unsplit input and report failure
    if gray_o is None and bina_o is None:
        new_fname = os.path.basename(fname)
        # The previous code called cv2.imwrite() without an image argument,
        # which always raised TypeError.  Copying the source file keeps the
        # input in the output directory without assuming a pixel range.
        shutil.copy(fname, os.path.join(save_path, new_fname))
        return 0

    # estimate the text width
    bina_o = ocrolib.remove_noise(bina_o, 8)
    scale = psegutils.estimate_scale(bina_o)

    # split the page into blocks
    block_grays, block_binas = split_columns_vertical(gray_o, bina_o, scale)
    if len(block_grays) > 2:
        # Special layout: the indicator columns are all separated by vertical
        # black lines, so go straight to row segmentation.
        mb_dics = {}   # all file records after removing blank-row records
        mb_block = []  # [block id, column count, row count] per block
        for i, gray_i in enumerate(block_grays):
            mb_dic = {}
            bina_i = block_binas[i]
            # separate the caption area from the core indicator area
            cp_gray, cp_bina, mb_gray, mb_bina = get_caption_mainbody(
                gray_i, bina_i, scale)
            # row segmentation; results are stored in a dict keyed by name
            mb_dic, max_row = mainbody_textline_segment(
                mb_gray, mb_bina, scale, i, 0, mb_dic)
            max_col = 0
            max_row += 1
            # first structuring pass: adjust blank cells
            mb_dics = modify_fname_dictionary(mb_dic, mb_dics)
            mb_block.append([i, max_col, max_row])
        # structuring post-process: merge the blocks
        res_mb_dics = modify_mainbody_display(mb_dics, mb_block)
        # remove the block markers
        res_mb_dics = add_flag(res_mb_dics)
    else:
        # Regular layout: one or two blocks.
        mb_dics = {}   # all file records after removing blank-row records
        mb_block = []  # [block id, column count, row count] per block
        for i, gray_i in enumerate(block_grays):
            bina_i = block_binas[i]
            # separate the caption area from the core indicator area
            cp_gray, cp_bina, mb_gray, mb_bina = get_caption_mainbody(
                gray_i, bina_i, scale)
            # split the caption horizontally to get the attribute positions
            cp_index_list = []
            if cp_bina is not None:
                cp_index_list = caption_segment(cp_bina)
            # split the main body into attribute columns
            mb_grays, mb_binas = mainbody_segment(mb_gray, mb_bina, scale,
                                                  cp_index_list)

            # ---- core indicator area: text row segmentation ----
            max_col = 0  # largest column index seen in this block
            max_row = 0  # largest row index seen in this block
            mb_dic = {}  # file records of this block
            for j, bina_j in enumerate(mb_binas):
                if j > max_col:
                    max_col = j
                gray_j = mb_grays[j]
                # row segmentation; results are stored in the dict
                mb_dic, row = mainbody_textline_segment(
                    gray_j, bina_j, scale, i, j, mb_dic)
                if row > max_row:
                    max_row = row
            max_col += 1
            max_row += 1
            # NOTE(review): the original guard here was `mb_dic is not {}`,
            # an identity test that is always true, so these statements
            # always ran.  They are kept unconditional to preserve behaviour;
            # use `if mb_dic:` if empty blocks should really be skipped.
            # first structuring pass: adjust blank cells
            mb_dics = modify_fname_dictionary(mb_dic, mb_dics)
            mb_block.append([i, max_col, max_row])
        # structuring post-process: merge the blocks
        res_mb_dics = modify_mainbody_display(mb_dics, mb_block)
        # remove the block markers
        res_mb_dics = add_flag(res_mb_dics)

    # save the pieces described by the dict and report success
    save_img_from_dic(save_path, res_mb_dics)
    return 1
def mainbody_textline_segment(gray, bina, scale, black_id, col_id, dictionary):
    '''
    Cut one attribute column of the "core indicator area" into text rows.

    Note that ``bina`` and ``scale`` are recomputed from ``gray`` inside the
    function, so the values passed in are not used.

    :param gray: grayscale image of the attribute column
    :param bina: binary image of the attribute column (recomputed)
    :param scale: character scale (re-estimated)
    :param black_id: id of the block the column belongs to
    :param col_id: id of the column within the block
    :param dictionary: file record dict; one entry per row is added under
        the key ``'<black_id>.<col_id>.<row_id>.png'``
    :return: the updated dict and the index of the last stored row
        (0 when no row was stored)
    '''
    # suppress several kinds of interference
    bina = 1 * (gray < 0.5)
    bina = ocrolib.remove_noise(bina, 5)  # remove some of the noise
    scale = psegutils.estimate_scale(bina)
    height, width = gray.shape
    # remove bright blobs along the horizontal edges
    lines = morph.select_regions(bina, sl.dim0, min=2 * scale)
    bina = bina - lines
    # remove bright blobs along the vertical edges
    lines = morph.select_regions(bina, sl.dim1, min=2 * scale)
    bina = bina - lines

    # merge characters
    textlines = filters.maximum_filter(bina, (0, scale))
    textlines = morph.rb_erosion(textlines, (3, 0))
    textlines = morph.rb_dilation(textlines, (0, scale))

    # find the rows that contain text
    textpixe_num = np.sum(textlines, axis=1)
    textpixe_num = 1 * ((1.0 * textpixe_num / scale) > 1)
    textpixe_num = list(textpixe_num)
    text_index = [i for i, a in enumerate(textpixe_num) if a == 1]

    # Group consecutive row indices into [first, last] runs.  The previous
    # loop never advanced the end of the final run (and skipped the last
    # index), so the last text row collapsed to zero height and was filtered
    # out below.
    indexs = []
    max_row = 0
    if len(text_index) > 0:
        beg_index = text_index[0]
        for prev_row, cur_row in zip(text_index, text_index[1:]):
            if cur_row - prev_row != 1:
                indexs.append([beg_index, prev_row])
                beg_index = cur_row
        indexs.append([beg_index, text_index[-1]])

    # keep only runs that are tall enough to be a text row
    results_indexs = [
        index for index in indexs if index[1] - index[0] >= scale / 4
    ]

    for row_id, index in enumerate(results_indexs):
        key = '%d.%d.%d.png' % (black_id, col_id, row_id)
        # crop with a 5-pixel margin above and below, clamped to the image
        data = 255 * gray[max(0, index[0] - 5):min(height, index[1] + 5), :]
        value = name_dic(index, data)
        dictionary[key] = value
        max_row = row_id
    return dictionary, max_row
def mainbody_segment(gray, binary, scale, index_list):
    '''
    Split the "core indicator area" into columns along blank regions.

    NOTE(review): the source indentation of this function was lost; the
    nesting below is reconstructed from the statement order and should be
    checked against the original file, especially inside the ``while`` loop.

    :param gray: grayscale image, cut with the same column ranges as binary
    :param binary: the "core indicator area" to analyse
    :param scale: character width, float
    :param index_list: horizontal positions of the caption texts, list;
        modified in place when it has 4 or 5 entries
    :return: list of grayscale column images and list of binary column images
    '''

    # Used when a caption bar exists: find the vertical blank separator
    def search_sep_index1(bina, th, n=2):
        '''
        :param bina: image to examine
        :param th: upper bound on the white-pixel sum of the chosen column
        :param n: number of adjacent columns summed per position
        :return: position where n adjacent columns have the smallest sum; if
                 several positions share the minimum, a position between the
                 first and the last of them.  None when the chosen column is
                 not below th or no position was found.
        '''
        height, width = bina.shape
        beg_index = []
        end_index = []
        min_sum = n * height
        all_sum = np.sum(bina, axis=0)
        for i in range(0, width - 1):  # advance one column at a time
            num_sum = sum(all_sum[i:i + n])
            if num_sum < min_sum:
                min_sum = num_sum
                beg_index = [i, i + 1]
                end_index = [i, i + 1]
            elif num_sum == min_sum:
                end_index = [i, i + 1]
        if len(beg_index) > 0 and len(end_index) > 0:
            # NOTE(review): "/" yields a float on Python 3, which cannot be
            # used as an array index below -- confirm the target version.
            res_index = (beg_index[1] + end_index[0]) / 2
            if np.sum(bina[:, res_index]) < th:  # only accept a nearly blank column
                return res_index
            else:
                return None

    # Used when no caption bar exists: find the vertical white gaps
    def search_sep_index2(bina, scale):
        '''
        :param bina: the "core indicator" image to examine, array
        :param scale: character width, float
        :return: list with the middle position of every gap between two
                 neighbouring text ranges
        '''
        indexs = np.sum(bina, axis=0)
        indexs = list(1.0 * indexs / scale)
        # noise suppression: ignore columns with too few pixels
        text_index_temp = [i for i, index in enumerate(indexs)
                           if index > 1]  # candidate text positions
        text_index_acct = []  # accepted text ranges
        if len(text_index_temp) > 0:
            beg_index = text_index_temp[0]
            end_index = text_index_temp[0]
            for i in range(1, len(text_index_temp)):
                end_index = text_index_temp[i]
                if text_index_temp[i] - text_index_temp[
                        i - 1] > 4:  # a gap above the threshold starts a new text range
                    end_index = text_index_temp[i - 1]
                    if end_index - beg_index > scale:  # wider than one character: real text
                        text_index_acct.append([beg_index, end_index])
                    # NOTE(review): nesting of the next line is reconstructed.
                    beg_index = text_index_temp[i]
            text_index_acct.append([beg_index, end_index])
        text_index_acct.sort(key=lambda x: x[0])
        res_index = []
        for i in range(len(text_index_acct) - 1):
            index = (text_index_acct[i + 1][0] + text_index_acct[i][1]) / 2
            res_index.append(index)
        return res_index

    # import pdb
    # pdb.set_trace()
    # remove interfering parts near the borders
    bina = ocrolib.remove_noise(binary, 8)
    lines = morph.select_regions(bina, sl.dim1, min=2 * scale)
    bina = bina - lines
    lines = morph.select_regions(bina, sl.dim0, min=2 * scale)
    bina = bina - lines

    # a caption bar exists (4 or 5 caption positions)
    if 6 > len(index_list) > 3:
        colsep_index = []
        # dilation: white regions grow
        bina_d = filters.maximum_filter(bina, (scale, scale))
        i = 0
        while len(index_list):
            # Take the centres of two consecutive caption positions as the
            # search window.  Example: for neighbouring caption ranges
            # [x00,x01] and [x10,x11], the separator is searched in
            # [(x00+x01)/2, (x10+x11)/2] of the core indicator area.
            # TODO: known bug -- how to split when the caption bar has only
            # one attribute.
            sep_index = None
            while sep_index is None and i < len(index_list) - 1:
                index = [(index_list[i][0] + index_list[i][1]) / 2,
                         (index_list[i + 1][0] + index_list[i + 1][1]) / 2]
                bina_i = bina_d[:, index[0]:index[1]]
                sep_index = search_sep_index1(bina_i, 10 * scale)  # separator position, or None
                if sep_index is None:  # the initial caption split failed: merge the two ranges
                    index_re1 = index_list[i]
                    index_re2 = index_list[i + 1]
                    index_new = [index_re1[0], index_re2[1]]
                    index_list.remove(index_re1)
                    index_list.remove(index_re2)
                    index_list.insert(i, index_new)
                    if len(colsep_index) > 0:
                        b = colsep_index.pop()
                    if i > 0:
                        i = i - 1
            if sep_index is not None:
                # convert the window-relative position to an absolute one
                sep_index = sep_index + index[0]
                colsep_index.append(sep_index)
                if i > 0:
                    i = i - 1
                index_list.remove(index_list[i])
            else:
                index_list.remove(index_list[0])
            i += 1
    # no caption bar
    else:
        bina_d = filters.maximum_filter(bina, (scale, scale / 2))  # change to 2*scale?
        colsep_index = search_sep_index2(bina_d, scale)
    # add the left and right image borders as outer separators
    colsep_index.append(0)
    colsep_index.append(bina.shape[1])
    colsep_index.sort(key=lambda x: x)

    # build the final list of column ranges
    res_index = []
    for i in range(len(colsep_index) - 1):
        beg_index = colsep_index[i]
        end_index = colsep_index[i + 1]
        res_index.append([beg_index, end_index])
    bina_lists = []
    gray_lists = []
    for index in res_index:
        gray_i = gray[:, index[0]:index[1]]
        bina_i = binary[:, index[0]:index[1]]
        gray_lists.append(gray_i)
        bina_lists.append(bina_i)
        # plt.imshow(bina_i, 'gray'), plt.show()
    return gray_lists, bina_lists
def process(job):
    """Segment one page image into text lines and write them to disk.

    The output directory is the base path returned by
    ``ocrolib.allsplitext`` for the image; the page segmentation goes to
    ``<base>.pseg.png`` and every line to
    ``<base>/<image name>_01xxxx.bin.png``.  The module-level ``base`` is
    updated as a side effect.

    :param job: ``(imagepath, index)`` pair.
    :return: the output directory, or ``None`` when the page is skipped
        (unreadable image, failed page check, bad scale, too many lines).
    """
    global base
    imagepath, i = job
    base, _ = ocrolib.allsplitext(imagepath)
    outputdir = base
    imagename_base = os.path.basename(os.path.normpath(base))

    def announce(message):
        # progress messages are suppressed in quiet mode
        if not args['quiet']:
            print_info(message)

    try:
        binary = ocrolib.read_image_binary(imagepath)
    except IOError:
        if ocrolib.trace:
            traceback.print_exc()
        print_error("cannot open either %s.bin.png or %s" % (base, imagepath))
        return

    checktype(binary, ABINARY2)

    if not args['nocheck']:
        check = check_page(amax(binary) - binary)
        if check is not None:
            print_error("%s SKIPPED %s (use -n to disable this check)" %
                        (imagepath, check))
            return

    binary = 1 - binary  # invert

    # a configured scale of 0 means "estimate it from the page"
    scale = (psegutils.estimate_scale(binary)
             if args['scale'] == 0 else args['scale'])
    print_info("scale %f" % (scale))
    if isnan(scale) or scale > 1000.0:
        print_error("%s: bad scale (%g); skipping\n" % (imagepath, scale))
        return
    if scale < args['minscale']:
        print_error("%s: scale (%g) less than --minscale; skipping\n" %
                    (imagepath, scale))
        return

    # find columns and text lines
    announce("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    line_count = amax(segmentation)
    if line_count > args['maxlines']:
        print_error("%s: too many lines %g" % (imagepath, line_count))
        return
    announce("number of lines %g" % line_count)

    # compute the reading order
    announce("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    lsort = psegutils.topsort(
        psegutils.reading_order([l.bounds for l in lines]))

    # renumber the labels so that they conform to the specs
    renumber = zeros(line_count + 1, 'i')
    for rank, v in enumerate(lsort, 1):
        renumber[lines[v].label] = 0x010000 + rank
    segmentation = renumber[segmentation]

    # finally, output everything
    announce("writing lines")
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
    lines = [lines[v] for v in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, args['noise'])
    for i, l in enumerate(lines):
        binline = psegutils.extract_masked(1 - cleaned, l,
                                           pad=args['pad'],
                                           expand=args['expand'])
        line_path = "%s/%s_01%04x.bin.png" % (outputdir, imagename_base,
                                              i + 1)
        ocrolib.write_image_binary(line_path, binline)
    # ``i`` is the last line index, or the job index when no line was found
    print_info("%6d %s %4.1f %d" % (i, imagepath, scale, len(lines)))
    return outputdir
def process1(job):
    """Segment one page into text lines and write them to disk.

    The binary page is read from ``<base>.bin.png`` or, failing that, from
    the given file.  The page segmentation goes to ``<base>.pseg.png`` and
    every line to ``<base>/01xxxx.bin.png`` (plus ``01xxxx.nrm.png`` when
    ``args.gray`` is set).  The module-level ``base`` is updated as a side
    effect.

    The page is skipped (the function returns ``None`` early) when the image
    cannot be read, the page check fails, the grayscale image is requested
    but missing, the scale is out of range, or there are too many lines.

    :param job: ``(fname, index)`` pair.
    """
    fname, i = job
    global base
    base, _ = ocrolib.allsplitext(fname)
    outputdir = base

    try:
        binary = ocrolib.read_image_binary(base + ".bin.png")
    except IOError:
        try:
            binary = ocrolib.read_image_binary(fname)
        except IOError:
            if ocrolib.trace:
                traceback.print_exc()
            print("cannot open either", base + ".bin.png", "or", fname)
            return

    checktype(binary, ABINARY2)

    if not args.nocheck:
        check = check_page(amax(binary) - binary)
        if check is not None:
            print(fname, "SKIPPED", check, "(use -n to disable this check)")
            return

    if args.gray:
        if os.path.exists(base + ".nrm.png"):
            gray = ocrolib.read_image_gray(base + ".nrm.png")
            checktype(gray, GRAYSCALE)
        else:
            # Without this branch ``gray`` stayed unbound and the line
            # writing loop below failed with NameError.
            sys.stderr.write(
                "Grayscale version %s.nrm.png not found. Use ocropus-nlbin "
                "for creating normalized grayscale version of the pages as "
                "well.\n" % base)
            return

    binary = 1 - binary  # invert

    # a configured scale of 0 means "estimate it from the page"
    if args.scale == 0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = args.scale
    print("scale", scale)
    if isnan(scale) or scale > 1000.0:
        sys.stderr.write("%s: bad scale (%g); skipping\n" % (fname, scale))
        return
    if scale < args.minscale:
        sys.stderr.write("%s: scale (%g) less than --minscale; skipping\n" %
                         (fname, scale))
        return

    # find columns and text lines
    if not args.quiet:
        print("computing segmentation")
    segmentation = compute_segmentation(binary, scale)
    if amax(segmentation) > args.maxlines:
        print(fname, ": too many lines", amax(segmentation))
        return
    if not args.quiet:
        print("number of lines", amax(segmentation))

    # compute the reading order
    if not args.quiet:
        print("finding reading order")
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)

    # renumber the labels so that they conform to the specs
    nlabels = amax(segmentation) + 1
    renumber = zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    # finally, output everything
    if not args.quiet:
        print("writing lines")
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
    lines = [lines[i] for i in lsort]
    ocrolib.write_page_segmentation("%s.pseg.png" % outputdir, segmentation)
    cleaned = ocrolib.remove_noise(binary, args.noise)
    for i, l in enumerate(lines):
        binline = psegutils.extract_masked(1 - cleaned, l, pad=args.pad,
                                           expand=args.expand)
        ocrolib.write_image_binary("%s/01%04x.bin.png" % (outputdir, i + 1),
                                   binline)
        if args.gray:
            grayline = psegutils.extract_masked(gray, l, pad=args.pad,
                                                expand=args.expand)
            ocrolib.write_image_gray(
                "%s/01%04x.nrm.png" % (outputdir, i + 1), grayline)
    # ``i`` is the last line index, or the job index when no line was found
    print("%6d" % i, fname, "%4.1f" % scale, len(lines))
def _process_segment(self, page_image, page, region_xywh, page_id,
                     input_file, n):
    """Detect text lines in a page image and add them to a text region.

    The lines are added to the last text region of ``page``; when the page
    has none, a region covering the whole image is created first.  For every
    line a cropped image is saved through the workspace and a
    ``TextLineType`` with an ``AlternativeImageType`` is appended.

    Returns early (``None``) when the scale is out of range or when more
    labels than ``maxlines`` are found.  ``region_xywh['features']`` gets
    ``",textline"`` appended as a side effect.

    :param page_image: PIL image (converted with ``ocrolib.pil2array``).
    :param page: page object providing ``get_TextRegion``/``add_TextRegion``.
    :param region_xywh: dict with a ``'features'`` string.
    :param page_id: page identifier, used for logging and saved images.
    :param input_file: input file object; its ``ID`` seeds the image file id.
    :param n: index used for the file id when ``ID`` does not contain the
        input file group.
    """
    binary = ocrolib.pil2array(page_image)
    # normalize by the maximum and invert
    binary = np.array(1 - binary / np.amax(binary), 'B')

    if page.get_TextRegion() is None or len(page.get_TextRegion()) < 1:
        # no region yet: create one spanning the whole image
        min_x, max_x = (0, binary.shape[0])
        min_y, max_y = (0, binary.shape[1])
        textregion = TextRegionType(
            Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                              (min_x, min_y, max_x, min_y, max_x, max_y,
                               min_x, max_y)))
        page.add_TextRegion(textregion)
    else:
        textregion = page.get_TextRegion()[-1]

    # NOTE(review): looks like a debugging leftover (fixed file name in the
    # current directory, overwritten again per line below) -- confirm before
    # removing.
    ocrolib.write_image_binary("test.bin.png", binary)

    # a configured scale of 0 means "estimate it from the image"
    if self.parameter['scale'] == 0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = self.parameter['scale']

    if (np.isnan(scale) or scale > 1000.0
            or scale < self.parameter['minscale']):
        # The previous message referenced the undefined name ``fname``.
        LOG.warning("%s: bad scale (%g); skipping\n", page_id, scale)
        return

    segmentation = self.compute_segmentation(binary, scale)
    if np.amax(segmentation) > self.parameter['maxlines']:
        # The arguments are passed separately; the previous call handed one
        # tuple to a two-placeholder format and also used ``fname``.
        LOG.warning("%s: too many lines %i", page_id, np.amax(segmentation))
        return
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)

    # renumber the labels so that they conform to the specs
    nlabels = np.amax(segmentation) + 1
    renumber = np.zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    lines = [lines[i] for i in lsort]
    # NOTE(review): ``cleaned`` is computed but the line images below are cut
    # from ``binary`` -- confirm which one is intended.
    cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])
    region_xywh['features'] += ",textline"

    # The file id does not depend on the line, so derive it once.
    file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
    if file_id == input_file.ID:
        file_id = concat_padded(self.image_grp, n)

    for i, l in enumerate(lines):
        ocrolib.write_image_binary("test.bin.png",
                                   binary[l.bounds[0], l.bounds[1]])

        min_x, max_x = (l.bounds[0].start, l.bounds[0].stop)
        min_y, max_y = (l.bounds[1].start, l.bounds[1].stop)

        # crop the line and threshold it at its mid-range value
        img = binary[l.bounds[0], l.bounds[1]]
        img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
        img = ocrolib.array2pil(img)

        file_path = self.workspace.save_image_file(img,
                                                   file_id + "_" + str(i),
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        ai = AlternativeImageType(filename=file_path,
                                  comments=region_xywh['features'])
        line = TextLineType(
            Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                              (min_x, min_y, max_x, min_y, max_x, max_y,
                               min_x, max_y)))
        line.add_AlternativeImage(ai)
        textregion.add_TextLine(line)