def get_content(api, img_path: str, image_crop_pram: Tuple[int, int, int, int], tracker) -> \
        List[Tuple[str, float]]:
    """Run OCR on an image file and return its text lines with confidences.

    The image is converted to grayscale, binarized with a fixed threshold of
    180, optionally cropped, and then handed to ``api`` for recognition.

    Args:
        api: Object exposing ``SetImage``, ``Recognize`` and ``GetIterator``
            (presumably an initialized tesserocr ``PyTessBaseAPI`` -- confirm
            against the caller).
        img_path: Path of the image to read.
        image_crop_pram: Box passed to ``Image.crop``; ``None`` skips cropping.
        tracker: Object whose ``log`` method is called for every line that
            yields no text.

    Returns:
        One ``(text, confidence)`` tuple per recognized text line, with all
        spaces and newlines removed from the text.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the converted copy exists; ``convert`` returns a new, fully loaded image.
    with Image.open(img_path) as source:
        # Gray scale first, then binarize: below 180 -> black, otherwise white.
        img = source.convert("L").point(lambda x: 0 if x < 180 else 255, "1")
    if image_crop_pram is not None:
        img = img.crop(image_crop_pram)

    # Let tesseract recognize the text.
    api.SetImage(img)
    api.Recognize()

    page_text = []
    level = RIL.TEXTLINE
    for r in iterate_level(api.GetIterator(), level):
        try:
            line = r.GetUTF8Text(level)
            conf = r.Confidence(level)
        except RuntimeError as e:
            tracker.log("No text Returned on this line.", tp=TRACKER_LOG_ERROR, exc_info=e)
            continue
        # Strip spaces and newlines before storing the line.
        page_text.append((line.replace(" ", "").replace("\n", ""), conf))
    return page_text
def iterate_lines(img):
    """Recognize an image file and return its text lines with bounding boxes.

    Args:
        img: Path of the image file handed to ``SetImageFile``.

    Returns:
        A dict of the form ``{"status": "Succeeded", "recognitionResult":
        {"fullTetx": <full text>, "lines": [...]}}`` where every entry of
        ``lines`` is ``{"text": ..., "boundingBox": ...}``. ``lines`` is empty
        when no text was recognized.
    """
    with PyTessBaseAPI(psm=PSM.AUTO, oem=OEM.LSTM_ONLY) as api:
        api.SetImageFile(img)
        text = api.GetUTF8Text()
        # NOTE(review): "fullTetx" looks like a typo for "fullText"; the key is
        # kept unchanged because consumers of the result may already read it.
        data = {"status": "Succeeded", "recognitionResult": {"fullTetx": text, "lines": []}}
        if text == '':
            # Nothing was recognized, so there are no lines to iterate over.
            return data

        api.Recognize()
        level = RIL.TEXTLINE
        lines = data['recognitionResult']['lines']
        for r in iterate_level(api.GetIterator(), level):
            line = r.GetUTF8Text(level)
            if line.isspace():
                continue
            # Drop newlines and collapse runs of spaces into single spaces.
            line_post = ' '.join(part for part in line.replace('\n', '').split(' ') if part != '')
            bbox = r.BoundingBox(level)
            lines.append({'text': line_post, 'boundingBox': convert_boundingBox(bbox)})
    return data
def tsOcrText(self, tpl, text_features, x1, y1, x2, y2, lang='chi_sim', psm=7, oem=1):
    """Find text lines inside an image region that contain any given feature.

    The region ``tpl[y1:y2, x1:x2]`` is converted to grayscale, binarized with
    a Gaussian adaptive threshold and recognized line by line.

    Args:
        tpl: Image array indexable as ``tpl[y1:y2, x1:x2]`` and accepted by
            ``cv2.cvtColor`` with ``COLOR_RGB2GRAY``.
        text_features: Iterable of substrings to look for in each text line.
        x1, y1, x2, y2: Corners of the region to crop from ``tpl``.
        lang: Language passed to ``PyTessBaseAPI``.
        psm: Page segmentation mode passed to ``PyTessBaseAPI``.
        oem: OCR engine mode passed to ``PyTessBaseAPI``.

    Returns:
        A list of ``(line_text, x, y)`` tuples, one per (line, feature) match,
        where ``x``/``y`` are the line's left/top shifted by ``x1``/``y1``.
    """
    region = tpl[y1:y2, x1:x2]
    gray = cv2.cvtColor(region, cv2.COLOR_RGB2GRAY)
    # Gaussian adaptive thresholding gave the best recognition results in testing.
    binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    # Convert the numpy array into a PIL image.
    img = Image.fromarray(binary)

    recognized = []
    # Honour the lang/psm/oem arguments; their defaults equal the values that
    # used to be hard-coded here.
    with PyTessBaseAPI(lang=lang, psm=psm, oem=oem) as api:
        level = RIL.TEXTLINE  # work at text-line granularity
        api.SetImage(img)
        api.Recognize()
        for r in iterate_level(api.GetIterator(), level):
            try:
                line_text = r.GetUTF8Text(level)
                left, top = r.BoundingBox(level)[:2]
            except Exception:
                print("没有字符")
                continue
            recognized.append((line_text, left, top))

    xz = []
    for line_text, left, top in recognized:
        for feature in text_features:
            if feature in line_text:
                # Shift region-relative coordinates by the crop origin.
                xz.append((line_text, left + x1, top + y1))
    return xz
def extract_data_from_image(filename):
    """Recognize an image file and tabulate its text lines.

    Args:
        filename: Path of the image handed to ``SetImageFile``.

    Returns:
        A ``pandas.DataFrame`` with one row per text line and the columns
        ``trueline`` (False when the line's top is within 15 units of the
        previous line's top), ``lineitem`` (recognized text), ``bbox`` (value
        of ``BoundingBoxInternal``) and ``coldist`` (the most recent distance
        computed between a line and its predecessor, or None before the first
        such pair).
    """
    print(filename)
    bbox_prev = None
    coldist = None
    # Rows are gathered in a list and turned into a frame once:
    # ``DataFrame.append`` copies the frame on every call and no longer exists
    # in pandas 2.0.
    rows = []
    with PyTessBaseAPI(path="C:\\Program Files (x86)\\Tesseract-OCR\\tessdata",
                       psm=PSM.SPARSE_TEXT_OSD) as api:
        api.SetImageFile(filename)
        api.Recognize()
        level = RIL.TEXTLINE
        for r in iterate_level(api.GetIterator(), level):
            pdf_line = r.GetUTF8Text(level)
            trueline = True
            bbox = r.BoundingBoxInternal(level)
            if bbox_prev is not None and abs(bbox[1] - bbox_prev[1]) <= 15:
                # Same visual row as the previous item: not a new line.
                trueline = False
                # NOTE(review): coldist is not reset for later rows, so it
                # keeps the last computed value -- confirm this is intended.
                coldist = dist(np.array([bbox[0], bbox[1]]),
                               np.array([bbox_prev[0], bbox_prev[1]]))
            bbox_prev = bbox
            rows.append({
                "trueline": trueline,
                "lineitem": pdf_line,
                "bbox": bbox,
                "coldist": coldist,
            })
    # dtype=object keeps the Python values (bools, tuples, None) unconverted.
    return pd.DataFrame(rows, columns=['trueline', 'lineitem', 'bbox', 'coldist'], dtype=object)
def extract_text(self, image):
    """
    Detect the words in an image given as a numpy array.

    Returns a list with one tuple per detected word, in the format
    (word, x1, y1, x2, y2, confidence). Only non-empty words whose confidence
    is greater than MIN_CONFIDENCE are included.
    """
    # Tesseract is fed a Pillow image, so convert the array first.
    source = Image.fromarray(image)
    detections = []

    # SPARSE_TEXT is used to pick out single words rather than lines of text.
    with PyTessBaseAPI(psm=PSM.SPARSE_TEXT) as api:
        api.SetVariable("save_blob_choices", "T")
        api.SetImage(source)
        api.Recognize()

        for cursor in iterate_level(api.GetIterator(), RIL.WORD):
            text = cursor.GetUTF8Text(RIL.WORD)
            confidence = cursor.Confidence(RIL.WORD)
            left, top, right, bottom = cursor.BoundingBox(RIL.WORD)
            if text and confidence > MIN_CONFIDENCE:
                detections.append((text, left, top, right, bottom, confidence))

    return detections
def perform_ocr(self, x_offset=0, y_offset=0, pad_offset=None):
    """Recognize the image currently loaded in ``self.api`` word by word.

    Args:
        x_offset: Value added to both x coordinates of every word box.
        y_offset: Value added to both y coordinates of every word box.
        pad_offset: Optional ``(dx, dy)`` pair added on top of the offsets.

    Returns:
        A list of ``[x0, y0, x1, y1, font_info, word]`` entries sorted by
        ``(y0, x0)``. Coordinates are the bounding-box values scaled by
        ``72 / 300`` (presumably 300-dpi pixels to points -- confirm) before
        the offsets are applied. Words that are blank after stripping, or
        whose processing raises, are left out.
    """
    dpi = 300
    engine = self.api
    engine.Recognize()
    collected = []
    for cursor in iterate_level(engine.GetIterator(), RIL.WORD):
        try:
            raw_text = cursor.GetUTF8Text(RIL.WORD)
            font_info = cursor.WordFontAttributes()
            x0, y0, x1, y1 = (float(v) * 72 / dpi for v in cursor.BoundingBox(RIL.WORD))
            x0 += x_offset
            x1 += x_offset
            y0 += y_offset
            y1 += y_offset
            if pad_offset is not None:
                x0 += pad_offset[0]
                y0 += pad_offset[1]
                x1 += pad_offset[0]
                y1 += pad_offset[1]
            text = raw_text.strip()
            if text:
                collected.append([x0, y0, x1, y1, font_info, text])
        except Exception:
            # Best effort: a word that cannot be read is silently skipped.
            continue
    collected.sort(key=lambda entry: (entry[1], entry[0]))
    return collected
def classifier_choices():
    """Print every symbol of a fixed test region with its classifier choices.

    Uses the module-level ``api`` object: loads a hard-coded test image,
    restricts recognition to a fixed rectangle and, for each non-empty symbol,
    prints the symbol with its confidence followed by each alternative choice
    and that choice's confidence.
    """
    api.SetImageFile('/home/johannes/Repos/tesseract/testing/phototest.tif')
    api.SetVariable("save_blob_choices", "T")
    api.SetRectangle(37, 228, 548, 31)
    api.Recognize()

    ri = api.GetIterator()
    level = RIL.SYMBOL
    for r in iterate_level(ri, level):
        symbol = r.GetUTF8Text(level)  # r == ri
        conf = r.Confidence(level)
        r.SetLineSeparator('\a')
        if symbol:
            print(u'symbol {}, conf: {}'.format(symbol, conf))
            indent = False
            ci = r.GetChoiceIterator()
            # Iterating ``ci`` already advances it; an extra ``ci.Next()`` in
            # the loop body would skip every other choice, so none is made.
            for c in ci:
                if indent:
                    print('\t\t ')
                print('\t- ')
                choice = c.GetUTF8Text()  # c == ci
                print(u'{} conf: {}'.format(choice, c.Confidence()))
                indent = True
            print('---------------------------------------------')
def upload(request: ImageModel):
    """Recognize the text lines of a base64-encoded image.

    Args:
        request: Object whose ``base64`` attribute holds the encoded image.

    Returns:
        ``{"texts": [...]}`` with one dict per text line containing ``text``,
        ``left``, ``top``, ``width`` and ``height``; the last two are derived
        from the box returned by ``BoundingBoxInternal``.
    """
    image = Image.open(io.BytesIO(base64.b64decode(request.base64)))

    text_list = []
    with PyTessBaseAPI(oem=OEM.LSTM_ONLY) as api:
        api.SetImage(image)
        api.Recognize()
        # The component-image extraction and the variable that used to be set
        # after Recognize() never influenced the result, so they are omitted.
        level = RIL.TEXTLINE
        for r in iterate_level(api.GetIterator(), level):
            text = r.GetUTF8Text(level)
            left, top, right, bottom = r.BoundingBoxInternal(level)
            text_list.append({
                "text": text,
                "left": left,
                "top": top,
                "width": right - left,
                "height": bottom - top,
            })
    return {
        "texts": text_list,
    }
def __decode_words(self, iterator):
    """Decode words from ``iterator`` until the end of the current text line.

    For every word a ``Word`` is filled with its bounding box, confidence
    (divided by 100), text and symbols (decoded from the same iterator). A
    single ``Font`` built from the word's font attributes is assigned to each
    of its symbols.

    Returns:
        The list of decoded ``Word`` objects.
    """
    # (key in the font-attribute mapping, attribute name on Font)
    font_fields = (
        ('bold', 'bold'),
        ('italic', 'italic'),
        ('underlined', 'underline'),
        ('monospace', 'monospace'),
        ('serif', 'serif'),
        ('pointsize', 'pointsize'),
        ('font_id', 'id'),
    )
    decoded = []
    for cursor in iterate_level(iterator, RIL.WORD):
        attributes = cursor.WordFontAttributes()

        entry = Word()
        entry.bounding_box = BoundingBox.from_coordinates(
            *cursor.BoundingBox(RIL.WORD))
        entry.confidence = float(cursor.Confidence(RIL.WORD)) / 100.0
        entry.text = cursor.GetUTF8Text(RIL.WORD)
        # The symbol decoder consumes the same iterator, so it must run only
        # after the word-level values above have been read.
        entry.symbols = self.__decode_symbols(iterator)

        shared_font = Font()
        for source_key, target_attr in font_fields:
            setattr(shared_font, target_attr, attributes[source_key])
        for glyph in entry.symbols:
            glyph.font = shared_font

        decoded.append(entry)
        # Stop once the last word of the enclosing text line was handled.
        if iterator.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
            break
    return decoded
def Ocrtext(self, scx_rgb, x1, y1, x2, y2):
    """Run text-line OCR (``chi_sim``, psm 7, oem 1) on a prepared region.

    The image array comes from ``self.__Ocr(scx_rgb, x1, y1, x2, y2)``
    (presumably a crop/pre-processing step -- confirm against that helper).

    Returns:
        A list of dicts with the keys ``"text"``, ``"left"``, ``"top"`` and
        ``"weight"``.
        NOTE(review): ``"weight"`` holds the fourth bounding-box value; the
        dict literal used to list this key twice, so the third value was
        always overwritten. The key looks like a typo for width/height --
        confirm with the consumers before renaming.
    """
    region = self.__Ocr(scx_rgb, x1, y1, x2, y2)
    results = []
    with PyTessBaseAPI(lang='chi_sim', psm=7, oem=1) as api:
        line_level = RIL.TEXTLINE  # work at text-line granularity
        api.SetImage(Image.fromarray(region))
        api.Recognize()
        for cursor in iterate_level(api.GetIterator(), line_level):
            try:
                text = cursor.GetUTF8Text(line_level)
                cursor.Confidence(line_level)  # confidence value, currently unused
                box = cursor.BoundingBox(line_level)  # box coordinates
                results.append({
                    "text": text,
                    "left": box[0],
                    "top": box[1],
                    "weight": box[3],
                })
            except Exception:
                print("没有字符")
    return results
def get_region(self, xml_box, padding):
    """Collect the words of the current recognition result of ``self.api``.

    No recognition is triggered here; the iterator is taken from ``self.api``
    as is (presumably after an earlier ``Recognize`` call -- confirm).

    Args:
        xml_box: Sequence whose first two items are added to the x and y
            coordinates of every word box.
        padding: Amount whose integer half is used to shrink each box on all
            four sides before scaling.

    Returns:
        A list of ``[x0, y0, x1, y1, word]`` entries. Each coordinate is
        shrunk by ``padding // 2``, scaled by ``72 / 300`` and then shifted by
        ``xml_box``. Blank words and words whose processing raises are
        skipped.
    """
    engine = self.api
    cursor_source = engine.GetIterator()
    found = []
    for cursor in iterate_level(cursor_source, RIL.WORD):
        try:
            raw_text = cursor.GetUTF8Text(RIL.WORD)
            half_pad = padding // 2
            left, top, right, bottom = (float(v) for v in cursor.BoundingBox(RIL.WORD))
            left = (left + half_pad) * 72 / 300 + xml_box[0]
            top = (top + half_pad) * 72 / 300 + xml_box[1]
            right = (right - half_pad) * 72 / 300 + xml_box[0]
            bottom = (bottom - half_pad) * 72 / 300 + xml_box[1]
            text = raw_text.strip()
            if text:
                found.append([left, top, right, bottom, text])
        except Exception:
            # Best effort: a word that cannot be processed is skipped.
            continue
    return found
def iterate_words(img):
    """Recognize an image file and return its words with boxes and confidences.

    Args:
        img: Path of the image file handed to ``SetImageFile``.

    Returns:
        An empty dict when no text is recognized; otherwise
        ``{'text': <full text>, 'words': [...]}`` where each entry of
        ``words`` is ``{'text', 'boundingBox', 'confidence'}``.
        Whitespace-only words are skipped.
    """
    with PyTessBaseAPI(psm=PSM.AUTO, oem=OEM.LSTM_ONLY) as api:
        api.SetImageFile(img)
        text = api.GetUTF8Text()
        if text == '':
            # Nothing recognized: there is no word list to fill.
            return {}

        data = {'text': text, 'words': []}
        api.Recognize()
        level = RIL.WORD
        for r in iterate_level(api.GetIterator(), level):
            word = r.GetUTF8Text(level)
            if word.isspace():
                continue
            conf = r.Confidence(level)
            word_bbox = r.BoundingBox(level)
            data['words'].append({
                'text': word,
                'boundingBox': convert_boundingBox(word_bbox),
                'confidence': conf,
            })
    return data
def image_to_string(img, lang):
    """Recognize ``img`` and return its words, joining hyphenated splits.

    Recognition is limited to the characters in ``CHARSET`` plus space and
    newline. When the previously collected word consists of letters followed
    by a trailing dash, the dash is dropped and the current word is appended
    to it instead of being stored separately.

    Args:
        img: Image passed to ``SetImage``.
        lang: Language passed to ``PyTessBaseAPI``.

    Returns:
        A ``filter`` iterator over the non-empty collected words.
    """
    word_level = tesserocr.RIL.WORD
    collected = []
    with tesserocr.PyTessBaseAPI(lang=lang, psm=3) as api:
        api.SetVariable("tessedit_char_whitelist", " \n" + CHARSET)
        api.SetImage(img)
        api.Recognize()
        for cursor in tesserocr.iterate_level(api.GetIterator(), word_level):
            try:
                token = cursor.GetUTF8Text(word_level)
            except RuntimeError:
                continue
            cursor.Confidence(word_level)  # confidence value, currently unused
            # A trailing dash on the previous word means it was split: merge.
            if collected and regex.match(r"[\p{Lu}\p{Ll}]+\-$", collected[-1]):
                collected[-1] = collected[-1][:-1] + token
            else:
                collected.append(token)
    return filter(bool, collected)
def _line_height(self, polygon):
    """Estimate the line count and line height of the text inside ``polygon``.

    Results are cached in ``self._ocr`` under the polygon's centroid
    coordinates.

    Args:
        polygon: Geometry providing ``centroid.coords`` and ``bounds``.

    Returns:
        A tuple ``(n_lines, lh, text)``: the number of text lines found, the
        smallest text-line height (or the height of the polygon's bounding
        box, with ``n_lines`` set to 1, when no line is found) and, only when
        ``self._debug`` is set, the recognized text (otherwise ``""``).
    """
    key = tuple(polygon.centroid.coords[0])
    if key in self._ocr:
        return self._ocr[key]

    from .utils import polygons_to_mask
    mask = polygons_to_mask(self._unbinarized.shape, [polygon])

    minx, miny, maxx, maxy = polygon.bounds
    minx, miny = numpy.floor(numpy.array([minx, miny])).astype(numpy.int32)
    maxx, maxy = numpy.ceil(numpy.array([maxx, maxy])).astype(numpy.int32)

    # Work on a copy of the crop: a plain slice is a view, so whitening the
    # pixels outside the polygon would otherwise alter self._unbinarized
    # itself and leak into every later lookup.
    pixels = self._unbinarized[miny:maxy, minx:maxx].copy()
    mask = mask[miny:maxy, minx:maxx]
    pixels[numpy.logical_not(mask)] = 255

    text = ""
    with tesserocr.PyTessBaseAPI(
            psm=tesserocr.PSM.SINGLE_BLOCK) as api:
        api.SetImage(PIL.Image.fromarray(pixels, "L"))

        # The second item of every text-line entry is its box dict.
        heights = [data[1]["h"] for data in api.GetTextlines()]
        if heights:
            n_lines = len(heights)
            lh = numpy.min(heights)
        else:
            lh = maxy - miny
            n_lines = 1

        if self._debug:
            api.Recognize()
            level = tesserocr.RIL.TEXTLINE
            for r in tesserocr.iterate_level(api.GetIterator(), level):
                try:
                    text += r.GetUTF8Text(level) + " "
                except RuntimeError:
                    # Lines without text are skipped.
                    pass

    self._ocr[key] = (n_lines, lh, text)
    return self._ocr[key]
def processa(path='imagem.jpg'):
    """Recognize the Portuguese text of an image and pass it to the translator.

    The image is first handed to ``cropa`` and recognition then runs on
    ``tmp.png`` / ``tmp.jpeg`` / ``tmp.jpg``, chosen by the extension found in
    ``path`` (presumably the file written by ``cropa`` -- confirm). If loading
    that file raises ``RuntimeError``, the original ``path`` is used instead.
    The non-blank text lines are printed, written to ``textscanner.txt`` and
    ``translator.py`` is run on that file.

    Args:
        path: Path of the image to process.
    """
    # The locale is switched to 'C' while tesseract is in use and to
    # 'pt_BR.UTF-8' once the text has been collected.
    locale.setlocale(locale.LC_ALL, 'C')
    with PyTessBaseAPI(lang='por') as api:
        start_time = time.time()
        print('pre')
        cropa(path)
        print('pro')
        try:
            if '.png' in path:
                api.SetImageFile('tmp.png')
            elif '.jpeg' in path:
                api.SetImageFile('tmp.jpeg')
            else:
                api.SetImageFile('tmp.jpg')
        except RuntimeError:
            api.SetImageFile(path)
        api.SetVariable("save_blob_choices", "T")

        api.Recognize()
        level = RIL.TEXTLINE
        lines = []
        for r in iterate_level(api.GetIterator(), level):
            stripped = r.GetUTF8Text(level).strip()
            if stripped:
                lines.append(stripped)

        text = '\n'.join(lines)
        print(text)
        locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8')

        # ``text`` is always a string here, so it is written unconditionally;
        # the context manager closes the file even if the write fails.
        with open('textscanner.txt', 'w') as file:
            file.write(text)
        os.system('python2 translator.py textscanner.txt')
        print("Elapsed time: {}".format(time.time() - start_time))
def orcTitle(path):
    """Guess a document's title from the largest text on its first page.

    The first page is rendered to the module-level ``temp_file`` as JPEG and
    recognized line by line. Lines with more than one non-whitespace character
    are considered; those whose point size is greater than the largest size
    minus 15 contribute their words to the title.

    Args:
        path: Path handed to ``convert_from_path``.

    Returns:
        The selected words joined by single spaces (``''`` when none qualify).
    """
    # Render the first page to a JPEG file.
    page = convert_from_path(path, first_page=0, last_page=1)[0]
    page.save(temp_file, 'JPEG')

    candidates = []
    try:
        with PyTessBaseAPI() as api:
            api.SetImageFile(temp_file)
            api.Recognize()  # required before the iterator yields results
            level = RIL.TEXTLINE
            # One pass collects what both the size search and the title
            # selection need.
            for r in iterate_level(api.GetIterator(), level):
                text = r.GetUTF8Text(level)
                font_size = r.WordFontAttributes()['pointsize']
                # Ignore spaces/newlines/tabs when checking the minimum length.
                if len(''.join(text.split())) > 1:
                    candidates.append((text, font_size))
    finally:
        # Remove the temporary image even when recognition fails.
        os.remove(temp_file)

    max_size = max([0] + [size for _, size in candidates])
    title_list = []
    for text, size in candidates:
        if size > max_size - 15:
            title_list.extend(text.split())
    return ' '.join(title_list)
def get_text_from_box(fn, x, y, w, h):
    """
    Functionality:
        Recognize the word(s) inside a bounding box of an image file; assumes
        the box is good enough.
        For debugging purposes, every recognized word is printed with its
        confidence and its crop is saved as '<counter> debug.png'.
    Args:
        fn: path of the image file
        x: x coordinate of the upper left corner of the bounding box
        y: y coordinate of the upper left corner of the bounding box
        w: width of the bounding box
        h: height of the bounding box
    Returns:
        a list of word objects (but did not set id)
    """
    words = []
    # The context manager closes the image file once recognition is finished.
    with Image.open(fn) as image:
        pixels = np.array(image)
        with PyTessBaseAPI() as api:
            api.SetImage(image)
            api.SetVariable("save_blob_choices", "T")
            api.SetRectangle(x, y, w, h)
            api.Recognize()
            level = RIL.WORD
            counter = 0
            for r in iterate_level(api.GetIterator(), level):
                try:
                    symbol = r.GetUTF8Text(level)
                    conf = r.Confidence(level)
                    bbox = r.BoundingBox(level)
                    # Named ``found`` so that the width parameter ``w`` is not
                    # shadowed by the word object.
                    found = word.Word(None, None, None, None, None, None, None)
                    found.confidence = conf
                    found.text = symbol
                    found.x = bbox[0]
                    found.y = bbox[1]
                    found.width = bbox[2] - bbox[0]
                    found.height = bbox[3] - bbox[1]
                    words.append(found)
                    outim = Image.fromarray(pixels[bbox[1]:bbox[3], bbox[0]:bbox[2]])
                    # Debugging output only.
                    if symbol:
                        print(symbol + " " + str(conf))
                        outim.save(str(counter) + ' debug.png')
                        counter += 1
                except RuntimeError:
                    print('No text returned')
                    continue
    return words
def __decode_lines(self, iterator):
    """Decode text lines from ``iterator`` until the current paragraph ends.

    Each ``TextLine`` receives its bounding box and the words decoded from the
    same iterator.

    Returns:
        The list of decoded ``TextLine`` objects.
    """
    decoded = []
    for cursor in iterate_level(iterator, RIL.TEXTLINE):
        entry = TextLine()
        box = cursor.BoundingBox(RIL.TEXTLINE)
        entry.bounding_box = BoundingBox.from_coordinates(*box)
        # Word decoding advances the shared iterator through this line.
        entry.words = self.__decode_words(iterator)
        decoded.append(entry)
        # Stop once the last line of the enclosing paragraph was handled.
        if iterator.IsAtFinalElement(RIL.PARA, RIL.TEXTLINE):
            break
    return decoded
def __decode_paragraphs(self, iterator):
    """Decode paragraphs from ``iterator`` until the current block ends.

    Each ``Paragraph`` receives its bounding box and the lines decoded from
    the same iterator.

    Returns:
        The list of decoded ``Paragraph`` objects.
    """
    decoded = []
    for cursor in iterate_level(iterator, RIL.PARA):
        entry = Paragraph()
        box = cursor.BoundingBox(RIL.PARA)
        entry.bounding_box = BoundingBox.from_coordinates(*box)
        # Line decoding advances the shared iterator through this paragraph.
        entry.lines = self.__decode_lines(iterator)
        decoded.append(entry)
        # Stop once the last paragraph of the enclosing block was handled.
        if iterator.IsAtFinalElement(RIL.BLOCK, RIL.PARA):
            break
    return decoded
def Ocrtext(self, scx_rgb, x1, y1, x2, y2, ril=RIL.TEXTLINE, lang='chi_sim', psm=7, oem=1, attribute=None, THRESH_GAUSSIAN=False):
    """Run OCR on a screen region and return the recognized elements.

    Args:
        scx_rgb: Source image forwarded to ``self.__Ocr`` (not used when
            ``THRESH_GAUSSIAN`` is set).
        x1, y1, x2, y2: Corners of the region.
        ril: Iterator level at which results are collected.
        lang: Language passed to ``PyTessBaseAPI``.
        psm: Page segmentation mode passed to ``PyTessBaseAPI``.
        oem: OCR engine mode passed to ``PyTessBaseAPI``.
        attribute: Optional ``(name, value)`` pair passed to ``SetVariable``.
        THRESH_GAUSSIAN: When true, take a fresh capture via
            ``self.Print_screen()``, crop it, convert it to grayscale and
            binarize it with a Gaussian adaptive threshold instead of calling
            ``self.__Ocr``.

    Returns:
        A list of dicts with the keys ``"text"``, ``"left"``, ``"top"`` and
        ``"weight"``.
        NOTE(review): ``"weight"`` holds the fourth bounding-box value; the
        dict literal used to list this key twice, so the third value was
        always overwritten. The key looks like a typo for width/height --
        confirm with the consumers before renaming.
    """
    if not THRESH_GAUSSIAN:
        prepared = self.__Ocr(scx_rgb, x1, y1, x2, y2)
    else:
        capture = self.Print_screen()
        gray = cv2.cvtColor(capture[y1:y2, x1:x2], cv2.COLOR_RGB2GRAY)
        prepared = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
            11, 2)

    results = []
    with PyTessBaseAPI(lang=lang, psm=psm, oem=oem) as api:
        source = Image.fromarray(prepared)
        if attribute:
            api.SetVariable(attribute[0], attribute[1])
        api.SetImage(source)
        api.Recognize()
        for cursor in iterate_level(api.GetIterator(), ril):
            try:
                text = cursor.GetUTF8Text(ril)
                cursor.Confidence(ril)  # confidence value, currently unused
                box = cursor.BoundingBox(ril)  # box coordinates
                results.append({
                    "text": text,
                    "left": box[0],
                    "top": box[1],
                    "weight": box[3],
                })
            except Exception:
                print("没有字符")
    return results
def __decode_blocks(self, iterator, image):
    """Decode every block reachable from ``iterator``.

    A block whose text is blank after stripping is stored with the image
    returned by ``GetImage``; any other block gets its paragraphs decoded from
    the same iterator.

    Args:
        iterator: Result iterator positioned at the first block.
        image: Original image forwarded to ``GetImage``.

    Returns:
        The list of decoded ``Block`` objects.
    """
    decoded = []
    for cursor in iterate_level(iterator, RIL.BLOCK):
        entry = Block()
        box = cursor.BoundingBox(RIL.BLOCK)
        entry.bounding_box = BoundingBox.from_coordinates(*box)
        if cursor.GetUTF8Text(RIL.BLOCK).strip():
            # Paragraph decoding advances the shared iterator through the block.
            entry.paragraphs = self.__decode_paragraphs(iterator)
        else:
            entry.image = cursor.GetImage(RIL.BLOCK, 0, image)
        decoded.append(entry)
    return decoded
def get_font(image_path):
    """Print the font attributes of every symbol recognized in an image file.

    The iterator object and each symbol's attribute mapping are printed; for
    non-empty symbols an additional line shows the symbol and its
    ``font_name``.

    Args:
        image_path: Path of the image handed to ``SetImageFile``.
    """
    with PyTessBaseAPI() as api:
        api.SetImageFile(image_path)
        api.Recognize()
        cursor_source = api.GetIterator()
        print(cursor_source)
        for cursor in iterate_level(cursor_source, RIL.SYMBOL):
            text = cursor.GetUTF8Text(RIL.SYMBOL)
            attributes = cursor.WordFontAttributes()
            print(attributes)
            if not text:
                continue
            print(u'symbol {}, font: {}'.format(
                text, attributes['font_name']))
def reconhece(nomeArquivo):
    '''Recognize the characters of an image file.

    Recognition is restricted to digits and upper-case letters and to a
    rectangle that starts at the top-left corner, spans the full height and
    has the image width rounded down to a multiple of four.

    Args:
        nomeArquivo: Path of the image file.

    Returns:
        The recognized symbols concatenated into one string.
    '''
    # Only the size is needed; close the file right after reading it.
    with Image.open(nomeArquivo) as imagem:
        w, h = imagem.size
    # Integer arithmetic: ``w / 4 * 4`` would produce a float on Python 3,
    # while SetRectangle needs whole pixel values.
    largura = (w // 4) * 4

    simbolos = []
    with PyTessBaseAPI() as api:
        api.SetImageFile(nomeArquivo)
        api.SetVariable("tessedit_char_whitelist", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        api.SetVariable("save_blob_choices", "T")
        api.SetRectangle(0, 0, largura, h)
        api.Recognize()
        level = RIL.SYMBOL
        for r in iterate_level(api.GetIterator(), level):
            symbol = r.GetUTF8Text(level)
            if symbol:
                simbolos.append(symbol)
    return ''.join(simbolos)
def get_font(image_path):
    """Print each non-empty recognized symbol with its word font attributes.

    Args:
        image_path: Path of the image handed to ``SetImageFile``.
    """
    with PyTessBaseAPI() as api:
        api.SetImageFile(image_path)
        api.Recognize()
        for cursor in iterate_level(api.GetIterator(), RIL.SYMBOL):
            symbol = cursor.GetUTF8Text(RIL.SYMBOL)
            word_attributes = cursor.WordFontAttributes()
            if not symbol:
                continue
            print(f'symbol {symbol}, font: {word_attributes}')
def __decode_symbols(self, iterator):
    """Decode symbols from ``iterator`` until the current word ends.

    Each ``Symbol`` receives its bounding box, confidence (divided by 100),
    text and its binary image converted to mode ``'1'`` without dithering.

    Returns:
        The list of decoded ``Symbol`` objects.
    """
    decoded = []
    for cursor in iterate_level(iterator, RIL.SYMBOL):
        glyph = Symbol()
        box = cursor.BoundingBox(RIL.SYMBOL)
        glyph.bounding_box = BoundingBox.from_coordinates(*box)
        glyph.confidence = float(cursor.Confidence(RIL.SYMBOL)) / 100.0
        glyph.text = cursor.GetUTF8Text(RIL.SYMBOL)
        binary = cursor.GetBinaryImage(RIL.SYMBOL)
        glyph.image = binary.convert('1', dither=Image.NONE)
        decoded.append(glyph)
        # Stop once the last symbol of the enclosing word was handled.
        if iterator.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
            break
    return decoded
def classiferChoices(ri):
    """Print every symbol of a result iterator with its classifier choices.

    For each non-empty symbol, the symbol and its confidence are printed,
    followed by every alternative choice with its own confidence and a
    separator line.

    Uses the ``print()`` function so the block parses under Python 3;
    ``end=' '`` reproduces the space that the Python 2 trailing-comma form
    inserted between items on the same line.

    Args:
        ri: Result iterator of a finished recognition.
    """
    level = RIL.SYMBOL
    for r in iterate_level(ri, level):
        symbol = r.GetUTF8Text(level)  # r == ri
        conf = r.Confidence(level)
        if symbol:
            print(u'symbol {}, conf: {}'.format(symbol, conf), end=' ')
            indent = False
            ci = r.GetChoiceIterator()
            for c in ci:
                if indent:
                    print('\t\t ', end=' ')
                print('\t- ', end=' ')
                choice = c.GetUTF8Text()  # c == ci
                print(u'{} conf: {}'.format(choice, c.Confidence()))
                indent = True
            print('---------------------------------------------')
def test_result_iterator(self):
    """Check text and preceding blanks of the first two recognized words."""
    # (expected text, expected number of blanks before the word)
    expected = (("The", 0), ("(quick)", 1))

    self._api.SetImageFile(self._image_file)
    self._api.Recognize()
    cursor_source = self._api.GetIterator()
    word_level = tesserocr.RIL.WORD
    for index, cursor in enumerate(tesserocr.iterate_level(cursor_source, word_level)):
        text = cursor.GetUTF8Text(word_level)
        blanks = cursor.BlanksBeforeWord()
        if index >= len(expected):
            break
        want_text, want_blanks = expected[index]
        self.assertEqual(text, want_text)
        self.assertEqual(blanks, want_blanks)
def tesseract(path, filename, conf_dir, text_dir):
    """Recognize one image and store its text and per-symbol confidences.

    Two files named after ``filename`` without its last four characters are
    written when at least one symbol was recognized: ``<conf_dir>/<name>.prob``
    with one ``symbol<TAB>confidence`` line per symbol (confidence multiplied
    by 0.01) and ``<text_dir>/<name>.txt`` with the recognized text minus its
    final character.

    Args:
        path: Directory containing the image.
        filename: Name of the image file inside ``path``.
        conf_dir: Directory receiving the ``.prob`` file.
        text_dir: Directory receiving the ``.txt`` file.

    Returns:
        None. The function returns early, writing nothing, when loading or
        recognizing the image raises.
    """
    with PyTessBaseAPI() as api:
        pathFilename = path + "/" + filename
        try:
            api.SetImageFile(pathFilename)
            # Run the recognition; the last character of the text is dropped.
            label_text = api.GetUTF8Text()[:-1]
            api.SetVariable("save_blob_choices", "T")
            api.Recognize()
            ri = api.GetIterator()
        except Exception:
            return

        # Iterate over each of the symbols of the file.
        conf_lines = []
        level = RIL.SYMBOL
        for r in iterate_level(ri, level):
            try:
                symbol = r.GetUTF8Text(level)
                conf = 0.01 * r.Confidence(level)
                # Only non-break symbols are saved.
                if symbol not in ('\n', '\r', '\t', '\f'):
                    conf_lines.append(symbol + "\t" + str(conf) + "\n")
            except Exception:
                continue
        conf_text = "".join(conf_lines)

        if conf_text:
            basename = filename[:-4]
            # The data is encoded explicitly, so the files are opened in
            # binary mode; writing bytes to a text-mode file fails on Python 3.
            conf_pathFilename = conf_dir + "/" + basename + ".prob"
            with open(conf_pathFilename, "wb") as f:
                f.write(conf_text.encode('utf-8'))
            text_pathFilename = text_dir + "/" + basename + ".txt"
            with open(text_pathFilename, "wb") as f:
                f.write(label_text.encode('utf-8'))
def symbolConfidenc(img):
    """Return the words of an image file whose confidence is above 50.

    Args:
        img: Path of the image handed to ``SetImageFile``.

    Returns:
        A string in which every accepted word is preceded by a single space
        (so a non-empty result starts with a space); ``''`` when no word
        qualifies.
    """
    accepted = []
    with PyTessBaseAPI() as api:
        api.SetImageFile(img)
        api.Recognize()
        for cursor in iterate_level(api.GetIterator(), RIL.WORD):
            text = cursor.GetUTF8Text(RIL.WORD)
            if cursor.Confidence(RIL.WORD) > 50:
                accepted.append(' ' + text)
    return ''.join(accepted)
def find_word_attribute(image, tessdata_3_path):
    """Recognize an image and tabulate each word with its font attributes.

    Args:
        image: Path of the image file to open.
        tessdata_3_path: Tessdata directory passed to ``PyTessBaseAPI``.

    Returns:
        A ``pandas.DataFrame`` whose first column ``Word`` holds the
        recognized words and whose remaining columns come from the font
        attribute mapping of each word. When no word is found, an empty frame
        with only the ``Word`` column is returned.
    """
    raw_img = Image.open(image)
    img = scale_image(raw_img)

    word_arr = []
    bold_arr = []
    # Use the base API to read the font attributes word by word.
    with PyTessBaseAPI(path=tessdata_3_path) as api:
        api.SetImage(img)
        api.Recognize(0)
        level = RIL.WORD
        for r in iterate_level(api.GetIterator(), level):
            if r.BoundingBox(level) is None:
                continue
            word = r.GetUTF8Text(level)
            font_attributes = r.WordFontAttributes()
            if word is not None and font_attributes is not None:
                word_arr.append(word)
                bold_arr.append(font_attributes)

    if not word_arr:
        # Concatenating two empty frames leaves no column to rename.
        return pd.DataFrame(columns=["Word"])

    df = pd.concat([pd.DataFrame(word_arr), pd.DataFrame(bold_arr)], axis=1)
    df.rename(columns={df.columns[0]: "Word"}, inplace=True)
    return df
def get_boxes(image_filename: str) -> list:
    """Recognize vertical Japanese text and return the accepted word boxes.

    Every recognized word is printed with its confidence and coordinates.
    Words for which ``should_ignore_box`` returns a true value are left out of
    the result.

    Args:
        image_filename: Path of the image to open.

    Returns:
        A list of dicts with the keys ``text``, ``left``, ``top``, ``width``
        and ``height``.
    """
    boxes = []
    # Both resources are context managers, so they are released even when
    # recognition raises part-way through.
    with Image.open(image_filename) as image, PyTessBaseAPI(lang="jpn_vert") as api:
        # Half of the image size, forwarded to should_ignore_box.
        max_width = image.width // 2
        max_height = image.height // 2

        api.SetPageSegMode(PSM.SPARSE_TEXT_OSD)
        api.SetImage(image)
        api.Recognize(0)
        level = RIL.WORD
        for r in iterate_level(api.GetIterator(), level):
            conf = r.Confidence(level)
            text = r.GetUTF8Text(level)
            left, top, right, bottom = r.BoundingBox(level)
            print("'%s' \tConf: %.2f \tCoords: %d,%d,%d,%d" %
                  (text, conf, left, top, right, bottom))
            box = {
                'text': text,
                'left': left,
                'top': top,
                'width': right - left,
                'height': bottom - top
            }
            if should_ignore_box(conf, box, max_width, max_height):
                continue
            boxes.append(box)
    return boxes
w = line[1]['w'] h = line[1]['h'] img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 3) result = Image.fromarray(img) print "Shape of the original image: " print result.size # result.show() # img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 3 # print api.GetBoxText() # api.GetThresholdedImage().show() iterator = api.GetIterator() iterator.Begin() level = RIL.SYMBOL for r in iterate_level(iterator, level): # print r.BoundingBox(level) x = r.BoundingBox(level)[0] y = r.BoundingBox(level)[1] x_2 = r.BoundingBox(level)[2] y_2 = r.BoundingBox(level)[3] img = cv2.rectangle(img, (x, y), (x_2, y_2), (0, 255, 0), 3) # Draw a green rectangle around each character found by OCR out = Image.fromarray(img) #out.show() out.save("out.png") f.close() # Need to kill iterator to clear memory==== # Want to show the bounding box of L1 of the SKU: