def get_text_from_box(fn, x, y, w, h):
    """
    Recognize the word(s) inside a bounding box of an image file.

    The box is handed to Tesseract as-is; it is assumed to be good enough.
    For debugging purposes, every non-empty word is printed together with
    its confidence, and the image region Tesseract reports for it is saved
    as "<counter> debug.png" (relative path), where counter starts at 0.

    Args:
        fn: image file to read; passed directly to ``Image.open``
        x: x coordinate of the upper left corner of the bounding box
        y: y coordinate of the upper left corner of the bounding box
        w: width of the bounding box
        h: height of the bounding box

    Returns:
        a list of word objects (id is not set). Each word carries the
        recognized text, its confidence, and the x / y / width / height
        derived from the bounding box reported by the result iterator.
    """
    words = []
    # Both the image file and the Tesseract API are released on exit.
    with Image.open(fn) as image, PyTessBaseAPI() as api:
        # Pixel array used only to crop the per-word debug images.
        pixels = np.array(image)
        api.SetImage(image)
        api.SetVariable("save_blob_choices", "T")
        api.SetRectangle(x, y, w, h)
        api.Recognize()
        level = RIL.WORD
        counter = 0
        for r in iterate_level(api.GetIterator(), level):
            # Only the iterator queries are guarded; a RuntimeError here
            # means there is nothing to report for this element.
            try:
                symbol = r.GetUTF8Text(level)
                conf = r.Confidence(level)
                bbox = r.BoundingBox(level)
            except RuntimeError:
                print('No text returned')
                continue

            # Named `found` so the width parameter `w` is not shadowed.
            found = word.Word(None, None, None, None, None, None, None)
            found.confidence = conf
            found.text = symbol
            # bbox is indexed as (left, top, right, bottom).
            found.x = bbox[0]
            found.y = bbox[1]
            found.width = bbox[2] - bbox[0]
            found.height = bbox[3] - bbox[1]
            words.append(found)

            # Debugging purpose only.
            # NOTE(review): the crop is assumed to be written only for
            # non-empty text -- confirm this matches the intended behavior.
            if symbol:
                print(symbol + " " + str(conf))
                crop = Image.fromarray(
                    pixels[bbox[1]:bbox[3], bbox[0]:bbox[2]])
                crop.save(str(counter) + ' debug.png')
                counter += 1
    return words
def get_word_data(img):
    """
    Run word-level OCR over an image and collect the results in a PDFPage.

    Tesseract is asked for the word components of the image; each
    component's box is then recognized on its own and stored as a word
    whose id is the component's index.

    Args:
        img: image file to read; passed directly to ``Image.open``

    Returns:
        a ``pdfpage.PDFPage`` holding one word per detected component,
        after ``sort_dictionaries()`` has been called on it.
    """
    # NOTE(review): 'folder location' and 1 are hard-coded placeholders for
    # the PDFPage constructor -- confirm what callers expect here.
    pdf = pdfpage.PDFPage('folder location', 1)
    # Both the image file and the Tesseract API are released on exit.
    with Image.open(img, mode='r') as image, PyTessBaseAPI() as api:
        api.SetImage(image)
        # RIL.WORD selects word-level components; TEXTLINE or SYMBOL
        # (character) are options as well.
        boxes = api.GetComponentImages(RIL.WORD, True)
        # Only the box of each component is needed; the rest is ignored.
        for i, (_, box, _, _) in enumerate(boxes):
            api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
            text = api.GetUTF8Text()
            conf = api.MeanTextConf()
            pdf.add_word(
                word.Word(i, box['x'], box['y'], box['w'], box['h'],
                          conf, text))
    pdf.sort_dictionaries()
    return pdf
def add_field(self, keyword, x, y, w, h):
    """
    Append a new field word to ``self.field_list``.

    Args:
        keyword: text of the field
        x: x value passed to the word
        y: y value passed to the word
        w: width value passed to the word
        h: height value passed to the word
    """
    # The new word's id is its position in the list.
    next_id = len(self.field_list)
    # 100 is presumably the confidence of a manually added field -- TODO
    # confirm against word.Word's constructor.
    self.field_list.append(word.Word(next_id, x, y, w, h, 100, keyword))