def test_xywh_from_points(self):
    """A square listed corner by corner yields its bounding box as x/y/w/h."""
    square = '100,100 200,100 200,200 100,200'
    expected = dict(x=100, y=100, w=100, h=100)
    self.assertEqual(xywh_from_points(square), expected)
def process(self):
    """
    Performs the line segmentation.

    For every input file, each text region of the page is handed to
    Tesseract as an image, the text-line components it reports are added
    to the region as ``TextLine`` elements, and the annotated page is
    stored as a new file in the output file group.
    """
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            local_file = self.workspace.download_file(input_file)
            pcgts = from_file(local_file)
            image_url = pcgts.get_Page().imageFilename
            for region in pcgts.get_Page().get_TextRegion():
                log.debug("Detecting lines in %s with tesseract", region.id)
                region_polygon = polygon_from_points(region.get_Coords().points)
                region_image = self.workspace.resolve_image_as_pil(
                    image_url, region_polygon)
                tessapi.SetImage(region_image)
                # Component boxes are shifted by the region's own x/y below;
                # presumably they are relative to the region image.
                offset = xywh_from_points(region.get_Coords().points)
                components = tessapi.GetComponentImages(RIL.TEXTLINE, True)
                for line_no, component in enumerate(components):
                    # The second item of each component is its x/y/w/h box.
                    box = component[1]
                    box['x'] += offset['x']
                    box['y'] += offset['y']
                    textline = TextLineType(
                        id='%s_line%04d' % (region.id, line_no),
                        Coords=CoordsType(points_from_xywh(box)))
                    region.add_TextLine(textline)
            file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                basename=file_id + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def _process_existing_words(self, words, maxlevel, tessapi):
    """
    Recognize text for words that already have a layout annotation.

    Each word's bounding box is set as the recognition rectangle, the
    result is attached to the word as a new ``TextEquiv``, and, unless
    ``maxlevel`` is ``'word'``, recognition descends to glyph level.

    Args:
        words: word elements to annotate.
        maxlevel: deepest level to annotate; ``'word'`` stops here.
        tessapi: Tesseract API object with the page image already set.
    """
    for word in words:
        log.debug("Recognizing text in word '%s'", word.id)
        box = xywh_from_points(word.get_Coords().points)
        tessapi.SetRectangle(box['x'], box['y'], box['w'], box['h'])
        tessapi.SetPageSegMode(PSM.SINGLE_WORD)
        if word.get_TextEquiv():
            log.warning("Word '%s' already contains text results", word.id)
        # Confidence is queried before the text, as in the original order.
        confidences = tessapi.AllWordConfidences()
        if confidences:
            confidence = confidences[0] / 100.0
        else:
            confidence = 0.0
        text = tessapi.GetUTF8Text().rstrip("\n\f")
        word.add_TextEquiv(TextEquivType(Unicode=text, conf=confidence))
        if maxlevel != 'word':
            glyphs = word.get_Glyph()
            if not glyphs:
                # No glyph annotation yet: take glyph layout from Tesseract.
                self._process_glyphs_in_word(word, tessapi.GetIterator())
            else:
                # Glyph annotation exists: it is used as-is, although that
                # tends to give worse results with Tesseract.
                log.warning(
                    "Word '%s' contains glyphs already, recognition might be suboptimal",
                    word.id)
                self._process_existing_glyphs(glyphs, tessapi)
def _process_lines(self, textlines, maxlevel, tessapi):
    """
    Recognize text for each of the given text lines.

    A line-level ``TextEquiv`` is always added, whatever ``maxlevel`` is.
    Unless ``maxlevel`` is ``'line'``, recognition then descends to words
    (and possibly glyphs), using existing word annotation when present.

    Args:
        textlines: text line elements to annotate.
        maxlevel: deepest level to annotate; ``'line'`` stops here.
        tessapi: Tesseract API object with the page image already set.
    """
    for line in textlines:
        log.debug("Recognizing text in line '%s'", line.id)
        box = xywh_from_points(line.get_Coords().points)
        tessapi.SetRectangle(box['x'], box['y'], box['w'], box['h'])
        # RAW_LINE fails with Tesseract 3 models and is worse with
        # Tesseract 4 models, hence SINGLE_LINE.
        tessapi.SetPageSegMode(PSM.SINGLE_LINE)
        if line.get_TextEquiv():
            log.warning("Line '%s' already contains text results", line.id)
        # Iterator scores are arithmetic averages, too.
        confidence = tessapi.MeanTextConf() / 100.0
        text = tessapi.GetUTF8Text().rstrip("\n\f")
        line.add_TextEquiv(TextEquivType(Unicode=text, conf=confidence))
        if maxlevel != 'line':
            words = line.get_Word()
            if not words:
                # No word annotation yet: take word and glyph layout from
                # Tesseract.
                self._process_words_in_line(line, maxlevel, tessapi)
            else:
                # Word annotation exists: it is used as-is, although that
                # tends to give worse results with Tesseract.
                log.warning(
                    "Line '%s' contains words already, recognition might be suboptimal",
                    line.id)
                self._process_existing_words(words, maxlevel, tessapi)
def process(self):
    """
    Performs the (text) recognition.

    For every input file the page image is loaded into Tesseract, each
    text line of each text region is recognized inside its bounding box,
    and the recognized text is added to the line as a ``TextEquiv``. The
    annotated page is then stored as a new file in the output file group.
    """
    # Log the parameters instead of printing them to stdout.
    log.debug("Parameters: %s", self.parameter)
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
        # get_languages() yields (tessdata path, installed models). Report
        # the model actually passed to the API, followed by the path, so
        # the arguments match the message's "model ... in ..." order.
        tessdata_path = get_languages()[0]
        log.info("Using model %s in %s for recognition",
                 DEFAULT_MODEL, tessdata_path)
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # TODO use binarized / gray
            pil_image = self.workspace.resolve_image_as_pil(page.imageFilename)
            tessapi.SetImage(pil_image)
            # TODO slow
            log.info("page %s", pcgts)
            for region in page.get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to recognize text in %i lines of region '%s'",
                         len(textlines), region.id)
                for line in textlines:
                    log.debug("Recognizing text in line '%s'", line.id)
                    xywh = xywh_from_points(line.get_Coords().points)
                    # Restrict recognition to the line's bounding box.
                    tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                    line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
            ID = mets_file_id(self.output_file_grp, n)
            self.add_output_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def test_xywh_from_points_unordered(self):
    """Point order must not matter: an outlying first point still widens the box."""
    points = '500,500 100,100 200,100 200,200 100,200'
    expected = dict(x=100, y=100, w=400, h=400)
    self.assertEqual(xywh_from_points(points), expected)
def _process_existing_glyphs(self, glyphs, tessapi):
    """
    Recognize text for glyphs that already have a layout annotation.

    Each glyph's bounding box is set as the recognition rectangle and the
    result is attached to the glyph as a new ``TextEquiv``.

    Args:
        glyphs: glyph elements to annotate.
        tessapi: Tesseract API object with the page image already set.
    """
    for glyph in glyphs:
        log.debug("Recognizing glyph in word '%s'", glyph.id)
        box = xywh_from_points(glyph.get_Coords().points)
        tessapi.SetRectangle(box['x'], box['y'], box['w'], box['h'])
        tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
        if glyph.get_TextEquiv():
            log.warning("Glyph '%s' already contains text results", glyph.id)
        # Confidence is queried before the text, as in the original order.
        confidences = tessapi.AllWordConfidences()
        if confidences:
            confidence = confidences[0] / 100.0
        else:
            confidence = 0.0
        text = tessapi.GetUTF8Text().rstrip("\n\f")
        glyph.add_TextEquiv(TextEquivType(Unicode=text, conf=confidence))
def _process_regions(self, regions, maxlevel, tessapi):
    """
    Recognize text for each of the given regions.

    When ``maxlevel`` is ``'region'`` the whole region is recognized as a
    single block and annotated with a ``TextEquiv``. Otherwise its text
    lines are passed on to ``_process_lines``.

    Args:
        regions: region elements to annotate.
        maxlevel: deepest level to annotate; ``'region'`` stops here.
        tessapi: Tesseract API object with the page image already set.
    """
    for region in regions:
        log.debug("Recognizing text in region '%s'", region.id)
        if maxlevel != 'region':
            # Line, word, or glyph level: descend into the text lines.
            textlines = region.get_TextLine()
            if textlines:
                self._process_lines(textlines, maxlevel, tessapi)
            else:
                log.warning("Region '%s' contains no text lines", region.id)
            continue
        box = xywh_from_points(region.get_Coords().points)
        tessapi.SetRectangle(box['x'], box['y'], box['w'], box['h'])
        tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
        if region.get_TextEquiv():
            log.warning("Region '%s' already contains text results", region.id)
        text = tessapi.GetUTF8Text().rstrip("\n\f")
        region.add_TextEquiv(TextEquivType(Unicode=text))
def process(self):
    """
    Performs the word segmentation.

    For every input file, each text line of each text region is handed to
    Tesseract as an image, the word components it reports are added to
    the line as ``Word`` elements, and the annotated page is stored as a
    new file in the output file group.
    """
    with PyTessBaseAPI(psm=PSM.SINGLE_LINE, path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            local_file = self.workspace.download_file(input_file)
            pcgts = from_file(local_file)
            image_url = pcgts.get_Page().imageFilename
            for region in pcgts.get_Page().get_TextRegion():
                for line in region.get_TextLine():
                    log.debug("Detecting words in line '%s'", line.id)
                    line_polygon = polygon_from_points(line.get_Coords().points)
                    line_image = self.workspace.resolve_image_as_pil(
                        image_url, line_polygon)
                    tessapi.SetImage(line_image)
                    # Component boxes are shifted by the line's own x/y
                    # below; presumably they are relative to the line image.
                    offset = xywh_from_points(line.get_Coords().points)
                    components = tessapi.GetComponentImages(RIL.WORD, True)
                    for word_no, component in enumerate(components):
                        word_id = '%s_word%04d' % (line.id, word_no)
                        # The second item of each component is its x/y/w/h box.
                        box = component[1]
                        box['x'] += offset['x']
                        box['y'] += offset['y']
                        word_coords = CoordsType(points_from_xywh(box))
                        line.add_Word(WordType(id=word_id, Coords=word_coords))
            file_id = concat_padded(self.output_file_grp, n)
            self.add_output_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                basename=file_id + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """
    Performs the (text) recognition.

    Every text line of every text region is recognized inside its
    bounding box and annotated with a ``TextEquiv``. When the
    ``textequiv_level`` parameter is ``'glyph'``, each existing word of
    the line is recognized as well, and one ``Glyph`` per recognized
    symbol is added to the word, carrying every alternative offered by
    Tesseract's choice iterator as an indexed ``TextEquiv``. The
    annotated page is stored as a new file in the output file group.

    The model is the ``model`` parameter if given, otherwise the last
    entry of the installed models reported by ``get_languages()``.

    Raises:
        Exception: if ``textequiv_level`` is neither ``'line'`` nor
            ``'glyph'``, or if the configured model is not installed.
    """
    # Log the parameters instead of printing them to stdout.
    log.debug("Parameters: %s", self.parameter)
    textequiv_level = self.parameter['textequiv_level']
    if textequiv_level not in ['line', 'glyph']:
        raise Exception("currently only implemented at the line/glyph level")
    # Query the installation once: (tessdata path, installed models).
    languages = get_languages()
    tessdata_path = languages[0]
    installed_models = languages[1]
    model = installed_models[-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        if model not in installed_models:
            raise Exception("configured model " + model + " is not installed")
    glyph_level = textequiv_level == 'glyph'
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        log.info("Using model %s in %s for recognition", model, tessdata_path)
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # TODO use binarized / gray
            pil_image = self.workspace.resolve_image_as_pil(page.imageFilename)
            tessapi.SetImage(pil_image)
            # TODO slow
            log.info("page %s", pcgts)
            for region in page.get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to recognize text in %i lines of region '%s'",
                         len(textlines), region.id)
                for line in textlines:
                    log.debug("Recognizing text in line '%s'", line.id)
                    xywh = xywh_from_points(line.get_Coords().points)
                    tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                    tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                    line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                    if not glyph_level:
                        continue
                    for word in line.get_Word():
                        self._recognize_word_glyphs(word, tessapi)
            ID = concat_padded(self.output_file_grp, n)
            self.add_output_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )

def _recognize_word_glyphs(self, word, tessapi):
    """Recognize one existing word and add a Glyph, with all its alternatives, per symbol."""
    log.debug("Recognizing text in word '%s'", word.id)
    xywh = xywh_from_points(word.get_Coords().points)
    tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
    tessapi.SetPageSegMode(PSM.SINGLE_WORD)
    word.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
    result_it = tessapi.GetIterator()
    for (result_no, result) in enumerate(iterate_level(result_it, RIL.SYMBOL)):
        # NOTE(review): the symbol's own text and confidence are not
        # stored; presumably they equal the first choice below - confirm.
        bbox = result.BoundingBox(RIL.SYMBOL)
        if bbox is None:
            # Nothing to annotate without a box; glyph numbering keeps the gap.
            continue
        glyph_id = '%s_glyph%04d' % (word.id, result_no)
        log.debug("Recognizing text in glyph '%s'", glyph_id)
        glyph = GlyphType(id=glyph_id, Coords=CoordsType(points_from_x0y0x1y1(bbox)))
        word.add_Glyph(glyph)
        for (choice_no, choice) in enumerate(result.GetChoiceIterator()):
            glyph.add_TextEquiv(
                TextEquivType(index=choice_no,
                              conf=choice.Confidence(),
                              Unicode=choice.GetUTF8Text()))