예제 #1
0
 def test_xywh_from_points(self):
     """Check the x/y/w/h dict computed for a 100x100 square at (100, 100)."""
     expected = {'x': 100, 'y': 100, 'w': 100, 'h': 100}
     actual = xywh_from_points('100,100 200,100 200,200 100,200')
     self.assertEqual(actual, expected)
예제 #2
0
 def process(self):
     """
     Segment the text regions of every input page into text lines.

     For each input file the PAGE document is parsed, each TextRegion is
     cropped from the page image and passed to Tesseract, and every
     TEXTLINE component reported is appended to the region as a TextLine
     (shifted by the region's x/y offset).  The annotated document is then
     added to the workspace under the output file group.
     """
     with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = from_file(self.workspace.download_file(input_file))
             page = pcgts.get_Page()
             image_url = page.imageFilename
             for region in page.get_TextRegion():
                 log.debug("Detecting lines in %s with tesseract",
                           region.id)
                 region_points = region.get_Coords().points
                 region_image = self.workspace.resolve_image_as_pil(
                     image_url, polygon_from_points(region_points))
                 tessapi.SetImage(region_image)
                 offset = xywh_from_points(region_points)
                 components = tessapi.GetComponentImages(RIL.TEXTLINE, True)
                 for (line_no, component) in enumerate(components):
                     # The second tuple element is the component's box;
                     # shift it by the region offset computed above.
                     box = component[1]
                     box['x'] += offset['x']
                     box['y'] += offset['y']
                     coords = CoordsType(points_from_xywh(box))
                     line_id = '%s_line%04d' % (region.id, line_no)
                     region.add_TextLine(
                         TextLineType(id=line_id, Coords=coords))
             file_id = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 basename=file_id + '.xml',
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
예제 #3
0
 def _process_existing_words(self, words, maxlevel, tessapi):
     """
     Recognize each pre-annotated word and attach text plus confidence.

     Every word gets a TextEquiv.  Unless ``maxlevel`` is ``'word'``,
     recognition continues at glyph level: with the word's existing glyphs
     if it has any, otherwise with Tesseract's own result iterator.
     """
     for word in words:
         log.debug("Recognizing text in word '%s'", word.id)
         box = xywh_from_points(word.get_Coords().points)
         tessapi.SetRectangle(box['x'], box['y'], box['w'], box['h'])
         tessapi.SetPageSegMode(PSM.SINGLE_WORD)
         if word.get_TextEquiv():
             log.warning("Word '%s' already contains text results", word.id)
         # Take the first reported confidence, scaled by 1/100; fall back
         # to 0.0 when nothing is reported.
         confidences = tessapi.AllWordConfidences()
         if confidences:
             word_conf = confidences[0] / 100.0
         else:
             word_conf = 0.0
         text = tessapi.GetUTF8Text().rstrip("\n\f")
         word.add_TextEquiv(TextEquivType(Unicode=text, conf=word_conf))
         # maybe add TextEquiv alternatives via ChoiceIterator for WORD?
         if maxlevel != 'word':
             ## glyph level:
             glyphs = word.get_Glyph()
             if not glyphs:
                 ## internal glyph layout:
                 self._process_glyphs_in_word(word, tessapi.GetIterator())
             else:
                 ## external glyph layout: only warn instead of raising,
                 ## since forcing external glyph annotation is worse with
                 ## Tesseract.
                 log.warning(
                     "Word '%s' contains glyphs already, recognition might be suboptimal",
                     word.id)
                 self._process_existing_glyphs(glyphs, tessapi)
예제 #4
0
 def _process_lines(self, textlines, maxlevel, tessapi):
     """
     Recognize each given text line and annotate it with text and confidence.

     Every line receives a TextEquiv regardless of ``maxlevel``.  Unless
     ``maxlevel`` is ``'line'``, recognition then descends to word level:
     via ``_process_existing_words`` when the line already has words,
     otherwise via ``_process_words_in_line``.
     """
     for line in textlines:
         log.debug("Recognizing text in line '%s'", line.id)
         box = xywh_from_points(line.get_Coords().points)
         tessapi.SetRectangle(box['x'], box['y'], box['w'], box['h'])
         # RAW_LINE fails with Tesseract 3 models and is worse with
         # Tesseract 4 models, hence SINGLE_LINE.
         tessapi.SetPageSegMode(PSM.SINGLE_LINE)
         if line.get_TextEquiv():
             log.warning("Line '%s' already contains text results", line.id)
         # iterator scores are arithmetic averages, too
         line_conf = tessapi.MeanTextConf() / 100.0
         text = tessapi.GetUTF8Text().rstrip("\n\f")
         # The line annotation is added unconditionally (i.e. even for
         # word or glyph level).
         line.add_TextEquiv(TextEquivType(Unicode=text, conf=line_conf))
         # maybe add TextEquiv alternatives via ChoiceIterator for TEXTLINE?
         if maxlevel != 'line':
             ## word, or glyph level:
             words = line.get_Word()
             if not words:
                 ## internal word and glyph layout:
                 self._process_words_in_line(line, maxlevel, tessapi)
             else:
                 ## external word layout: only warn instead of raising,
                 ## since forcing external word/glyph annotation is worse
                 ## with Tesseract.
                 log.warning(
                     "Line '%s' contains words already, recognition might be suboptimal",
                     line.id)
                 self._process_existing_words(words, maxlevel, tessapi)
예제 #5
0
 def process(self):
     """
     Performs the (text) recognition.

     For each input file the PAGE document is parsed and its page image is
     loaded into Tesseract (initialised with ``DEFAULT_MODEL``).  Every
     TextLine of every TextRegion is recognized within its bounding
     rectangle and annotated with a TextEquiv holding the recognized text.
     The annotated document is then registered as an output file in the
     output file group.
     """
     print(self.parameter)
     with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
         # get_languages() yields (tessdata path, installed models).  Report
         # the model that was actually loaded above, then the data path;
         # the arguments used to be passed in the opposite order.
         log.info("Using model %s in %s for recognition", DEFAULT_MODEL, get_languages()[0])
         for (n, input_file) in enumerate(self.input_files):
             log.info("INPUT FILE %i / %s", n, input_file)
             pcgts = from_file(self.workspace.download_file(input_file))
             # TODO use binarized / gray
             pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
             tessapi.SetImage(pil_image)
             # TODO slow
             #  tessapi.SetPageSegMode(PSM.SINGLE_LINE)
             log.info("page %s", pcgts)
             for region in pcgts.get_Page().get_TextRegion():
                 textlines = region.get_TextLine()
                 log.info("About to recognize text in %i lines of region '%s'", len(textlines), region.id)
                 for line in textlines:
                     log.debug("Recognizing text in line '%s'", line.id)
                     # Restrict recognition to the line's rectangle.
                     xywh = xywh_from_points(line.get_Coords().points)
                     tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                     line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
             ID = mets_file_id(self.output_file_grp, n)
             self.add_output_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 basename=ID + '.xml',
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
예제 #6
0
 def test_xywh_from_points_unordered(self):
     """Check the x/y/w/h dict when the extreme point (500,500) comes first."""
     points = '500,500 100,100 200,100 200,200 100,200'
     expected = {'x': 100, 'y': 100, 'w': 400, 'h': 400}
     self.assertEqual(xywh_from_points(points), expected)
예제 #7
0
 def _process_existing_glyphs(self, glyphs, tessapi):
     """
     Recognize each pre-annotated glyph and attach text plus confidence.

     Each glyph's rectangle is recognized in single-character mode and the
     result is added to the glyph as a TextEquiv.
     """
     for glyph in glyphs:
         log.debug("Recognizing glyph in word '%s'", glyph.id)
         box = xywh_from_points(glyph.get_Coords().points)
         tessapi.SetRectangle(box['x'], box['y'], box['w'], box['h'])
         tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
         if glyph.get_TextEquiv():
             log.warning("Glyph '%s' already contains text results",
                         glyph.id)
         # Take the first reported confidence, scaled by 1/100; fall back
         # to 0.0 when nothing is reported.
         confidences = tessapi.AllWordConfidences()
         if confidences:
             glyph_conf = confidences[0] / 100.0
         else:
             glyph_conf = 0.0
         text = tessapi.GetUTF8Text().rstrip("\n\f")
         glyph.add_TextEquiv(TextEquivType(Unicode=text, conf=glyph_conf))
예제 #8
0
 def _process_regions(self, regions, maxlevel, tessapi):
     """
     Recognize text in each region, or delegate to its text lines.

     When ``maxlevel`` is ``'region'`` the region's rectangle is recognized
     as a single block and annotated with a TextEquiv.  For any other level
     the region's text lines are passed to ``_process_lines``; regions
     without text lines only trigger a warning.
     """
     for region in regions:
         log.debug("Recognizing text in region '%s'", region.id)
         if maxlevel != 'region':
             ## line, word, or glyph level:
             textlines = region.get_TextLine()
             if textlines:
                 self._process_lines(textlines, maxlevel, tessapi)
             else:
                 log.warning("Region '%s' contains no text lines", region.id)
             continue  # next region
         box = xywh_from_points(region.get_Coords().points)
         tessapi.SetRectangle(box['x'], box['y'], box['w'], box['h'])
         tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
         if region.get_TextEquiv():
             log.warning("Region '%s' already contains text results",
                         region.id)
         text = tessapi.GetUTF8Text().rstrip("\n\f")
         region.add_TextEquiv(TextEquivType(Unicode=text))
예제 #9
0
 def process(self):
     """
     Segment the text lines of every input page into words.

     For each input file the PAGE document is parsed, each TextLine is
     cropped from the page image and passed to Tesseract (initialised in
     single-line mode), and every WORD component reported is appended to
     the line as a Word (shifted by the line's x/y offset).  The annotated
     document is then registered as an output file in the output file
     group.
     """
     with PyTessBaseAPI(
             psm=PSM.SINGLE_LINE,
             path=TESSDATA_PREFIX,
     ) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = from_file(self.workspace.download_file(input_file))
             page = pcgts.get_Page()
             image_url = page.imageFilename
             for region in page.get_TextRegion():
                 for line in region.get_TextLine():
                     log.debug("Detecting words in line '%s'", line.id)
                     line_points = line.get_Coords().points
                     line_image = self.workspace.resolve_image_as_pil(
                         image_url, polygon_from_points(line_points))
                     tessapi.SetImage(line_image)
                     offset = xywh_from_points(line_points)
                     components = tessapi.GetComponentImages(RIL.WORD, True)
                     for (word_no, component) in enumerate(components):
                         # The second tuple element is the component's
                         # box; shift it by the line offset computed above.
                         box = component[1]
                         box['x'] += offset['x']
                         box['y'] += offset['y']
                         coords = CoordsType(points_from_xywh(box))
                         word_id = '%s_word%04d' % (line.id, word_no)
                         line.add_Word(WordType(id=word_id, Coords=coords))
             file_id = concat_padded(self.output_file_grp, n)
             self.add_output_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 basename=file_id + '.xml',
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
예제 #10
0
 def process(self):
     """
     Performs the (text) recognition at line or glyph level.

     Reads two parameters from ``self.parameter``: ``textequiv_level``
     (required, ``'line'`` or ``'glyph'``) and ``model`` (optional; when
     absent, the last entry of the installed-model list is used).

     For each input file, every TextLine of every TextRegion is recognized
     within its rectangle and annotated with a TextEquiv.  At glyph level,
     each Word already present on the line is additionally recognized, and
     one Glyph per recognized symbol is added to it, carrying one TextEquiv
     per choice offered by the choice iterator.  The annotated document is
     then registered as an output file in the output file group.

     Raises:
         Exception: if ``textequiv_level`` is neither ``'line'`` nor
             ``'glyph'``, or if the configured model is not installed.
     """
     print(self.parameter)
     if self.parameter['textequiv_level'] not in ['line', 'glyph']:
         raise Exception("currently only implemented at the line/glyph level")
     # Query the Tesseract installation once and reuse the result.
     tess_info = get_languages()
     tessdata_path = tess_info[0]
     installed_models = tess_info[1]
     model = installed_models[-1] # last installed model
     if 'model' in self.parameter:
         model = self.parameter['model']
         if model not in installed_models:
             raise Exception("configured model " + model + " is not installed")
     with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
         log.info("Using model %s in %s for recognition", model, tessdata_path)
         for (n, input_file) in enumerate(self.input_files):
             log.info("INPUT FILE %i / %s", n, input_file)
             pcgts = from_file(self.workspace.download_file(input_file))
             # TODO use binarized / gray
             pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
             tessapi.SetImage(pil_image)
             log.info("page %s", pcgts)
             for region in pcgts.get_Page().get_TextRegion():
                 textlines = region.get_TextLine()
                 log.info("About to recognize text in %i lines of region '%s'", len(textlines), region.id)
                 for line in textlines:
                     log.debug("Recognizing text in line '%s'", line.id)
                     line_xywh = xywh_from_points(line.get_Coords().points)
                     tessapi.SetRectangle(line_xywh['x'], line_xywh['y'], line_xywh['w'], line_xywh['h'])
                     # TODO slow
                     tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                     line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                     if self.parameter['textequiv_level'] != 'glyph':
                         continue  # line level only
                     for word in line.get_Word():
                         log.debug("Recognizing text in word '%s'", word.id)
                         word_xywh = xywh_from_points(word.get_Coords().points)
                         tessapi.SetRectangle(word_xywh['x'], word_xywh['y'], word_xywh['w'], word_xywh['h'])
                         tessapi.SetPageSegMode(PSM.SINGLE_WORD)
                         word.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                         result_it = tessapi.GetIterator()
                         for (result_no, result) in enumerate(iterate_level(result_it, RIL.SYMBOL)):
                             bbox = result.BoundingBox(RIL.SYMBOL)
                             # Skip symbols without a bounding box; their
                             # result_no is still consumed, so glyph IDs may
                             # have gaps.
                             if bbox is None:
                                 continue
                             glyph_id = '%s_glyph%04d' % (word.id, result_no)
                             log.debug("Recognizing text in glyph '%s'", glyph_id)
                             glyph = GlyphType(id=glyph_id, Coords=CoordsType(points_from_x0y0x1y1(bbox)))
                             word.add_Glyph(glyph)
                             # Record every alternative offered for this
                             # symbol, indexed in iteration order.
                             choice_it = result.GetChoiceIterator()
                             for (choice_no, choice) in enumerate(choice_it):
                                 alternative_symb = choice.GetUTF8Text()
                                 alternative_conf = choice.Confidence()
                                 glyph.add_TextEquiv(TextEquivType(index=choice_no, conf=alternative_conf, Unicode=alternative_symb))
             ID = concat_padded(self.output_file_grp, n)
             self.add_output_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 basename=ID + '.xml',
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts),
             )