Example #1
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = from_file(self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 log.debug("Detecting lines in %s with tesseract",
                           region.id)
                 image = self.workspace.resolve_image_as_pil(
                     image_url,
                     polygon_from_points(region.get_Coords().points))
                 tessapi.SetImage(image)
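                 # GetComponentImages() boxes below are relative to the cropped
                 # region image set here, so the region offset is added back
                 # to obtain page-absolute coordinates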
                 offset = xywh_from_points(region.get_Coords().points)
                 for (line_no, component) in enumerate(
                         tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                     line_id = '%s_line%04d' % (region.id, line_no)
                     line_xywh = component[1]
                     line_xywh['x'] += offset['x']
                     line_xywh['y'] += offset['y']
                     line_points = points_from_xywh(line_xywh)
                     region.add_TextLine(
                         TextLineType(id=line_id,
                                      Coords=CoordsType(line_points)))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 basename=ID + '.xml',
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
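
The snippets in these examples are processor methods with their surrounding class and imports stripped. As an orientation aid only, they appear to rely on names along the following lines; the module paths are assumptions and differ across ocrd and tesserocr releases, and constants such as TESSDATA_PREFIX, DEFAULT_MODEL and OCRD_TOOL come from the processor's own package:

# Hypothetical import block (illustrative, not copied from the project)
from tesserocr import PyTessBaseAPI, PSM, RIL, get_languages, iterate_level
from ocrd_utils import (
    MIMETYPE_PAGE, concat_padded, getLogger,
    points_from_xywh, points_from_x0y0x1y1,
    polygon_from_points, xywh_from_points)
from ocrd_modelfactory import page_from_file as from_file
from ocrd_models.ocrd_page import (
    to_xml, CoordsType, TextRegionType, TextLineType, WordType, GlyphType,
    TextEquivType, ReadingOrderType, OrderedGroupType, RegionRefIndexedType,
    MetadataItemType, LabelsType, LabelType)

log = getLogger('ocrd_tesserocr')  # logger name chosen for illustration
# (Examples #2 and #5 also call mets_file_id and add_output_file, which come
# from older or project-specific APIs not sketched here.)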
Example #2
 def process(self):
     """
     Performs the (text) recognition.
     """
     print(self.parameter)
     with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
         log.info("Using model %s in %s for recognition", get_languages()[0], get_languages()[1][-1])
         for (n, input_file) in enumerate(self.input_files):
             log.info("INPUT FILE %i / %s", n, input_file)
             pcgts = from_file(self.workspace.download_file(input_file))
             # TODO use binarized / gray
             pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
             tessapi.SetImage(pil_image)
             # TODO slow
             #  tessapi.SetPageSegMode(PSM.SINGLE_LINE)
             log.info("page %s", pcgts)
             for region in pcgts.get_Page().get_TextRegion():
                 textlines = region.get_TextLine()
                 log.info("About to recognize text in %i lines of region '%s'", len(textlines), region.id)
                 for line in textlines:
                     log.debug("Recognizing text in line '%s'", line.id)
                     xywh = xywh_from_points(line.get_Coords().points)
                     tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                     #  log.debug("xywh: %s", xywh)
                     line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                     #  tessapi.G
                     #  print(tessapi.AllWordConfidences())
             ID = mets_file_id(self.output_file_grp, n)
             self.add_output_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 basename=ID + '.xml',
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
Example #3
    def process(self):
        """
        Performs the region segmentation.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            print(self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                pcgts = from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(
                    pcgts.get_Page().imageFilename)
                log.debug("Detecting regions with tesseract")
                tessapi.SetImage(image)
                for component in tessapi.GetComponentImages(
                        tesserocr.RIL.BLOCK, True):
                    points = points_from_xywh(component[1])
                    index = component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)
                    # <pg:ReadingOrder>
                    ro = pcgts.get_Page().get_ReadingOrder()
                    if ro is None:
                        ro = ReadingOrderType()
                        pcgts.get_Page().set_ReadingOrder(ro)
                    # <pg:OrderedGroup>
                    og = ro.get_OrderedGroup()
                    if og is None:
                        og = OrderedGroupType(id="reading-order")
                        ro.set_OrderedGroup(og)
                    # <pg:RegionRefIndexed>
                    og.add_RegionRefIndexed(
                        RegionRefIndexedType(regionRef=ID, index=index))

                    #
                    #  text region
                    #
                    pcgts.get_Page().add_TextRegion(
                        TextRegionType(id=ID,
                                       Coords=CoordsType(points=points)))

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    basename=ID + '.xml',
                    mimetype=MIMETYPE_PAGE,
                    content=to_xml(pcgts).encode('utf-8'),
                )
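
For orientation: in tesserocr, GetComponentImages(level, text_only) yields one tuple per component, containing the cropped PIL image, a bounding-box dict with 'x', 'y', 'w', 'h' keys, the block id and the paragraph id. That is why the snippets index component[1] for coordinates and component[2] for the reading-order index. A minimal standalone sketch (assuming a local 'page.png' and a default tessdata installation):

from tesserocr import PyTessBaseAPI, RIL

with PyTessBaseAPI() as api:             # default tessdata path assumed
    api.SetImageFile('page.png')         # hypothetical input image
    for im, box, block_id, par_id in api.GetComponentImages(RIL.BLOCK, True):
        # box has the same shape the snippets pass to points_from_xywh()
        print(block_id, box['x'], box['y'], box['w'], box['h'], im.size)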
Example #4
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(
             psm=PSM.SINGLE_LINE,
             path=TESSDATA_PREFIX,
     ) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = from_file(self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 for line in region.get_TextLine():
                     log.debug("Detecting words in line '%s'", line.id)
                     image = self.workspace.resolve_image_as_pil(
                         image_url,
                         polygon_from_points(line.get_Coords().points))
                     tessapi.SetImage(image)
                     offset = xywh_from_points(line.get_Coords().points)
                     for (word_no, component) in enumerate(
                             tessapi.GetComponentImages(RIL.WORD, True)):
                         word_id = '%s_word%04d' % (line.id, word_no)
                         word_xywh = component[1]
                         word_xywh['x'] += offset['x']
                         word_xywh['y'] += offset['y']
                         line.add_Word(
                             WordType(id=word_id,
                                      Coords=CoordsType(
                                          points_from_xywh(word_xywh))))
             ID = concat_padded(self.output_file_grp, n)
             self.add_output_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 basename=ID + '.xml',
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
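
Examples #1, #3 and #4 convert between three coordinate representations: the PAGE points string ('x1,y1 x2,y2 ...'), a bounding-box dict with 'x'/'y'/'w'/'h' keys, and a polygon list used for cropping. The following minimal re-implementations are for illustration only and are not the actual ocrd_utils helpers (which, among other things, also handle float coordinates):

def xywh_from_points(points):
    """'x1,y1 x2,y2 ...' -> bounding-box dict {'x', 'y', 'w', 'h'}."""
    xs, ys = zip(*(map(int, pair.split(',')) for pair in points.split(' ')))
    return {'x': min(xs), 'y': min(ys),
            'w': max(xs) - min(xs), 'h': max(ys) - min(ys)}

def points_from_xywh(box):
    """Bounding-box dict -> 4-corner PAGE points string."""
    x, y, w, h = box['x'], box['y'], box['w'], box['h']
    return "%i,%i %i,%i %i,%i %i,%i" % (x, y, x + w, y, x + w, y + h, x, y + h)

def polygon_from_points(points):
    """'x1,y1 x2,y2 ...' -> [[x1, y1], [x2, y2], ...] for image cropping."""
    return [[int(c) for c in pair.split(',')] for pair in points.split(' ')]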
Example #5
 def write_alignment_to_xml(self):
     """
     Write the alignments into a new output file group.
     The alignment is anchored to the master file (the input file at index 0).
     """
     self.log.info("writing alignment to %s", self.process.output_file_grp)
     master = self.ifs[0]
     pcgts = from_file(self.process.workspace.download_file(master))
     ilist = iter(self.line_alignments)
     for region in pcgts.get_Page().get_TextRegion():
         for line in region.get_TextLine():
             self.log.info("line: %s", line.get_TextEquiv()[0].Unicode)
             line.get_TextEquiv()[0].set_index(0)
             current = next(ilist)
             self.add_line_alignments(line, current)
             self.add_word_alignments(line, current)
     self.log.debug("master basename: %s", master.basename)
     self.process.add_output_file(
         ID="{}_{}".format(master.ID, self.process.output_file_grp),
         mimetype=MIMETYPE_PAGE,
         content=to_xml(pcgts),
         file_grp=self.process.output_file_grp,
         basename=master.basename,
     )
Example #6
 def process(self):
     """
     Performs the (text) recognition.
     """
     print(self.parameter)
     if self.parameter['textequiv_level'] not in ['line', 'glyph']:
         raise Exception("currently only implemented at the line/glyph level")
     model = get_languages()[1][-1] # last installed model
     if 'model' in self.parameter:
         model = self.parameter['model']
         if model not in get_languages()[1]:
             raise Exception("configured model " + model + " is not installed")
     with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
         log.info("Using model %s in %s for recognition", model, get_languages()[0])
         for (n, input_file) in enumerate(self.input_files):
             log.info("INPUT FILE %i / %s", n, input_file)
             pcgts = from_file(self.workspace.download_file(input_file))
             # TODO use binarized / gray
             pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
             tessapi.SetImage(pil_image)
             # TODO slow
             #  tessapi.SetPageSegMode(PSM.SINGLE_LINE)
             log.info("page %s", pcgts)
             for region in pcgts.get_Page().get_TextRegion():
                 textlines = region.get_TextLine()
                 log.info("About to recognize text in %i lines of region '%s'", len(textlines), region.id)
                 for line in textlines:
                     log.debug("Recognizing text in line '%s'", line.id)
                     xywh = xywh_from_points(line.get_Coords().points)
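                     # the PAGE line coords are page-absolute, which matches
                     # SetRectangle() here because SetImage() above received
                     # the full page image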
                     tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                     tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                     #  log.debug("xywh: %s", xywh)
                     line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                     #  tessapi.G
                     #  print(tessapi.AllWordConfidences())
                     if self.parameter['textequiv_level'] == 'glyph':
                         for word in line.get_Word():
                             log.debug("Recognizing text in word '%s'", word.id)
                             xywh = xywh_from_points(word.get_Coords().points)
                             tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                             tessapi.SetPageSegMode(PSM.SINGLE_WORD)
                             word.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                             result_it = tessapi.GetIterator()
                             for (result_no, result) in enumerate(iterate_level(result_it, RIL.SYMBOL)):
                                 #symb = result.GetUTF8Text(RIL.SYMBOL) # is first choice?
                                 #conf = result.Confidence(RIL.SYMBOL) # is first choice?
                                 bbox = result.BoundingBox(RIL.SYMBOL)
                                 if bbox is None:
                                     continue
                                 glyph_id = '%s_glyph%04d' % (word.id, result_no)
                                 log.debug("Recognizing text in glyph '%s'", glyph_id)
                                 glyph = GlyphType(id=glyph_id, Coords=CoordsType(points_from_x0y0x1y1(bbox)))
                                 word.add_Glyph(glyph)
                                 choice_it = result.GetChoiceIterator()
                                 for (choice_no, choice) in enumerate(choice_it):
                                     alternative_symb = choice.GetUTF8Text()
                                     alternative_conf = choice.Confidence()
                                     glyph.add_TextEquiv(TextEquivType(index=choice_no, conf=alternative_conf, Unicode=alternative_symb))
             ID = concat_padded(self.output_file_grp, n)
             self.add_output_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 basename=ID + '.xml',
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts),
             )
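
The glyph branch in Example #6 drives Tesseract's result iterator directly. A minimal standalone sketch of that pattern (assuming a local 'page.png' and a default tessdata installation; in the snippet itself the image and rectangles come from the OCR-D workspace instead):

from tesserocr import PyTessBaseAPI, RIL, iterate_level

with PyTessBaseAPI() as api:
    api.SetImageFile('page.png')          # hypothetical input image
    api.Recognize()                       # populate the result iterator
    result_it = api.GetIterator()
    for symbol in iterate_level(result_it, RIL.SYMBOL):
        if symbol.Empty(RIL.SYMBOL):
            continue
        print(symbol.GetUTF8Text(RIL.SYMBOL), symbol.Confidence(RIL.SYMBOL))
        # per-symbol alternatives; with LSTM models these may only appear when
        # the 'glyph_confidences' variable is set, cf. Example #7
        for choice in symbol.GetChoiceIterator():
            print('  alternative:', choice.GetUTF8Text(), choice.Confidence())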
Example #7
 def process(self):
     """
     Performs the (text) recognition.
     """
     # print(self.parameter)
     log.debug("TESSDATA: %s, installed tesseract models: %s",
               *get_languages())
     maxlevel = self.parameter['textequiv_level']
     model = get_languages()[1][-1]  # last installed model
     if 'model' in self.parameter:
         model = self.parameter['model']
         if model not in get_languages()[1]:
             raise Exception("configured model " + model +
                             " is not installed")
     with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
         log.info("Using model '%s' in %s for recognition at the %s level",
                  model,
                  get_languages()[0], maxlevel)
         tessapi.SetVariable("glyph_confidences", "2")  # populate GetChoiceIterator() with LSTM models, too
         # tessapi.SetVariable("tessedit_single_match", "0")
         #
         # tessedit_load_sublangs
         # tessedit_preserve_min_wd_len 2
         # tessedit_prefer_joined_punct 0
         # tessedit_write_rep_codes 0
         # tessedit_parallelize 0
         # tessedit_zero_rejection 0
         # tessedit_zero_kelvin_rejection 0
         # tessedit_reject_mode 0
         # tessedit_use_reject_spaces 1
         # tessedit_fix_fuzzy_spaces 1
         # tessedit_char_blacklist
         # tessedit_char_whitelist
         # chs_leading_punct ('`"
         # chs_trailing_punct1 ).,;:?!
         # chs_trailing_punct2 )'`"
         # numeric_punctuation .,
         # unrecognised_char |
         # ok_repeated_ch_non_alphanum_wds -?*=
         # conflict_set_I_l_1 Il1[]
         # preserve_interword_spaces 0
         # tessedit_enable_dict_correction 0
         # tessedit_enable_bigram_correction 1
         # stopper_smallword_size 2
         # wordrec_max_join_chunks 4
         # suspect_space_level 100
         # suspect_short_words 2
         # language_model_ngram_on 0
         # language_model_ngram_order 8
         # language_model_min_compound_length 3
         # language_model_penalty_non_freq_dict_word 0.1
         # language_model_penalty_non_dict_word 0.15
         # language_model_penalty_punc 0.2
         # language_model_penalty_case 0.1
         # language_model_penalty_script 0.5
         # language_model_penalty_chartype 0.3
         # language_model_penalty_spacing 0.05
         # textord_max_noise_size 7
         # enable_noise_removal 1
         # classify_bln_numeric_mode 0
         # lstm_use_matrix 1
         # user_words_file
         # user_patterns_file
         for (n, input_file) in enumerate(self.input_files):
             log.info("INPUT FILE %i / %s", n, input_file)
             pcgts = from_file(self.workspace.download_file(input_file))
             # TODO use binarized / gray
             pil_image = self.workspace.resolve_image_as_pil(
                 pcgts.get_Page().imageFilename)
             tessapi.SetImage(pil_image)
             metadata = pcgts.get_Metadata()  # ensured by from_file()
             metadata.add_MetadataItem(
                 MetadataItemType(
                     type_="processingStep",
                     name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']['steps'][0],
                     value='ocrd-tesserocr-recognize',
                     Labels=[LabelsType(
                         externalRef="parameters",
                         Label=[LabelType(type_=name, value=self.parameter[name])
                                for name in self.parameter.keys()])]))
             # TODO slow
             #  tessapi.SetPageSegMode(PSM.SINGLE_LINE)
             log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId())
             page = pcgts.get_Page()
             if maxlevel == 'page':
                 # not sure what to do here:
                 # - We cannot simply do GetUTF8Text(), because there is no
                 #   TextEquiv on the page level.
                 # - We could GetComponentImages(RIL.BLOCK) and add a text
                 #   region for each, then enter region level recognition
                 #   below. But what if regions are already annotated? How to
                 #   go about non-text blocks?
                 raise Exception(
                     "currently only implemented below the page level")
             ## region, line, word, or glyph level:
             regions = page.get_TextRegion()
             if not regions:
                 log.warning("Page contains no text regions")
             self._process_regions(regions, maxlevel, tessapi)
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 basename=ID + '.xml',
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts),
             )