示例#1
0
def _split_word_at_space(word):
    """Split a Word element at its first space character.

    Returns a pair of new WordType objects ('_l' / '_r' id suffixes)
    covering the text before and after the space. Their Coords are
    interpolated horizontally in proportion to the character position
    of the space; language and TextStyle are copied from the original.
    """
    left = WordType(id=word.id + '_l')
    right = WordType(id=word.id + '_r')
    box = xywh_from_points(word.get_Coords().points)
    equiv = word.get_TextEquiv()[0]
    cut = equiv.Unicode.index(" ")
    ratio = cut / len(equiv.Unicode)
    # left part keeps the origin and gets a proportional share of the width
    box_left = dict(box, w=box['w'] * ratio)
    left.set_Coords(CoordsType(points=points_from_xywh(box_left)))
    # right part starts where the left part ends
    box_right = dict(box,
                     x=box['x'] + box['w'] * ratio,
                     w=box['w'] * (1 - ratio))
    right.set_Coords(CoordsType(points=points_from_xywh(box_right)))
    language = word.get_language()
    if language:
        left.set_language(language)
        right.set_language(language)
    style = word.get_TextStyle()
    if style:
        left.set_TextStyle(style)
        right.set_TextStyle(style)
    # Glyphs: irrelevant at this processing level
    left.set_TextEquiv([TextEquivType(Unicode=equiv.Unicode[:cut],
                                      conf=equiv.conf)])
    right.set_TextEquiv([TextEquivType(Unicode=equiv.Unicode[cut + 1:],
                                       conf=equiv.conf)])
    return left, right
示例#2
0
def _page_update_higher_textequiv_levels(level, pcgts):
    """Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.

    Starting with the hierarchy level chosen for processing,
    join all first TextEquiv (by the rules governing the respective level)
    into TextEquiv of the next higher level, replacing them.
    """
    def _first_unicode(segment):
        # first TextEquiv string of a segment, or empty if none annotated
        equivs = segment.get_TextEquiv()
        return equivs[0].Unicode if equivs else u''

    if level == 'region':
        return  # nothing above region to update here
    for region in pcgts.get_Page().get_TextRegion():
        lines = region.get_TextLine()
        if level != 'line':
            for line in lines:
                words = line.get_Word()
                if level != 'word':
                    for word in words:
                        # glyphs concatenate without separator
                        word_unicode = u''.join(_first_unicode(glyph)
                                                for glyph in word.get_Glyph())
                        word.set_TextEquiv(
                            [TextEquivType(Unicode=word_unicode)])  # remove old
                # words join with single spaces
                line_unicode = u' '.join(_first_unicode(word)
                                         for word in words)
                line.set_TextEquiv(
                    [TextEquivType(Unicode=line_unicode)])  # remove old
        # lines join with newlines
        region_unicode = u'\n'.join(_first_unicode(line) for line in lines)
        region.set_TextEquiv(
            [TextEquivType(Unicode=region_unicode)])  # remove old
示例#3
0
文件: clean.py 项目: jacektl/ocrd_cis
    def process(self):
        """
        Performs the (text) recognition.

        For each input PAGE-XML file, read the word-level text at
        TextEquiv index ``mainIndex`` and concatenate it bottom-up into
        new line, region and page TextEquivs, then write the result as a
        new file into the output file group.
        """

        mainIndex = self.parameter['mainIndex']

        for (n, input_file) in enumerate(self.input_files):

            alignurl = input_file.url
            pcgts = parse(alignurl, True)
            page = pcgts.get_Page()
            regions = page.get_TextRegion()

            pagecontent = ''
            for region in regions:
                regioncontent = ''

                lines = region.get_TextLine()
                for line in lines:
                    linecontent = ''

                    words = line.get_Word()
                    for word in words:
                        wordunicode = word.get_TextEquiv()[mainIndex].Unicode
                        word.add_TextEquiv(TextEquivType(Unicode=wordunicode))
                        linecontent += ' ' + wordunicode

                    # BUGFIX: previously the *region* text accumulated so far
                    # was attached to the line; use the line's own text here.
                    line.add_TextEquiv(TextEquivType(Unicode=linecontent))
                    regioncontent += '\n' + linecontent

                region.add_TextEquiv(TextEquivType(Unicode=regioncontent))
                pagecontent += '\n' + regioncontent

            page.add_TextEquiv(TextEquivType(Unicode=pagecontent))

            ID = concat_padded(self.output_file_grp, n)
            self.log.info('creating file id: %s, name: %s, file_grp: %s', ID,
                          input_file.basename, self.output_file_grp)
            # Use the input file's basename for the new file
            # this way the files retain the same basenames.
            out = self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                basename=self.output_file_grp + '-' + input_file.basename,
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
            self.log.info('created file %s', out)
示例#4
0
 def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh):
     """Recognize text alternatives for pre-existing glyph segments.

     For each glyph, crop its image from the word image, run Tesseract in
     single-character mode, and annotate one TextEquiv per choice
     alternative (until the confidence/number thresholds are exceeded),
     replacing any existing TextEquivs.
     """
     LOG = getLogger('processor.TesserocrRecognize')
     for glyph in glyphs:
         glyph_image, _ = self.workspace.image_from_segment(
             glyph, word_image, word_xywh)
         # optionally pad the glyph image before recognition
         if self.parameter['padding']:
             tessapi.SetImage(pad_image(glyph_image, self.parameter['padding']))
         else:
             tessapi.SetImage(glyph_image)
         tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
         LOG.debug("Recognizing text in glyph '%s'", glyph.id)
         if glyph.get_TextEquiv():
             LOG.warning("Glyph '%s' already contained text results", glyph.id)
             glyph.set_TextEquiv([])
         #glyph_text = tessapi.GetUTF8Text().rstrip("\n\f")
         # best-result confidence scaled from percent to [0,1];
         # defaults to 1.0 when Tesseract reports none
         glyph_conf = tessapi.AllWordConfidences()
         glyph_conf = glyph_conf[0]/100.0 if glyph_conf else 1.0
         #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
         result_it = tessapi.GetIterator()
         if not result_it or result_it.Empty(RIL.SYMBOL):
             LOG.error("No text in glyph '%s'", glyph.id)
             continue
         choice_it = result_it.GetChoiceIterator()
         for (choice_no, choice) in enumerate(choice_it):
             alternative_text = choice.GetUTF8Text()
             alternative_conf = choice.Confidence()/100
             #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
             # stop once alternatives become too unlikely or too numerous
             if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                 choice_no > CHOICE_THRESHOLD_NUM):
                 break
             # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
             glyph.add_TextEquiv(TextEquivType(index=choice_no, Unicode=alternative_text, conf=alternative_conf))
示例#5
0
 def _process_existing_words(self, tessapi, words, line_image, line_xywh):
     """Recognize text in pre-existing word segments of a line.

     Crops each word image from the line image and runs Tesseract in
     single-word mode. At 'word' textequiv_level, annotates the word text
     directly (replacing existing TextEquivs); otherwise descends to the
     glyph level, reusing existing glyph segments if present.
     """
     LOG = getLogger('processor.TesserocrRecognize')
     for word in words:
         word_image, word_xywh = self.workspace.image_from_segment(
             word, line_image, line_xywh)
         # optionally pad the word image before recognition
         if self.parameter['padding']:
             tessapi.SetImage(pad_image(word_image, self.parameter['padding']))
         else:
             tessapi.SetImage(word_image)
         tessapi.SetPageSegMode(PSM.SINGLE_WORD)
         if self.parameter['textequiv_level'] == 'word':
             LOG.debug("Recognizing text in word '%s'", word.id)
             word_text = tessapi.GetUTF8Text().rstrip("\n\f")
             # confidence scaled from percent to [0,1]; 0.0 when missing
             word_conf = tessapi.AllWordConfidences()
             word_conf = word_conf[0]/100.0 if word_conf else 0.0
             if word.get_TextEquiv():
                 LOG.warning("Word '%s' already contained text results", word.id)
                 word.set_TextEquiv([])
             # todo: consider WordFontAttributes (TextStyle) etc (if not word.get_TextStyle())
             word.add_TextEquiv(TextEquivType(Unicode=word_text, conf=word_conf))
             continue # next word (to avoid indentation below)
         ## glyph level:
         glyphs = word.get_Glyph()
         if glyphs:
             ## external glyph layout:
             LOG.warning("Word '%s' contains glyphs already, recognition might be suboptimal", word.id)
             self._process_existing_glyphs(tessapi, glyphs, word_image, word_xywh)
         else:
             ## internal glyph layout:
             tessapi.Recognize()
             self._process_glyphs_in_word(tessapi.GetIterator(), word, word_xywh)
示例#6
0
 def _process_regions(self, tessapi, regions, page_image, page_xywh):
     """Recognize text in the given text regions of a page.

     At 'region' textequiv_level, runs Tesseract in single-block mode on
     each region image and annotates the region text (replacing existing
     TextEquivs); otherwise delegates to line-level processing.
     """
     LOG = getLogger('processor.TesserocrRecognize')
     for region in regions:
         region_image, region_xywh = self.workspace.image_from_segment(
             region, page_image, page_xywh)
         if self.parameter['textequiv_level'] == 'region':
             # optionally pad the region image before recognition
             if self.parameter['padding']:
                 tessapi.SetImage(pad_image(region_image, self.parameter['padding']))
             else:
                 tessapi.SetImage(region_image)
             tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
             #if region.get_primaryScript() not in tessapi.GetLoadedLanguages()...
             LOG.debug("Recognizing text in region '%s'", region.id)
             region_text = tessapi.GetUTF8Text().rstrip("\n\f")
             region_conf = tessapi.MeanTextConf()/100.0 # iterator scores are arithmetic averages, too
             if region.get_TextEquiv():
                 LOG.warning("Region '%s' already contained text results", region.id)
                 region.set_TextEquiv([])
             # todo: consider SetParagraphSeparator
             region.add_TextEquiv(TextEquivType(Unicode=region_text, conf=region_conf))
             continue # next region (to avoid indentation below)
         ## line, word, or glyph level:
         textlines = region.get_TextLine()
         if not textlines:
             LOG.warning("Region '%s' contains no text lines", region.id)
         else:
             self._process_lines(tessapi, textlines, region_image, region_xywh)
示例#7
0
def combine_windows_to_graph(windows):
    '''
    Combine windows FSTs containing hypotheses for given windows to a
    graph of hypotheses in `nx.DiGraph` format, with decoding
    alternatives represented as `TextEquivType` at the edges. This is
    suitable for decoding data supplied in PageXML input format.

    The windows are passed as a dictionary:
        (starting_position, length) -> window_fst
    '''
    graph = nx.DiGraph()
    # nodes are token positions; the rightmost one terminates the line
    last_node = max(start + length for start, length in windows)
    graph.add_nodes_from(range(last_node + 1))
    for (start, length), fst in windows.items():
        hypotheses = [(out_str, float(weight))
                      for _, out_str, weight in fst.paths().items()]
        if not hypotheses:
            logging.warning('No path from {} to {}.'.format(start, start + length))
            continue
        for out_str, weight in hypotheses:
            logging.debug('({}, {}, \'{}\', {})'.format(
                start, start + length, out_str, pow(2, -weight)))
        # one edge per window, carrying every decoding alternative
        graph.add_edge(
            start, start + length, element=None,
            alternatives=[TextEquivType(Unicode=out_str, conf=pow(2, -weight))
                          for out_str, weight in hypotheses])
    return graph
示例#8
0
 def _process_lines(self, textlines, maxlevel, tessapi):
     """Recognize text in the given text lines via rectangle cropping.

     Restricts Tesseract to each line's bounding box (from its Coords)
     in single-line mode. At maxlevel 'line', annotates the line text
     (replacing existing TextEquivs); otherwise descends to word level,
     reusing existing word segments if present.
     """
     for line in textlines:
         log.debug("Recognizing text in line '%s'", line.id)
         line_xywh = xywh_from_points(line.get_Coords().points)
         #  log.debug("xywh: %s", line_xywh)
         tessapi.SetRectangle(line_xywh['x'], line_xywh['y'],
                              line_xywh['w'], line_xywh['h'])
         tessapi.SetPageSegMode(
             PSM.SINGLE_LINE
         )  # RAW_LINE fails with Tesseract 3 models and is worse with Tesseract 4 models
         if maxlevel == 'line':
             line_text = tessapi.GetUTF8Text().rstrip("\n\f")
             # confidence scaled from percent to [0,1]
             line_conf = tessapi.MeanTextConf(
             ) / 100.0  # iterator scores are arithmetic averages, too
             if line.get_TextEquiv():
                 log.warning("Line '%s' already contained text results",
                             line.id)
                 line.set_TextEquiv([])
             # todo: consider BlankBeforeWord, SetLineSeparator
             line.add_TextEquiv(
                 TextEquivType(Unicode=line_text, conf=line_conf))
             continue  # next line (to avoid indentation below)
         ## word, or glyph level:
         words = line.get_Word()
         if words:
             ## external word layout:
             log.warning(
                 "Line '%s' contains words already, recognition might be suboptimal",
                 line.id)
             self._process_existing_words(words, maxlevel, tessapi)
         else:
             ## internal word and glyph layout:
             tessapi.Recognize()
             self._process_words_in_line(line, maxlevel,
                                         tessapi.GetIterator())
示例#9
0
def set_text(node, text, page_textequiv_strategy):
    """
    Set the first or most confident among text results (depending on ``page_textequiv_strategy``).
    For the strategy ``best``, set the string of the highest scoring result.
    For the strategy ``first``, set the string of the lowest indexed result.
    If there are no scores/indexes, use the first result.
    If there are no results, add a new one.
    """
    text = text.strip()
    equivs = node.get_TextEquiv()
    if not equivs:
        node.add_TextEquiv(TextEquivType(Unicode=text))  # or index=0 ?
        return
    target = equivs[0]  # fall-back: first element
    if page_textequiv_strategy == 'best':
        if len(equivs) > 1:
            scored = [equiv for equiv in equivs if equiv.conf]
            if scored:
                # generateDS does not convert simpleType for attributes (yet?)
                scored.sort(key=lambda equiv: float(equiv.conf))
                target = scored[-1]
    #elif page_textequiv_strategy == 'first':
    else:
        if len(equivs) > 1:
            indexed = [equiv for equiv in equivs
                       if isinstance(equiv.index, int)]
            if indexed:
                target = min(indexed, key=lambda equiv: equiv.index)
    target.set_Unicode(text)
示例#10
0
 def _process_existing_words(self, words, maxlevel, tessapi):
     """Recognize text in pre-existing word segments via rectangle cropping.

     Restricts Tesseract to each word's bounding box in single-word mode.
     At maxlevel 'word', annotates the word text (replacing existing
     TextEquivs); otherwise descends to glyph level, reusing existing
     glyph segments if present.
     """
     for word in words:
         log.debug("Recognizing text in word '%s'", word.id)
         word_xywh = xywh_from_points(word.get_Coords().points)
         tessapi.SetRectangle(word_xywh['x'], word_xywh['y'],
                              word_xywh['w'], word_xywh['h'])
         tessapi.SetPageSegMode(PSM.SINGLE_WORD)
         if maxlevel == 'word':
             word_text = tessapi.GetUTF8Text().rstrip("\n\f")
             # confidence scaled from percent to [0,1]; 0.0 when missing
             word_conf = tessapi.AllWordConfidences()
             word_conf = word_conf[0] / 100.0 if word_conf else 0.0
             if word.get_TextEquiv():
                 log.warning("Word '%s' already contained text results",
                             word.id)
                 word.set_TextEquiv([])
             # todo: consider WordFontAttributes (TextStyle) etc (if not word.get_TextStyle())
             word.add_TextEquiv(
                 TextEquivType(Unicode=word_text, conf=word_conf))
             continue  # next word (to avoid indentation below)
         ## glyph level:
         glyphs = word.get_Glyph()
         if glyphs:
             ## external glyph layout:
             log.warning(
                 "Word '%s' contains glyphs already, recognition might be suboptimal",
                 word.id)
             self._process_existing_glyphs(glyphs, tessapi)
         else:
             ## internal glyph layout:
             tessapi.Recognize()
             self._process_glyphs_in_word(word, tessapi.GetIterator())
示例#11
0
 def _combine_windows_to_line_graph(self, windows):
     """Assemble per-window FST hypotheses into one line-level DiGraph.

     Nodes are token positions; each window contributes a single edge
     from its start to its end position, annotated with the window's
     reference element and all decoding alternatives as TextEquivs.
     """
     LOG = getLogger('processor.FSTCorrection')
     graph = nx.DiGraph()
     graph.add_nodes_from(range(max(i + j for i, j in windows) + 1))
     for (i, j), (ref, fst, tokens) in windows.items():
         # FIXME: this will NOT work without spaces and newlines (as before 81dd2c0c)!
         paths = [(output_str, float(weight))
                  for _, output_str, weight in fst.paths().items()]
         if not paths:
             LOG.warning('No path from {} to {}.'.format(i, i+j))
             continue
         for output_str, weight in paths:
             LOG.info('({}, {}, \'{}\', {})'.format(
                 i, i + j, output_str, pow(2, -weight)))
         graph.add_edge(
             i, i + j, element=ref,
             alternatives=[TextEquivType(Unicode=output_str,
                                         conf=pow(2, -weight))
                           for output_str, weight in paths])
     return graph
示例#12
0
 def _process_existing_glyphs(self, glyphs, tessapi):
     """Recognize alternatives in pre-existing glyph segments via rectangle cropping.

     Restricts Tesseract to each glyph's bounding box in single-character
     mode and annotates one TextEquiv per choice alternative (until the
     confidence/number thresholds are exceeded), replacing any existing
     TextEquivs.
     """
     for glyph in glyphs:
         log.debug("Recognizing glyph in word '%s'", glyph.id)
         glyph_xywh = xywh_from_points(glyph.get_Coords().points)
         tessapi.SetRectangle(glyph_xywh['x'], glyph_xywh['y'],
                              glyph_xywh['w'], glyph_xywh['h'])
         tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
         if glyph.get_TextEquiv():
             log.warning("Glyph '%s' already contained text results",
                         glyph.id)
             glyph.set_TextEquiv([])
         #glyph_text = tessapi.GetUTF8Text().rstrip("\n\f")
         # best-result confidence scaled from percent to [0,1]
         glyph_conf = tessapi.AllWordConfidences()
         glyph_conf = glyph_conf[0] / 100.0 if glyph_conf else 0.0
         #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
         result_it = tessapi.GetIterator()
         if not result_it or result_it.Empty(RIL.SYMBOL):
             log.error("No glyph here")
             continue
         choice_it = result_it.GetChoiceIterator()
         for (choice_no, choice) in enumerate(choice_it):
             alternative_text = choice.GetUTF8Text()
             alternative_conf = choice.Confidence() / 100
             #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
             # stop once alternatives become too unlikely or too numerous
             if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF
                     or choice_no > CHOICE_THRESHOLD_NUM):
                 break
             # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
             glyph.add_TextEquiv(
                 TextEquivType(index=choice_no,
                               Unicode=alternative_text,
                               conf=alternative_conf))
示例#13
0
    def process_regions(self, regions, maxlevel, page_image, page_coords):
        """Recognize text in the given regions and report the overall CER.

        Delegates per-line recognition to ``process_lines`` (which returns
        edit and length counts), then rebuilds each region's TextEquiv by
        concatenating its line texts for consistency.
        """
        # accumulated edit distance and reference length over all lines
        edits = 0
        lengs = 0
        for region in regions:
            region_image, region_coords = self.workspace.image_from_segment(
                region, page_image, page_coords)

            self.logger.info("Recognizing text in region '%s'", region.id)
            textlines = region.get_TextLine()
            if not textlines:
                self.logger.warning("Region '%s' contains no text lines",
                                    region.id)
            else:
                edits_, lengs_ = self.process_lines(textlines, maxlevel,
                                                    region_image,
                                                    region_coords)
                edits += edits_
                lengs += lengs_
            # update region text by concatenation for consistency
            region_unicode = u'\n'.join(line.get_TextEquiv(
            )[0].Unicode if line.get_TextEquiv() else u''
                                        for line in textlines)
            region.set_TextEquiv([TextEquivType(Unicode=region_unicode)])
        # avoid division by zero when no reference text was available
        if lengs > 0:
            self.logger.info('CER: %.1f%%', 100.0 * edits / lengs)
示例#14
0
文件: decode.py 项目: kba/cor-asv-fst
 def _combine_windows_to_line_graph(self, windows):
     """Build a line graph (nx.DiGraph) from per-window correction FSTs.

     Each window spanning positions [start, start+span) becomes one edge
     carrying its source element and all FST output paths as TextEquiv
     alternatives weighted by 2**(-weight).
     """
     graph = nx.DiGraph()
     line_end = max(start + span for start, span in windows)
     graph.add_nodes_from(range(line_end + 1))
     for (start, span), (ref, fst, tokens) in windows.items():
         end = start + span
         hypotheses = [(out, float(w))
                       for _, out, w in fst.paths().items()]
         if not hypotheses:
             LOG.warning('No path from {} to {}.'.format(start, end))
             continue
         for out, w in hypotheses:
             LOG.info('({}, {}, \'{}\', {})'.format(
                 start, end, out, pow(2, -w)))
         graph.add_edge(start, end, element=ref,
                        alternatives=[TextEquivType(Unicode=out,
                                                    conf=pow(2, -w))
                                      for out, w in hypotheses])
     return graph
示例#15
0
    def test_validate_multi_textequiv_index1(self):
        """set_text with 'index1' strategy must target the TextEquiv with @index=1."""
        # BUGFIX: the asset path was missing its '.xml' extension
        # (compare the sibling test_validate_multi_textequiv_first)
        ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'), silence=True)
        self.assertEqual(len(PageValidator.validate(ocrd_page=ocrd_page).errors), 25, '25 errors - strict')

        word = ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]

        # delete all textequivs
        # (BUGFIX: `del word.get_TextEquiv()[0]` only removed the first one)
        word.set_TextEquiv([])

        # Add textequiv
        set_text(word, 'FOO', 'index1')
        word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))
        word.add_TextEquiv(TextEquivType(Unicode='BAZ', conf=.5, index=1))
        self.assertEqual(get_text(word, 'index1'), 'BAZ')
        set_text(word, 'XYZ', 'index1')
        self.assertEqual(get_text(word, 'index1'), 'XYZ')
    def test_validate_multi_textequiv_first(self):
        """set_text with 'first' strategy must target the lowest-indexed TextEquiv."""
        ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'), silence=True)
        report = PageValidator.validate(ocrd_page=ocrd_page)
        self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 25, '25 textequiv consistency errors - strict')

        word = ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]

        # delete all textequivs
        word.set_TextEquiv([])

        # Add textequiv
        set_text(word, 'FOO', 'first')
        word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))
        # index=0 is the lowest index, so 'first' must pick BAZ
        word.add_TextEquiv(TextEquivType(Unicode='BAZ', conf=.5, index=0))
        self.assertEqual(get_text(word, 'first'), 'BAZ')
        set_text(word, 'XYZ', 'first')
        self.assertEqual(get_text(word, 'first'), 'XYZ')
示例#17
0
 def _process_words_in_line(self, result_it, line, line_xywh):
     """Create Word segments from Tesseract's layout of a recognized line.

     Walks the result iterator at WORD level, converting each bounding
     box back to absolute page coordinates, annotating Coords, TextStyle
     (from font attributes) and TextEquiv, and descending to glyph level
     unless textequiv_level is 'word'.
     """
     LOG = getLogger('processor.TesserocrRecognize')
     if not result_it or result_it.Empty(RIL.WORD):
         LOG.warning("No text in line '%s'", line.id)
         return
     # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD):
     word_no = 0
     while result_it and not result_it.Empty(RIL.WORD):
         word_id = '%s_word%04d' % (line.id, word_no)
         LOG.debug("Decoding text in word '%s'", word_id)
         bbox = result_it.BoundingBox(RIL.WORD)
         # convert to absolute coordinates:
         # (the padding added before recognition must be subtracted again)
         polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                           None, line_xywh) - self.parameter['padding']
         # clip the polygon into the parent line; None means no overlap
         polygon2 = polygon_for_parent(polygon, line)
         if polygon2 is not None:
             polygon = polygon2
         points = points_from_polygon(polygon)
         word = WordType(id=word_id, Coords=CoordsType(points))
         if polygon2 is None:
             # could happen due to rotation
             LOG.info('Ignoring extant word: %s', points)
         else:
             line.add_Word(word)
         # todo: determine if font attributes available for word level will work with LSTM models
         word_attributes = result_it.WordFontAttributes()
         if word_attributes:
             word_style = TextStyleType(
                 fontSize=word_attributes['pointsize']
                 if 'pointsize' in word_attributes else None,
                 fontFamily=word_attributes['font_name']
                 if 'font_name' in word_attributes else None,
                 bold=word_attributes['bold']
                 if 'bold' in word_attributes else None,
                 italic=word_attributes['italic']
                 if 'italic' in word_attributes else None,
                 underlined=word_attributes['underlined']
                 if 'underlined' in word_attributes else None,
                 monospace=word_attributes['monospace']
                 if 'monospace' in word_attributes else None,
                 serif=word_attributes['serif']
                 if 'serif' in word_attributes else None)
             word.set_TextStyle(word_style) # (or somewhere in custom attribute?)
         # add word annotation unconditionally (i.e. even for glyph level):
         word.add_TextEquiv(TextEquivType(
             Unicode=result_it.GetUTF8Text(RIL.WORD),
             conf=result_it.Confidence(RIL.WORD)/100))
         if self.parameter['textequiv_level'] != 'word':
             self._process_glyphs_in_word(result_it, word, line_xywh)
         if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
         else:
             word_no += 1
             result_it.Next(RIL.WORD)
示例#18
0
 def _process_lines(self, tessapi, textlines, region_image, region_xywh):
     """Recognize text in the given text lines of a region.

     Crops each line image (optionally padded with the image's median
     color), runs Tesseract in raw-line or single-line mode, and either
     annotates the line text directly (at 'line' textequiv_level,
     replacing existing TextEquivs) or descends to word level.
     """
     for line in textlines:
         if self.parameter['overwrite_words']:
             line.set_Word([])
         line_image, line_xywh = self.workspace.image_from_segment(
             line, region_image, region_xywh)
         # todo: Tesseract works better if the line images have a 5px margin everywhere
         if self.parameter['padding']:
             # pad with the median color so the margin blends into the background
             bg = tuple(ImageStat.Stat(line_image).median)
             pad = self.parameter['padding']
             padded = Image.new(
                 line_image.mode,
                 (line_image.width + 2 * pad, line_image.height + 2 * pad),
                 bg)
             padded.paste(line_image, (pad, pad))
             tessapi.SetImage(padded)
         else:
             tessapi.SetImage(line_image)
         if self.parameter['raw_lines']:
             tessapi.SetPageSegMode(PSM.RAW_LINE)
         else:
             tessapi.SetPageSegMode(PSM.SINGLE_LINE)
         #if line.get_primaryScript() not in tessapi.GetLoadedLanguages()...
         LOG.debug("Recognizing text in line '%s'", line.id)
         if self.parameter['textequiv_level'] == 'line':
             line_text = tessapi.GetUTF8Text().rstrip("\n\f")
             # confidence scaled from percent to [0,1]
             line_conf = tessapi.MeanTextConf(
             ) / 100.0  # iterator scores are arithmetic averages, too
             if line.get_TextEquiv():
                 LOG.warning("Line '%s' already contained text results",
                             line.id)
                 line.set_TextEquiv([])
             # todo: consider BlankBeforeWord, SetLineSeparator
             line.add_TextEquiv(
                 TextEquivType(Unicode=line_text, conf=line_conf))
             continue  # next line (to avoid indentation below)
         ## word, or glyph level:
         words = line.get_Word()
         if words:
             ## external word layout:
             LOG.warning(
                 "Line '%s' contains words already, recognition might be suboptimal",
                 line.id)
             self._process_existing_words(tessapi, words, line_image,
                                          line_xywh)
         else:
             ## internal word and glyph layout:
             tessapi.Recognize()
             self._process_words_in_line(tessapi.GetIterator(), line,
                                         line_xywh)
示例#19
0
 def _process_words_in_line(self, line, maxlevel, result_it):
     """Create Word segments from Tesseract's internal layout of a line.

     Walks the result iterator at WORD level (bounded by MAX_ELEMENTS as
     a safety net), annotating Coords, TextStyle (from font attributes)
     and TextEquiv for each word, and descending to glyph level unless
     maxlevel is 'word'.
     """
     for word_no in range(
             0, MAX_ELEMENTS
     ):  # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD)
         if not result_it:
             log.error("No iterator at '%s'", line.id)
             break
         if result_it.Empty(RIL.WORD):
             log.debug("No word here")
             break
         word_id = '%s_word%04d' % (line.id, word_no)
         log.debug("Recognizing text in word '%s'", word_id)
         word_bbox = result_it.BoundingBox(RIL.WORD)
         word = WordType(id=word_id,
                         Coords=CoordsType(points_from_x0y0x1y1(word_bbox)))
         line.add_Word(word)
         # todo: determine if font attributes available for word level will work with LSTM models
         word_attributes = result_it.WordFontAttributes()
         if word_attributes:
             word_style = TextStyleType(
                 fontSize=word_attributes['pointsize']
                 if 'pointsize' in word_attributes else None,
                 fontFamily=word_attributes['font_name']
                 if 'font_name' in word_attributes else None,
                 bold=None if 'bold' not in word_attributes else
                 word_attributes['bold'],
                 italic=None if 'italic' not in word_attributes else
                 word_attributes['italic'],
                 underlined=None if 'underlined' not in word_attributes else
                 word_attributes['underlined'],
                 monospace=None if 'monospace' not in word_attributes else
                 word_attributes['monospace'],
                 serif=None if 'serif' not in word_attributes else
                 word_attributes['serif'])
             word.set_TextStyle(
                 word_style)  # (or somewhere in custom attribute?)
         # add word annotation unconditionally (i.e. even for glyph level):
         word.add_TextEquiv(
             TextEquivType(Unicode=result_it.GetUTF8Text(RIL.WORD),
                           conf=result_it.Confidence(RIL.WORD) / 100))
         if maxlevel == 'word':
             pass
         else:
             self._process_glyphs_in_word(word, result_it)
         if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
         else:
             result_it.Next(RIL.WORD)
示例#20
0
def set_text(node, text, strategy):
    """Write *text* (stripped) into the most confident TextEquiv of *node*.

    Prefers an existing TextEquiv with @index = 1, falls back to the first
    TextEquiv, and creates a fresh one if the node has none at all.
    (The *strategy* argument is currently unused.)
    """
    stripped = text.strip()
    equivs = node.get_TextEquiv()
    if not equivs:
        node.add_TextEquiv(TextEquivType(Unicode=stripped))
        return
    #  elif strategy == 'index1':
    if len(equivs) > 1:
        preferred = next((te for te in equivs if te.index == 1), None)
        if preferred is not None:
            preferred.set_Unicode(stripped)
            return
    equivs[0].set_Unicode(stripped)
示例#21
0
 def _process_regions(self, regions, maxlevel, tessapi):
     """Run text recognition on every region in *regions*.

     When *maxlevel* is ``region``, each region is recognized as one
     single block of text (clipped to its bounding box) and annotated
     with one TextEquiv; any deeper segmentation is ignored.  Otherwise
     recognition is delegated to the contained text lines via
     ``_process_lines`` for line/word/glyph level output.
     """
     # todo: determine if and how the block iterator can still be used
     # for region classification (result_it = tessapi.GetIterator();
     # result_it.BlockType() yielding one of PT.UNKNOWN, PT.FLOWING_TEXT,
     # PT.HEADING_TEXT, PT.PULLOUT_TEXT, PT.EQUATION, PT.TABLE,
     # PT.VERTICAL_TEXT, PT.CAPTION_TEXT, PT.HORZ_LINE, PT.VERT_LINE,
     # PT.NOISE, PT.COUNT, ...)
     for region in regions:
         log.debug("Recognizing text in region '%s'", region.id)
         if maxlevel != 'region':
             ## line, word, or glyph level:
             lines = region.get_TextLine()
             if lines:
                 self._process_lines(lines, maxlevel, tessapi)
             else:
                 log.warning("Region '%s' contains no text lines", region.id)
             continue
         # region level: clip recognition to the region's bounding box
         xywh = xywh_from_points(region.get_Coords().points)
         tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
         tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
         text = tessapi.GetUTF8Text().rstrip("\n\f")
         # iterator scores are arithmetic averages, too
         conf = tessapi.MeanTextConf() / 100.0
         if region.get_TextEquiv():
             log.warning("Region '%s' already contained text results",
                         region.id)
             region.set_TextEquiv([])
         # todo: consider SetParagraphSeparator
         region.add_TextEquiv(TextEquivType(Unicode=text, conf=conf))
示例#22
0
 def _process_glyphs_in_word(self, result_it, word, word_xywh):
     """Annotate the glyphs of *word* from the Tesseract result iterator.

     Iterates ``result_it`` over RIL.SYMBOL elements until the word's final
     glyph, creating one Glyph element per symbol (with coordinates converted
     back to absolute page space) and one TextEquiv per sufficiently
     confident alternative of the glyph choice iterator.
     """
     LOG = getLogger('processor.TesserocrRecognize')
     if not result_it or result_it.Empty(RIL.SYMBOL):
         LOG.debug("No glyph in word '%s'", word.id)
         return
     # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
     glyph_no = 0
     while result_it and not result_it.Empty(RIL.SYMBOL):
         glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
         LOG.debug("Decoding text in glyph '%s'", glyph_id)
         #  glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice?
         glyph_conf = result_it.Confidence(RIL.SYMBOL)/100 # equals first choice?
         #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
         bbox = result_it.BoundingBox(RIL.SYMBOL)
         # convert to absolute coordinates:
         # NOTE(review): padding is subtracted here, presumably because the
         # word image was padded before recognition — confirm against caller.
         polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                           None, word_xywh) - self.parameter['padding']
         polygon2 = polygon_for_parent(polygon, word)
         if polygon2 is not None:
             polygon = polygon2
         points = points_from_polygon(polygon)
         glyph = GlyphType(id=glyph_id, Coords=CoordsType(points))
         if polygon2 is None:
             # could happen due to rotation
             # (glyph is then left out of the word, but still decoded below)
             LOG.info('Ignoring extant glyph: %s', points)
         else:
             word.add_Glyph(glyph)
         choice_it = result_it.GetChoiceIterator()
         for (choice_no, choice) in enumerate(choice_it):
             alternative_text = choice.GetUTF8Text()
             alternative_conf = choice.Confidence()/100
             #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
             # stop once alternatives fall too far behind the best choice,
             # or too many alternatives have been collected already:
             if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                 choice_no > CHOICE_THRESHOLD_NUM):
                 break
             # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
             glyph.add_TextEquiv(TextEquivType(index=choice_no, Unicode=alternative_text, conf=alternative_conf))
         if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
             break
         else:
             glyph_no += 1
             result_it.Next(RIL.SYMBOL)
示例#23
0
def _add_space(graph, start_node, space, last_start_node, problem, textequivs):
    """Append a pseudo-element edge for the white-space string `space` to `graph`,
    between `start_node` and a new node `start_node`+1 — unless there is a
    tokenisation `problem` involving the first textequiv in the graph's
    current tip.

    The tokenisation-inconsistency skip does not apply if:
    - the element id is not contained in the detected problem set,
    - there is no TextEquiv to compare with at the next token,
    - the element is first of its kind (i.e. must not start with white space anyway).
    """
    skip_join = bool(
        textequivs and textequivs[0].Unicode and problem
        and _repair_tokenisation(
            problem.actual,
            u''.join(edge['alternatives'][0].Unicode
                     for edge in _get_edges(graph, last_start_node)),
            textequivs[0].Unicode))
    if skip_join:
        # skip all rules for concatenation joins
        return start_node
    # joining space required for LM input here;
    # LM output will not appear in annotation
    # (so conf cannot be combined to accurate perplexity from output)
    return _add_element(graph, start_node, None,
                        [TextEquivType(Unicode=space, conf=1.0)])
示例#24
0
 def _process_glyphs_in_word(self, word, result_it):
     """Annotate the glyphs of *word* from the Tesseract result iterator.

     Walks ``result_it`` over RIL.SYMBOL elements until the word's final
     glyph (capped at MAX_ELEMENTS), adding one Glyph with coordinates per
     symbol, and one TextEquiv per sufficiently confident alternative of
     the glyph choice iterator.
     """
     glyph_no = 0
     # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL)
     while glyph_no < MAX_ELEMENTS:
         if not result_it:
             log.error("No iterator at '%s'", word.id)
             return
         if result_it.Empty(RIL.SYMBOL):
             log.debug("No glyph here")
             return
         glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
         log.debug("Recognizing text in glyph '%s'", glyph_id)
         #  glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice?
         glyph_conf = result_it.Confidence(RIL.SYMBOL) / 100  # equals first choice?
         #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
         bbox = result_it.BoundingBox(RIL.SYMBOL)
         glyph = GlyphType(id=glyph_id,
                           Coords=CoordsType(points_from_x0y0x1y1(bbox)))
         word.add_Glyph(glyph)
         for choice_no, choice in enumerate(result_it.GetChoiceIterator()):
             alternative_text = choice.GetUTF8Text()
             alternative_conf = choice.Confidence() / 100
             #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
             # stop once alternatives fall too far behind the best choice,
             # or too many alternatives have been collected already:
             if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF
                     or choice_no > CHOICE_THRESHOLD_NUM):
                 break
             # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
             glyph.add_TextEquiv(
                 TextEquivType(index=choice_no,
                               Unicode=alternative_text,
                               conf=alternative_conf))
         if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
             return
         result_it.Next(RIL.SYMBOL)
         glyph_no += 1
示例#25
0
def _merge_words(prev_, next_):
    """Merge two adjacent Word elements into a new Word.

    The merged word gets the union bounding box of both coordinate
    polygons, inherits language and TextStyle from `prev_`, concatenates
    both Glyph lists, and joins the first TextEquiv of both words
    (combining confidences by multiplication).
    """
    merged = WordType(id='.'.join([prev_.id, next_.id]))
    joint_points = (prev_.get_Coords().points + ' ' +
                    next_.get_Coords().points)
    merged.set_Coords(
        CoordsType(points=points_from_xywh(xywh_from_points(joint_points))))
    language = prev_.get_language()
    if language:
        merged.set_language(language)
    style = prev_.get_TextStyle()
    if style:
        merged.set_TextStyle(style)
    if prev_.get_Glyph() or next_.get_Glyph():
        merged.set_Glyph(prev_.get_Glyph() + next_.get_Glyph())
    # start from prev_'s text, or an empty placeholder:
    merged.set_TextEquiv(
        prev_.get_TextEquiv() or [TextEquivType(Unicode='', conf=1.0)])
    if next_.get_TextEquiv():
        joint = merged.get_TextEquiv()[0]
        other = next_.get_TextEquiv()[0]
        joint.Unicode += other.Unicode
        if joint.conf and other.conf:
            joint.conf *= other.conf
    return merged
示例#26
0
    def process(self):
        """Perform Calamari text recognition on all lines of all input pages.

        For each input PAGE-XML file: load the page image, then for every
        text line of every text region, predict with all Calamari model
        folds, vote their results, rebuild the line text from the voted
        character positions, and (re)annotate TextEquivs on the line level
        — plus word and glyph levels if so configured (inferred from the
        glyph positions, since Calamari has no word segmentation itself).
        Afterwards, propagate the new line texts up to region level and
        add the annotated PAGE-XML to the output file group.
        """

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        self._init_calamari()

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            for region in pcgts.get_Page().get_TextRegion():
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_xywh)
                    line_image_np = np.array(line_image, dtype=np.uint8)

                    # predict with every model fold, then vote over them:
                    raw_results = list(
                        self.predictor.predict_raw([line_image_np],
                                                   progress_bar=False))[0]
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p by descending probability."""
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars

                    def _drop_leading_spaces(positions):
                        """Drop positions whose best char is a space, from the start."""
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        """Drop positions whose best char is a space, from the end."""
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        """Collapse runs of space positions into a single space."""
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    # normalize whitespace in the positions the same way
                    # Calamari normalizes prediction.sentence:
                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        # i tracks the character offset into positions:
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            # skip whitespace-only "words":
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                # word box spans the full line height between
                                # the first and last glyph position:
                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                word_no += 1

                            i += word_length

            # propagate the new line texts upward to region level:
            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
示例#27
0
def _page_get_line_sequences_at(level, pcgts):
    '''Get TextEquiv sequences for PAGE-XML hierarchy level including whitespace.

    Return a list of lines from the document `pcgts`,
    where each line is a list of 3-tuples containing
    TextEquiv / Word / TextLine objects from the given
    hierarchy `level`. This includes artificial objects
    for implicit whitespace between elements (marked by
    `index=-1`, which is forbidden in the XML Schema).

    (If `level` is `glyph`, then the Word reference
     will be the Word that contains the Glyph which
     contains the TextEquiv.
     If `level` is `word`, then the Word reference
     will be the Word which contains the TextEquiv.
     If `level` is `line`, then the Word reference
     will be None.)
    '''
    LOG = getLogger('processor.ANNCorrection')
    sequences = list()
    word = None  # make accessible after loop
    line = None  # make accessible after loop
    regions = pcgts.get_Page().get_AllRegions(classes=['Text'],
                                              order='reading-order')
    if not regions:
        LOG.warning("Page contains no text regions")
    for region in regions:
        lines = region.get_TextLine()
        if not lines:
            LOG.warning("Region '%s' contains no text lines", region.id)
        for line in lines:
            # start a new sequence for this line:
            sequences.append([])
            if level == 'line':
                #LOG.debug("Getting text in line '%s'", line.id)
                textequivs = line.get_TextEquiv()
                if textequivs:
                    sequences[-1].append((textequivs[0], word, line))
                else:
                    LOG.warning("Line '%s' contains no text results", line.id)
            else:
                words = line.get_Word()
                if not words:
                    LOG.warning("Line '%s' contains no word", line.id)
                for word in words:
                    if level == 'word':
                        #LOG.debug("Getting text in word '%s'", word.id)
                        textequivs = word.get_TextEquiv()
                        if textequivs:
                            sequences[-1].append((textequivs[0], word, line))
                        else:
                            LOG.warning("Word '%s' contains no text results",
                                        word.id)
                            continue  # no inter-word
                    else:
                        glyphs = word.get_Glyph()
                        if not glyphs:
                            LOG.warning("Word '%s' contains no glyphs",
                                        word.id)
                            continue  # no inter-word
                        for glyph in glyphs:
                            #LOG.debug("Getting text in glyph '%s'", glyph.id)
                            textequivs = glyph.get_TextEquiv()
                            if textequivs:
                                sequences[-1].append(
                                    (textequivs[0], word, line))
                            else:
                                LOG.warning(
                                    "Glyph '%s' contains no text results",
                                    glyph.id)
                                # treat as gap: annotate an empty TextEquiv
                                # so the sequence stays contiguous
                                textequivs = [
                                    TextEquivType(Unicode='', conf=1.0)
                                ]
                                glyph.set_TextEquiv(textequivs)
                                sequences[-1].append(
                                    (textequivs[0], word, line))
                    # artificial inter-word space
                    # (index=-1 marks it as a pseudo-element):
                    sequences[-1].append((TextEquivType(Unicode=' ',
                                                        conf=1.0,
                                                        index=-1), word, line))
                if sequences[-1]:
                    sequences[-1].pop()  # no inter-word
            # artificial line break (also a pseudo-element):
            sequences[-1].append((TextEquivType(Unicode='\n',
                                                conf=1.0,
                                                index=-1), word, line))
    # filter empty lines (containing only newline):
    return [line for line in sequences if len(line) > 1]
示例#28
0
def page_update_higher_textequiv_levels(level, pcgts):
    """Update the TextEquivs of all PAGE-XML hierarchy levels above ``level`` for consistency.

    Starting with the hierarchy level chosen for processing,
    join all first TextEquiv.Unicode (by the rules governing the respective level)
    into TextEquiv.Unicode of the next higher level, replacing them.

    When two successive elements appear in a ``Relation`` of type ``join``,
    then join them directly (without their respective white space).

    Likewise, average all first TextEquiv.conf into TextEquiv.conf of the next higher level.

    In the process, traverse the words and lines in their respective ``readingDirection``,
    the (text) regions which contain lines in their respective ``textLineOrder``, and
    the (text) regions which contain text regions in their ``ReadingOrder``
    (if they appear there as an ``OrderedGroup``).
    Where no direction/order can be found, use XML ordering.

    Follow regions recursively, but make sure to traverse them in a depth-first strategy.
    """
    page = pcgts.get_Page()
    relations = page.get_Relations()  # get RelationsType
    if relations:
        relations = relations.get_Relation()  # get list of RelationType
    else:
        relations = []
    joins = list()  # pairs of segment ids to concatenate without separator
    for relation in relations:
        if relation.get_type() == 'join':  # ignore 'link' type here
            joins.append((relation.get_SourceRegionRef().get_regionRef(),
                          relation.get_TargetRegionRef().get_regionRef()))
    reading_order = dict()
    ro = page.get_ReadingOrder()
    if ro:
        # map region ids to their reading-order group entries:
        page_get_reading_order(
            reading_order,
            ro.get_OrderedGroup() or ro.get_UnorderedGroup())
    if level != 'region':
        for region in itertools.chain.from_iterable(
                # order is important here, because regions can be recursive,
                # and we want to concatenate by depth first;
                # typical recursion structures would be:
                #  - TextRegion/@type=paragraph inside TextRegion
                #  - TextRegion/@type=drop-capital followed by TextRegion/@type=paragraph inside TextRegion
                #  - any region (including TableRegion or TextRegion) inside a TextRegion/@type=footnote
                #  - TextRegion inside TableRegion
            [
                subregion.get_TextRegion()
                for subregion in page.get_TextRegion()
            ] + [
                subregion.get_TextRegion()
                for subregion in page.get_TableRegion()
            ] + [page.get_TextRegion()]):
            subregions = region.get_TextRegion()
            if subregions:  # already visited in earlier iterations
                # do we have a reading order for these?
                # TODO: what if at least some of the subregions are in reading_order?
                if (all(subregion.id in reading_order
                        for subregion in subregions) and isinstance(
                            reading_order[
                                subregions[0].id],  # all have .index?
                            (OrderedGroupType, OrderedGroupIndexedType))):
                    subregions = sorted(subregions,
                                        key=lambda subregion: reading_order[
                                            subregion.id].index)
                # concatenate subregion texts, separated by newline unless joined:
                region_unicode = page_element_unicode0(subregions[0])
                for subregion, next_subregion in zip(subregions,
                                                     subregions[1:]):
                    if not (subregion.id, next_subregion.id) in joins:
                        region_unicode += '\n'  # or '\f'?
                    region_unicode += page_element_unicode0(next_subregion)
                region_conf = sum(
                    page_element_conf0(subregion) for subregion in subregions)
                region_conf /= len(subregions)
            else:  # TODO: what if a TextRegion has both TextLine and TextRegion children?
                lines = region.get_TextLine()
                if ((region.get_textLineOrder() or page.get_textLineOrder()
                     ) == TextLineOrderSimpleType.BOTTOMTOTOP):
                    lines = list(reversed(lines))
                if level != 'line':
                    for line in lines:
                        words = line.get_Word()
                        if ((line.get_readingDirection()
                             or region.get_readingDirection()
                             or page.get_readingDirection()
                             ) == ReadingDirectionSimpleType.RIGHTTOLEFT):
                            words = list(reversed(words))
                        if level != 'word':
                            for word in words:
                                glyphs = word.get_Glyph()
                                if ((
                                        word.get_readingDirection()
                                        or line.get_readingDirection()
                                        or region.get_readingDirection()
                                        or page.get_readingDirection()
                                ) == ReadingDirectionSimpleType.RIGHTTOLEFT):
                                    glyphs = list(reversed(glyphs))
                                word_unicode = ''.join(
                                    page_element_unicode0(glyph)
                                    for glyph in glyphs)
                                word_conf = sum(
                                    page_element_conf0(glyph)
                                    for glyph in glyphs)
                                if glyphs:
                                    word_conf /= len(glyphs)
                                word.set_TextEquiv(  # replace old, if any
                                    [
                                        TextEquivType(Unicode=word_unicode,
                                                      conf=word_conf)
                                    ])
                        line_unicode = ' '.join(
                            page_element_unicode0(word) for word in words)
                        line_conf = sum(
                            page_element_conf0(word) for word in words)
                        if words:
                            line_conf /= len(words)
                        line.set_TextEquiv(  # replace old, if any
                            [
                                TextEquivType(Unicode=line_unicode,
                                              conf=line_conf)
                            ])
                region_unicode = ''
                region_conf = 0
                if lines:
                    region_unicode = page_element_unicode0(lines[0])
                    for line, next_line in zip(lines, lines[1:]):
                        words = line.get_Word()
                        next_words = next_line.get_Word()
                        # suppress the line break if the last word of this
                        # line is joined with the first word of the next:
                        if not (words and next_words and
                                (words[-1].id, next_words[0].id) in joins):
                            region_unicode += '\n'
                        region_unicode += page_element_unicode0(next_line)
                    region_conf = sum(
                        page_element_conf0(line) for line in lines)
                    region_conf /= len(lines)
            region.set_TextEquiv(  # replace old, if any
                [TextEquivType(Unicode=region_unicode, conf=region_conf)])
示例#29
0
    def process_lines(self, textlines, predfiles, fgrp, regionid):
        """Annotate text lines with Calamari prediction results read from JSON files.

        For each line whose id matches a prediction file named
        ``<fgrp>-<regionid>-<line.id>.json`` under ``self.root``, replace the
        line-level TextEquiv with the predicted sentence and, depending on
        ``self.maxlevel``, also create Word (and Glyph) segments whose
        coordinates are derived from the predicted character positions.

        Args:
            textlines: PAGE TextLine elements to annotate (modified in place).
            predfiles: basenames (without extension) of available prediction files.
            fgrp: fileGrp prefix used in the prediction file names.
            regionid: id of the TextRegion containing the lines.
        """
        for line in textlines:
            # Prediction files are named "<fileGrp>-<region>-<line>.json".
            target = '-'.join([fgrp, regionid, line.id])

            for predfile in predfiles:
                if predfile != target:
                    continue
                self.log.info("Processing text in line '%s'", line.id)

                filepath = self.root + '/' + predfile + '.json'
                with open(filepath) as f:
                    data = json.load(f)

                linepred = data['predictions'][0]['sentence']
                positions = data['predictions'][0]['positions']

                # Split the predicted character stream into words at spaces,
                # collecting per-word confidence lists and per-word lists of
                # (globalStart, globalEnd) character positions in parallel.
                line_conf = []  # one confidence list per word
                line_pos = []   # one position list per word
                words = []
                w = ''
                word_conf = []
                word_pos = []
                for i, d in enumerate(positions):
                    char = d['chars'][0]['char']
                    char_conf = d['chars'][0]['probability']
                    char_pos = (d['globalStart'], d['globalEnd'])

                    if char == ' ':
                        # Word boundary: flush the accumulated word.
                        words.append(w)
                        w = ''
                        line_conf.append(word_conf)
                        word_conf = []
                        line_pos.append(word_pos)
                        word_pos = []
                    else:
                        w += char
                        word_conf.append(char_conf)
                        word_pos.append(char_pos)
                        if i == len(positions) - 1:
                            # Last character of the line: flush the final word.
                            words.append(w)
                            line_conf.append(word_conf)
                            line_pos.append(word_pos)

                # Word/line confidence = mid-range of char/word confidences.
                # NOTE(review): raises ValueError for empty words (e.g. from
                # consecutive spaces) — assumes predictions contain none.
                wconfs = [(min(conf) + max(conf)) / 2 for conf in line_conf]
                lineconf = (min(wconfs) + max(wconfs)) / 2

                line.replace_TextEquiv_at(
                    0, TextEquivType(Unicode=linepred, conf=str(lineconf)))

                # BUGFIX: was `self.maxlevel == 'word' or 'glyph'`, which is
                # always true because the non-empty string 'glyph' is truthy;
                # use a proper membership test instead.
                if self.maxlevel in ('word', 'glyph'):
                    box = bounding_box(line.get_Coords().points)
                    line.Word = []
                    for w_no, w in enumerate(words):
                        # Word bbox: shift the line bbox horizontally by the
                        # first/last character offsets of this word.
                        wordbounding = (line_pos[w_no][0][0],
                                        line_pos[w_no][-1][-1])
                        word_bbox = [
                            box[0] + wordbounding[0], box[1],
                            box[2] + wordbounding[1], box[3]
                        ]

                        word_id = '%s_word%04d' % (line.id, w_no)
                        word = WordType(
                            id=word_id,
                            Coords=CoordsType(
                                points_from_x0y0x1y1(word_bbox)))

                        line.add_Word(word)
                        word.add_TextEquiv(
                            TextEquivType(Unicode=w,
                                          conf=str(wconfs[w_no])))

                        if self.maxlevel == 'glyph':
                            for glyph_no, g in enumerate(w):
                                # Glyph bbox: shift by this character's
                                # (start, end) offsets.
                                glyphbounding = (
                                    line_pos[w_no][glyph_no][0],
                                    line_pos[w_no][glyph_no][-1])
                                glyph_bbox = [
                                    box[0] + glyphbounding[0], box[1],
                                    box[2] + glyphbounding[1], box[3]
                                ]

                                glyph_id = '%s_glyph%04d' % (word.id,
                                                             glyph_no)
                                glyph = GlyphType(
                                    id=glyph_id,
                                    Coords=CoordsType(
                                        points_from_x0y0x1y1(glyph_bbox)))

                                word.add_Glyph(glyph)
                                glyph.add_TextEquiv(
                                    TextEquivType(
                                        Unicode=g,
                                        conf=str(line_conf[w_no][glyph_no])))
示例#30
0
    def process(self):
        """
        Perform text recognition with Calamari on the workspace.

        If ``textequiv_level`` is ``word`` or ``glyph``, then additionally create word / glyph level segments by
        splitting at white space characters / glyph boundaries. In the case of ``glyph``, add all alternative character
        hypotheses down to ``glyph_conf_cutoff`` confidence threshold.
        """
        log = getLogger('processor.CalamariRecognize')

        # This processor maps exactly one input file group to one output group.
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector=self.features)

            for region in page.get_AllRegions(classes=['Text']):
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    feature_selector=self.features)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                # Collect all line images of this region first so Calamari can
                # predict them in a single batch below.
                line_images_np = []
                line_coordss = []
                for line in textlines:
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_coords,
                        feature_selector=self.features)
                    if ('binarized' not in line_coords['features']
                            and 'grayscale_normalized'
                            not in line_coords['features']
                            and self.network_input_channels == 1):
                        # We cannot use a feature selector for this since we don't
                        # know whether the model expects (has been trained on)
                        # binarized or grayscale images; but raw images are likely
                        # always inadequate:
                        log.warning(
                            "Using raw image for line '%s' in region '%s'",
                            line.id, region.id)

                    # Guard against zero-size line images: substitute a 1x1
                    # dummy so np.array and the predictor don't choke.
                    line_image = line_image if all(line_image.size) else [[0]]
                    line_image_np = np.array(line_image, dtype=np.uint8)
                    line_images_np.append(line_image_np)
                    line_coordss.append(line_coords)
                # Batch prediction over all lines of this region.
                raw_results_all = self.predictor.predict_raw(
                    line_images_np, progress_bar=False)

                for line, line_coords, raw_results in zip(
                        textlines, line_coordss, raw_results_all):

                    # Tag each fold's prediction, then let the voter combine
                    # them into a single consensus prediction.
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p (non-empty, above cutoff, best first)."""
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars

                    def _drop_leading_spaces(positions):
                        """Drop positions whose best char is a space from the start."""
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        """Drop trailing space positions (leading-drop applied to the reverse)."""
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        """Collapse each run of consecutive space positions into one."""
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    # Normalize whitespace in the position stream to mirror
                    # Calamari's sentence post-processing.
                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        # i indexes into `positions`, advancing by each word's
                        # length (space "words" included).
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                # Word polygon spans the full line height between
                                # the first and last glyph x-positions.
                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                word_no += 1

                            i += word_length

            # Propagate the (new) line texts up to region level for consistency.
            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            self.add_metadata(pcgts)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))