def _split_word_at_space(word):
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh = xywh_from_points(word.get_Coords().points)
    textequiv = word.get_TextEquiv()[0]
    pos = textequiv.Unicode.index(" ")
    fract = pos / len(textequiv.Unicode)
    xywh_prev = xywh.copy()
    xywh_prev.update({'w': xywh['w'] * fract})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    xywh_next = xywh.copy()
    xywh_next.update({'x': xywh['x'] + xywh['w'] * fract,
                      'w': xywh['w'] * (1 - fract)})
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    # Glyphs: irrelevant at this processing level
    textequiv_prev = TextEquivType(Unicode=textequiv.Unicode[0:pos],
                                   conf=textequiv.conf)
    textequiv_next = TextEquivType(Unicode=textequiv.Unicode[pos + 1:],
                                   conf=textequiv.conf)
    prev_.set_TextEquiv([textequiv_prev])
    next_.set_TextEquiv([textequiv_next])
    return prev_, next_

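# A minimal usage sketch for _split_word_at_space, assuming the OCR-D imports
# used throughout this file (WordType, CoordsType, TextEquivType from
# ocrd_models.ocrd_page; xywh_from_points, points_from_xywh from ocrd_utils).
# The word id, coordinates and text below are made-up example data: the left
# part gets 3/7 of the width (position of the space), the right part the rest.
def _demo_split_word_at_space():
    word = WordType(id='w1',
                    Coords=CoordsType(points='0,0 100,0 100,20 0,20'),
                    TextEquiv=[TextEquivType(Unicode='foo bar', conf=0.9)])
    left, right = _split_word_at_space(word)
    assert left.get_TextEquiv()[0].Unicode == 'foo'
    assert right.get_TextEquiv()[0].Unicode == 'bar'
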
def _page_update_higher_textequiv_levels(level, pcgts):
    """Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.

    Starting with the hierarchy level chosen for processing,
    join all first TextEquiv (by the rules governing the respective level)
    into TextEquiv of the next higher level, replacing them.
    """
    regions = pcgts.get_Page().get_TextRegion()
    if level != 'region':
        for region in regions:
            lines = region.get_TextLine()
            if level != 'line':
                for line in lines:
                    words = line.get_Word()
                    if level != 'word':
                        for word in words:
                            glyphs = word.get_Glyph()
                            word_unicode = u''.join(
                                glyph.get_TextEquiv()[0].Unicode
                                if glyph.get_TextEquiv() else u''
                                for glyph in glyphs)
                            word.set_TextEquiv(  # remove old
                                [TextEquivType(Unicode=word_unicode)])
                    line_unicode = u' '.join(
                        word.get_TextEquiv()[0].Unicode
                        if word.get_TextEquiv() else u''
                        for word in words)
                    line.set_TextEquiv(  # remove old
                        [TextEquivType(Unicode=line_unicode)])
            region_unicode = u'\n'.join(
                line.get_TextEquiv()[0].Unicode
                if line.get_TextEquiv() else u''
                for line in lines)
            region.set_TextEquiv(  # remove old
                [TextEquivType(Unicode=region_unicode)])

def process(self):
    """
    Aggregates the (text) recognition results: concatenates the TextEquiv
    at ``mainIndex`` from the word level up to line, region and page level.
    """
    mainIndex = self.parameter['mainIndex']
    for (n, input_file) in enumerate(self.input_files):
        alignurl = input_file.url
        pcgts = parse(alignurl, True)
        page = pcgts.get_Page()
        regions = page.get_TextRegion()
        pagecontent = ''
        for region in regions:
            regioncontent = ''
            lines = region.get_TextLine()
            for line in lines:
                linecontent = ''
                words = line.get_Word()
                for word in words:
                    wordunicode = word.get_TextEquiv()[mainIndex].Unicode
                    word.add_TextEquiv(TextEquivType(Unicode=wordunicode))
                    linecontent += ' ' + wordunicode
                line.add_TextEquiv(TextEquivType(Unicode=linecontent))
                regioncontent += '\n' + linecontent
            region.add_TextEquiv(TextEquivType(Unicode=regioncontent))
            pagecontent += '\n' + regioncontent
        page.add_TextEquiv(TextEquivType(Unicode=pagecontent))
        ID = concat_padded(self.output_file_grp, n)
        self.log.info('creating file id: %s, name: %s, file_grp: %s',
                      ID, input_file.basename, self.output_file_grp)
        # Use the input file's basename for the new file -
        # this way the files retain the same basenames.
        out = self.workspace.add_file(
            ID=ID,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            basename=self.output_file_grp + '-' + input_file.basename,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
        )
        self.log.info('created file %s', out)

def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh):
    LOG = getLogger('processor.TesserocrRecognize')
    for glyph in glyphs:
        glyph_image, _ = self.workspace.image_from_segment(
            glyph, word_image, word_xywh)
        if self.parameter['padding']:
            tessapi.SetImage(pad_image(glyph_image, self.parameter['padding']))
        else:
            tessapi.SetImage(glyph_image)
        tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
        LOG.debug("Recognizing text in glyph '%s'", glyph.id)
        if glyph.get_TextEquiv():
            LOG.warning("Glyph '%s' already contained text results", glyph.id)
        glyph.set_TextEquiv([])
        #glyph_text = tessapi.GetUTF8Text().rstrip("\n\f")
        glyph_conf = tessapi.AllWordConfidences()
        glyph_conf = glyph_conf[0]/100.0 if glyph_conf else 1.0
        #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
        result_it = tessapi.GetIterator()
        if not result_it or result_it.Empty(RIL.SYMBOL):
            LOG.error("No text in glyph '%s'", glyph.id)
            continue
        choice_it = result_it.GetChoiceIterator()
        for (choice_no, choice) in enumerate(choice_it):
            alternative_text = choice.GetUTF8Text()
            alternative_conf = choice.Confidence()/100
            #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
            if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                    choice_no > CHOICE_THRESHOLD_NUM):
                break
            # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
            glyph.add_TextEquiv(TextEquivType(
                index=choice_no, Unicode=alternative_text, conf=alternative_conf))

def _process_existing_words(self, tessapi, words, line_image, line_xywh):
    LOG = getLogger('processor.TesserocrRecognize')
    for word in words:
        word_image, word_xywh = self.workspace.image_from_segment(
            word, line_image, line_xywh)
        if self.parameter['padding']:
            tessapi.SetImage(pad_image(word_image, self.parameter['padding']))
        else:
            tessapi.SetImage(word_image)
        tessapi.SetPageSegMode(PSM.SINGLE_WORD)
        if self.parameter['textequiv_level'] == 'word':
            LOG.debug("Recognizing text in word '%s'", word.id)
            word_text = tessapi.GetUTF8Text().rstrip("\n\f")
            word_conf = tessapi.AllWordConfidences()
            word_conf = word_conf[0]/100.0 if word_conf else 0.0
            if word.get_TextEquiv():
                LOG.warning("Word '%s' already contained text results", word.id)
            word.set_TextEquiv([])
            # todo: consider WordFontAttributes (TextStyle) etc (if not word.get_TextStyle())
            word.add_TextEquiv(TextEquivType(Unicode=word_text, conf=word_conf))
            continue # next word (to avoid indentation below)
        ## glyph level:
        glyphs = word.get_Glyph()
        if glyphs:
            ## external glyph layout:
            LOG.warning("Word '%s' contains glyphs already, recognition might be suboptimal",
                        word.id)
            self._process_existing_glyphs(tessapi, glyphs, word_image, word_xywh)
        else:
            ## internal glyph layout:
            tessapi.Recognize()
            self._process_glyphs_in_word(tessapi.GetIterator(), word, word_xywh)

def _process_regions(self, tessapi, regions, page_image, page_xywh):
    LOG = getLogger('processor.TesserocrRecognize')
    for region in regions:
        region_image, region_xywh = self.workspace.image_from_segment(
            region, page_image, page_xywh)
        if self.parameter['textequiv_level'] == 'region':
            if self.parameter['padding']:
                tessapi.SetImage(pad_image(region_image, self.parameter['padding']))
            else:
                tessapi.SetImage(region_image)
            tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
            #if region.get_primaryScript() not in tessapi.GetLoadedLanguages()...
            LOG.debug("Recognizing text in region '%s'", region.id)
            region_text = tessapi.GetUTF8Text().rstrip("\n\f")
            region_conf = tessapi.MeanTextConf()/100.0 # iterator scores are arithmetic averages, too
            if region.get_TextEquiv():
                LOG.warning("Region '%s' already contained text results", region.id)
            region.set_TextEquiv([])
            # todo: consider SetParagraphSeparator
            region.add_TextEquiv(TextEquivType(Unicode=region_text, conf=region_conf))
            continue # next region (to avoid indentation below)
        ## line, word, or glyph level:
        textlines = region.get_TextLine()
        if not textlines:
            LOG.warning("Region '%s' contains no text lines", region.id)
        else:
            self._process_lines(tessapi, textlines, region_image, region_xywh)

def combine_windows_to_graph(windows):
    '''Combine window FSTs containing hypotheses for the given windows into a
    graph of hypotheses in `nx.DiGraph` format, with decoding alternatives
    represented as `TextEquivType` on the edges.

    This is suitable for decoding data supplied in PAGE-XML input format.
    The windows are passed as a dictionary:
    (starting_position, length) -> window_fst
    '''
    graph = nx.DiGraph()
    line_end_node = max(i + j for i, j in windows)
    graph.add_nodes_from(range(line_end_node + 1))
    for (i, j), fst in windows.items():
        start_node = i
        end_node = i + j
        paths = [(output_str, float(weight))
                 for input_str, output_str, weight in fst.paths().items()]
        if paths:
            for path in paths:
                logging.debug('({}, {}, \'{}\', {})'.format(
                    start_node, end_node, path[0], pow(2, -path[1])))
            graph.add_edge(
                start_node, end_node, element=None,
                alternatives=[TextEquivType(Unicode=path[0], conf=pow(2, -path[1]))
                              for path in paths])
        else:
            logging.warning('No path from {} to {}.'.format(i, i + j))
    return graph

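# A hypothetical consumer (not part of the original module) showing one way
# to decode the hypothesis graph built above: take the best-confidence
# alternative per edge, weight it by negative log confidence, and read off
# the shortest path from node 0 to the line end node.
import math
import networkx as nx

def best_line_hypothesis(graph):
    weighted = nx.DiGraph()
    for u, v, data in graph.edges(data=True):
        best = max(data['alternatives'], key=lambda te: te.conf)
        # guard against conf == 0 (sketch only):
        weighted.add_edge(u, v, weight=-math.log(best.conf or 1e-9),
                          text=best.Unicode)
    path = nx.shortest_path(weighted, 0, max(graph.nodes), weight='weight')
    return ''.join(weighted.edges[u, v]['text']
                   for u, v in zip(path, path[1:]))
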
def _process_lines(self, textlines, maxlevel, tessapi):
    for line in textlines:
        log.debug("Recognizing text in line '%s'", line.id)
        line_xywh = xywh_from_points(line.get_Coords().points)
        # log.debug("xywh: %s", line_xywh)
        tessapi.SetRectangle(line_xywh['x'], line_xywh['y'],
                             line_xywh['w'], line_xywh['h'])
        # RAW_LINE fails with Tesseract 3 models and is worse with Tesseract 4 models
        tessapi.SetPageSegMode(PSM.SINGLE_LINE)
        if maxlevel == 'line':
            line_text = tessapi.GetUTF8Text().rstrip("\n\f")
            line_conf = tessapi.MeanTextConf() / 100.0 # iterator scores are arithmetic averages, too
            if line.get_TextEquiv():
                log.warning("Line '%s' already contained text results", line.id)
            line.set_TextEquiv([])
            # todo: consider BlankBeforeWord, SetLineSeparator
            line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf))
            continue # next line (to avoid indentation below)
        ## word, or glyph level:
        words = line.get_Word()
        if words:
            ## external word layout:
            log.warning("Line '%s' contains words already, recognition might be suboptimal",
                        line.id)
            self._process_existing_words(words, maxlevel, tessapi)
        else:
            ## internal word and glyph layout:
            tessapi.Recognize()
            self._process_words_in_line(line, maxlevel, tessapi.GetIterator())

def set_text(node, text, page_textequiv_strategy):
    """
    Set the first or most confident among text results
    (depending on ``page_textequiv_strategy``).
    For the strategy ``best``, set the string of the highest scoring result.
    For the strategy ``first``, set the string of the lowest indexed result.
    If there are no scores/indexes, use the first result.
    If there are no results, add a new one.
    """
    text = text.strip()
    textEquivs = node.get_TextEquiv()
    if not textEquivs:
        node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ?
    elif page_textequiv_strategy == 'best':
        if len(textEquivs) > 1:
            textEquivsSorted = sorted(
                [x for x in textEquivs if x.conf],
                # generateDS does not convert simpleType for attributes (yet?)
                key=lambda x: float(x.conf))
            if textEquivsSorted:
                textEquivsSorted[-1].set_Unicode(text)
                return
        # fall back to first element
        textEquivs[0].set_Unicode(text)
    #elif page_textequiv_strategy == 'first':
    else:
        if len(textEquivs) > 1:
            textEquivsSorted = sorted(
                [x for x in textEquivs if isinstance(x.index, int)],
                key=lambda x: x.index)
            if textEquivsSorted:
                textEquivsSorted[0].set_Unicode(text)
                return
        # fall back to first element
        textEquivs[0].set_Unicode(text)

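# Usage sketch for set_text (made-up word element, same OCR-D imports as
# above): with strategy 'best' the highest-confidence TextEquiv receives the
# new string, with 'first' the lowest-indexed one does.
def _demo_set_text():
    word = WordType(id='w1')
    word.add_TextEquiv(TextEquivType(Unicode='bar', conf=0.7, index=1))
    word.add_TextEquiv(TextEquivType(Unicode='baz', conf=0.9, index=0))
    set_text(word, 'FOO', 'best')
    assert word.get_TextEquiv()[1].Unicode == 'FOO'  # conf 0.9 wins
    set_text(word, 'BAR', 'first')
    assert word.get_TextEquiv()[1].Unicode == 'BAR'  # index 0 wins
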
def _process_existing_words(self, words, maxlevel, tessapi):
    for word in words:
        log.debug("Recognizing text in word '%s'", word.id)
        word_xywh = xywh_from_points(word.get_Coords().points)
        tessapi.SetRectangle(word_xywh['x'], word_xywh['y'],
                             word_xywh['w'], word_xywh['h'])
        tessapi.SetPageSegMode(PSM.SINGLE_WORD)
        if maxlevel == 'word':
            word_text = tessapi.GetUTF8Text().rstrip("\n\f")
            word_conf = tessapi.AllWordConfidences()
            word_conf = word_conf[0] / 100.0 if word_conf else 0.0
            if word.get_TextEquiv():
                log.warning("Word '%s' already contained text results", word.id)
            word.set_TextEquiv([])
            # todo: consider WordFontAttributes (TextStyle) etc (if not word.get_TextStyle())
            word.add_TextEquiv(TextEquivType(Unicode=word_text, conf=word_conf))
            continue # next word (to avoid indentation below)
        ## glyph level:
        glyphs = word.get_Glyph()
        if glyphs:
            ## external glyph layout:
            log.warning("Word '%s' contains glyphs already, recognition might be suboptimal",
                        word.id)
            self._process_existing_glyphs(glyphs, tessapi)
        else:
            ## internal glyph layout:
            tessapi.Recognize()
            self._process_glyphs_in_word(word, tessapi.GetIterator())

def _combine_windows_to_line_graph(self, windows):
    LOG = getLogger('processor.FSTCorrection')
    graph = nx.DiGraph()
    line_end_node = max(i+j for i, j in windows)
    graph.add_nodes_from(range(line_end_node + 1))
    for (i, j), (ref, fst, tokens) in windows.items():
        start_node = i
        end_node = i + j
        # FIXME: this will NOT work without spaces and newlines (as before 81dd2c0c)!
        paths = [(output_str, float(weight))
                 for input_str, output_str, weight in fst.paths().items()]
        if paths:
            for path in paths:
                LOG.info('({}, {}, \'{}\', {})'.format(
                    start_node, end_node, path[0], pow(2, -path[1])))
            graph.add_edge(start_node, end_node, element=ref,
                           alternatives=list(map(
                               lambda path: TextEquivType(Unicode=path[0],
                                                          conf=pow(2, -path[1])),
                               paths)))
        else:
            LOG.warning('No path from {} to {}.'.format(i, i+j))
    return graph

def _process_existing_glyphs(self, glyphs, tessapi):
    for glyph in glyphs:
        log.debug("Recognizing text in glyph '%s'", glyph.id)
        glyph_xywh = xywh_from_points(glyph.get_Coords().points)
        tessapi.SetRectangle(glyph_xywh['x'], glyph_xywh['y'],
                             glyph_xywh['w'], glyph_xywh['h'])
        tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
        if glyph.get_TextEquiv():
            log.warning("Glyph '%s' already contained text results", glyph.id)
        glyph.set_TextEquiv([])
        #glyph_text = tessapi.GetUTF8Text().rstrip("\n\f")
        glyph_conf = tessapi.AllWordConfidences()
        glyph_conf = glyph_conf[0] / 100.0 if glyph_conf else 0.0
        #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
        result_it = tessapi.GetIterator()
        if not result_it or result_it.Empty(RIL.SYMBOL):
            log.error("No glyph here")
            continue
        choice_it = result_it.GetChoiceIterator()
        for (choice_no, choice) in enumerate(choice_it):
            alternative_text = choice.GetUTF8Text()
            alternative_conf = choice.Confidence() / 100
            #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
            if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                    choice_no > CHOICE_THRESHOLD_NUM):
                break
            # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
            glyph.add_TextEquiv(TextEquivType(index=choice_no,
                                              Unicode=alternative_text,
                                              conf=alternative_conf))

def process_regions(self, regions, maxlevel, page_image, page_coords):
    edits = 0
    lengs = 0
    for region in regions:
        region_image, region_coords = self.workspace.image_from_segment(
            region, page_image, page_coords)
        self.logger.info("Recognizing text in region '%s'", region.id)
        textlines = region.get_TextLine()
        if not textlines:
            self.logger.warning("Region '%s' contains no text lines", region.id)
        else:
            edits_, lengs_ = self.process_lines(textlines, maxlevel,
                                                region_image, region_coords)
            edits += edits_
            lengs += lengs_
            # update region text by concatenation for consistency
            region_unicode = u'\n'.join(
                line.get_TextEquiv()[0].Unicode
                if line.get_TextEquiv() else u''
                for line in textlines)
            region.set_TextEquiv([TextEquivType(Unicode=region_unicode)])
    if lengs > 0:
        self.logger.info('CER: %.1f%%', 100.0 * edits / lengs)

def _combine_windows_to_line_graph(self, windows):
    graph = nx.DiGraph()
    line_end_node = max(i + j for i, j in windows)
    graph.add_nodes_from(range(line_end_node + 1))
    for (i, j), (ref, fst, tokens) in windows.items():
        start_node = i
        end_node = i + j
        paths = [(output_str, float(weight))
                 for input_str, output_str, weight in fst.paths().items()]
        if paths:
            for path in paths:
                LOG.info('({}, {}, \'{}\', {})'.format(
                    start_node, end_node, path[0], pow(2, -path[1])))
            graph.add_edge(start_node, end_node, element=ref,
                           alternatives=list(map(
                               lambda path: TextEquivType(Unicode=path[0],
                                                          conf=pow(2, -path[1])),
                               paths)))
        else:
            LOG.warning('No path from {} to {}.'.format(i, i + j))
    return graph

def test_validate_multi_textequiv_index1(self):
    ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'),
                      silence=True)
    self.assertEqual(len(PageValidator.validate(ocrd_page=ocrd_page).errors), 25,
                     '25 errors - strict')
    word = ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]
    # delete all textequivs
    del word.get_TextEquiv()[0]
    # Add textequiv
    set_text(word, 'FOO', 'index1')
    word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))
    word.add_TextEquiv(TextEquivType(Unicode='BAZ', conf=.5, index=1))
    self.assertEqual(get_text(word, 'index1'), 'BAZ')
    set_text(word, 'XYZ', 'index1')
    self.assertEqual(get_text(word, 'index1'), 'XYZ')

def test_validate_multi_textequiv_first(self):
    ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'),
                      silence=True)
    report = PageValidator.validate(ocrd_page=ocrd_page)
    self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 25,
                     '25 textequiv consistency errors - strict')
    word = ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]
    # delete all textequivs
    word.set_TextEquiv([])
    # Add textequiv
    set_text(word, 'FOO', 'first')
    word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))
    word.add_TextEquiv(TextEquivType(Unicode='BAZ', conf=.5, index=0))
    self.assertEqual(get_text(word, 'first'), 'BAZ')
    set_text(word, 'XYZ', 'first')
    self.assertEqual(get_text(word, 'first'), 'XYZ')

def _process_words_in_line(self, result_it, line, line_xywh):
    LOG = getLogger('processor.TesserocrRecognize')
    if not result_it or result_it.Empty(RIL.WORD):
        LOG.warning("No text in line '%s'", line.id)
        return
    # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD):
    word_no = 0
    while result_it and not result_it.Empty(RIL.WORD):
        word_id = '%s_word%04d' % (line.id, word_no)
        LOG.debug("Decoding text in word '%s'", word_id)
        bbox = result_it.BoundingBox(RIL.WORD)
        # convert to absolute coordinates:
        polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                          None, line_xywh) - self.parameter['padding']
        polygon2 = polygon_for_parent(polygon, line)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        word = WordType(id=word_id, Coords=CoordsType(points))
        if polygon2 is None:
            # could happen due to rotation
            LOG.info('Ignoring extant word: %s', points)
        else:
            line.add_Word(word)
        # todo: determine if font attributes available for word level will work with LSTM models
        word_attributes = result_it.WordFontAttributes()
        if word_attributes:
            word_style = TextStyleType(
                fontSize=word_attributes['pointsize']
                if 'pointsize' in word_attributes else None,
                fontFamily=word_attributes['font_name']
                if 'font_name' in word_attributes else None,
                bold=word_attributes['bold']
                if 'bold' in word_attributes else None,
                italic=word_attributes['italic']
                if 'italic' in word_attributes else None,
                underlined=word_attributes['underlined']
                if 'underlined' in word_attributes else None,
                monospace=word_attributes['monospace']
                if 'monospace' in word_attributes else None,
                serif=word_attributes['serif']
                if 'serif' in word_attributes else None)
            word.set_TextStyle(word_style) # (or somewhere in custom attribute?)
        # add word annotation unconditionally (i.e. even for glyph level):
        word.add_TextEquiv(TextEquivType(
            Unicode=result_it.GetUTF8Text(RIL.WORD),
            conf=result_it.Confidence(RIL.WORD)/100))
        if self.parameter['textequiv_level'] != 'word':
            self._process_glyphs_in_word(result_it, word, line_xywh)
        if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
            break
        else:
            word_no += 1
            result_it.Next(RIL.WORD)

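# The loop above follows the generic tesserocr result-iterator pattern. Here
# is a minimal standalone sketch of that pattern (assumes tesserocr is
# installed; 'page.png' is a made-up input path and the default model is
# used), walking all words on a page rather than within one line:
from tesserocr import PyTessBaseAPI, RIL

def _demo_iterate_words(image_path='page.png'):
    with PyTessBaseAPI() as api:
        api.SetImageFile(image_path)
        api.Recognize()
        result_it = api.GetIterator()
        while result_it and not result_it.Empty(RIL.WORD):
            yield (result_it.GetUTF8Text(RIL.WORD),
                   result_it.Confidence(RIL.WORD) / 100.0)
            # page-level walk: Next() returns False after the last word
            if not result_it.Next(RIL.WORD):
                break
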
def _process_lines(self, tessapi, textlines, region_image, region_xywh):
    for line in textlines:
        if self.parameter['overwrite_words']:
            line.set_Word([])
        line_image, line_xywh = self.workspace.image_from_segment(
            line, region_image, region_xywh)
        # todo: Tesseract works better if the line images have a 5px margin everywhere
        if self.parameter['padding']:
            bg = tuple(ImageStat.Stat(line_image).median)
            pad = self.parameter['padding']
            padded = Image.new(line_image.mode,
                               (line_image.width + 2 * pad,
                                line_image.height + 2 * pad), bg)
            padded.paste(line_image, (pad, pad))
            tessapi.SetImage(padded)
        else:
            tessapi.SetImage(line_image)
        if self.parameter['raw_lines']:
            tessapi.SetPageSegMode(PSM.RAW_LINE)
        else:
            tessapi.SetPageSegMode(PSM.SINGLE_LINE)
        #if line.get_primaryScript() not in tessapi.GetLoadedLanguages()...
        LOG.debug("Recognizing text in line '%s'", line.id)
        if self.parameter['textequiv_level'] == 'line':
            line_text = tessapi.GetUTF8Text().rstrip("\n\f")
            line_conf = tessapi.MeanTextConf() / 100.0 # iterator scores are arithmetic averages, too
            if line.get_TextEquiv():
                LOG.warning("Line '%s' already contained text results", line.id)
            line.set_TextEquiv([])
            # todo: consider BlankBeforeWord, SetLineSeparator
            line.add_TextEquiv(TextEquivType(Unicode=line_text, conf=line_conf))
            continue # next line (to avoid indentation below)
        ## word, or glyph level:
        words = line.get_Word()
        if words:
            ## external word layout:
            LOG.warning("Line '%s' contains words already, recognition might be suboptimal",
                        line.id)
            self._process_existing_words(tessapi, words, line_image, line_xywh)
        else:
            ## internal word and glyph layout:
            tessapi.Recognize()
            self._process_words_in_line(tessapi.GetIterator(), line, line_xywh)

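# The pad_image helper called in the word/glyph/region variants above is not
# shown in this excerpt. A minimal sketch consistent with the inline padding
# code in _process_lines (pads with the median background colour) could be:
from PIL import Image, ImageStat

def pad_image(image, padding):
    bg = tuple(ImageStat.Stat(image).median)
    padded = Image.new(image.mode,
                       (image.width + 2 * padding, image.height + 2 * padding),
                       bg)
    padded.paste(image, (padding, padding))
    return padded
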
def _process_words_in_line(self, line, maxlevel, result_it):
    # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD)
    for word_no in range(0, MAX_ELEMENTS):
        if not result_it:
            log.error("No iterator at '%s'", line.id)
            break
        if result_it.Empty(RIL.WORD):
            log.debug("No word here")
            break
        word_id = '%s_word%04d' % (line.id, word_no)
        log.debug("Recognizing text in word '%s'", word_id)
        word_bbox = result_it.BoundingBox(RIL.WORD)
        word = WordType(id=word_id,
                        Coords=CoordsType(points_from_x0y0x1y1(word_bbox)))
        line.add_Word(word)
        # todo: determine if font attributes available for word level will work with LSTM models
        word_attributes = result_it.WordFontAttributes()
        if word_attributes:
            word_style = TextStyleType(
                fontSize=word_attributes['pointsize']
                if 'pointsize' in word_attributes else None,
                fontFamily=word_attributes['font_name']
                if 'font_name' in word_attributes else None,
                bold=None if 'bold' not in word_attributes
                else word_attributes['bold'],
                italic=None if 'italic' not in word_attributes
                else word_attributes['italic'],
                underlined=None if 'underlined' not in word_attributes
                else word_attributes['underlined'],
                monospace=None if 'monospace' not in word_attributes
                else word_attributes['monospace'],
                serif=None if 'serif' not in word_attributes
                else word_attributes['serif'])
            word.set_TextStyle(word_style) # (or somewhere in custom attribute?)
        # add word annotation unconditionally (i.e. even for glyph level):
        word.add_TextEquiv(TextEquivType(
            Unicode=result_it.GetUTF8Text(RIL.WORD),
            conf=result_it.Confidence(RIL.WORD) / 100))
        if maxlevel != 'word':
            self._process_glyphs_in_word(word, result_it)
        if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
            break
        else:
            result_it.Next(RIL.WORD)

def set_text(node, text, strategy):
    """
    Set the most confident text result: the one with ``@index = 1`` if present,
    otherwise the first text result; if there are none, add a new one.
    """
    text = text.strip()
    textEquivs = node.get_TextEquiv()
    if not textEquivs:
        node.add_TextEquiv(TextEquivType(Unicode=text))
    # elif strategy == 'index1':
    else:
        if len(textEquivs) > 1:
            index1 = [x for x in textEquivs if x.index == 1]
            if index1:
                index1[0].set_Unicode(text)
                return
        textEquivs[0].set_Unicode(text)

def _process_regions(self, regions, maxlevel, tessapi):
    for region in regions:
        log.debug("Recognizing text in region '%s'", region.id)
        # todo: determine if and how this can still be used for region classification:
        # result_it = tessapi.GetIterator()
        # if not result_it or result_it.Empty(RIL.BLOCK)
        # ptype = result_it.BlockType()
        # PT.UNKNOWN
        # PT.FLOWING_TEXT
        # PT.HEADING_TEXT
        # PT.PULLOUT_TEXT
        # PT.EQUATION
        # PT.TABLE
        # PT.VERTICAL_TEXT
        # PT.CAPTION_TEXT
        # PT.HORZ_LINE
        # PT.VERT_LINE
        # PT.NOISE
        # PT.COUNT
        # ...
        if maxlevel == 'region':
            region_xywh = xywh_from_points(region.get_Coords().points)
            tessapi.SetRectangle(region_xywh['x'], region_xywh['y'],
                                 region_xywh['w'], region_xywh['h'])
            tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
            region_text = tessapi.GetUTF8Text().rstrip("\n\f")
            region_conf = tessapi.MeanTextConf() / 100.0 # iterator scores are arithmetic averages, too
            if region.get_TextEquiv():
                log.warning("Region '%s' already contained text results", region.id)
            region.set_TextEquiv([])
            # todo: consider SetParagraphSeparator
            region.add_TextEquiv(TextEquivType(Unicode=region_text, conf=region_conf))
            continue # next region (to avoid indentation below)
        ## line, word, or glyph level:
        textlines = region.get_TextLine()
        if not textlines:
            log.warning("Region '%s' contains no text lines", region.id)
        else:
            self._process_lines(textlines, maxlevel, tessapi)

def _process_glyphs_in_word(self, result_it, word, word_xywh):
    LOG = getLogger('processor.TesserocrRecognize')
    if not result_it or result_it.Empty(RIL.SYMBOL):
        LOG.debug("No glyph in word '%s'", word.id)
        return
    # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
    glyph_no = 0
    while result_it and not result_it.Empty(RIL.SYMBOL):
        glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
        LOG.debug("Decoding text in glyph '%s'", glyph_id)
        # glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice?
        glyph_conf = result_it.Confidence(RIL.SYMBOL)/100 # equals first choice?
        #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
        bbox = result_it.BoundingBox(RIL.SYMBOL)
        # convert to absolute coordinates:
        polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                          None, word_xywh) - self.parameter['padding']
        polygon2 = polygon_for_parent(polygon, word)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        glyph = GlyphType(id=glyph_id, Coords=CoordsType(points))
        if polygon2 is None:
            # could happen due to rotation
            LOG.info('Ignoring extant glyph: %s', points)
        else:
            word.add_Glyph(glyph)
        choice_it = result_it.GetChoiceIterator()
        for (choice_no, choice) in enumerate(choice_it):
            alternative_text = choice.GetUTF8Text()
            alternative_conf = choice.Confidence()/100
            #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
            if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                    choice_no > CHOICE_THRESHOLD_NUM):
                break
            # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
            glyph.add_TextEquiv(TextEquivType(index=choice_no,
                                              Unicode=alternative_text,
                                              conf=alternative_conf))
        if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
            break
        else:
            glyph_no += 1
            result_it.Next(RIL.SYMBOL)

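# CHOICE_THRESHOLD_NUM / CHOICE_THRESHOLD_CONF are module-level constants not
# shown in this excerpt: alternatives stop being annotated once too many have
# been emitted or their confidence trails the best result by too much.
# Plausible settings (an assumption for illustration, not necessarily the
# package's actual values):
CHOICE_THRESHOLD_NUM = 6     # max. number of alternatives per glyph
CHOICE_THRESHOLD_CONF = 0.2  # max. confidence gap to the best result
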
def _add_space(graph, start_node, space, last_start_node, problem, textequivs):
    """add a pseudo-element edge for the white-space string `space` to `graph`,
    between `start_node` and new node `start_node`+1, except if there is a
    tokenisation `problem` involving the first textequiv in the graph's current tip"""
    # tokenisation inconsistency does not apply if:
    # - element id not contained in detected problem set
    # - there is no TextEquiv to compare with at the next token
    # - the element is first of its kind (i.e. must not start with white space anyway)
    if (textequivs and textequivs[0].Unicode and problem and
            _repair_tokenisation(
                problem.actual,
                u''.join(map(lambda x: x['alternatives'][0].Unicode,
                             _get_edges(graph, last_start_node))),
                textequivs[0].Unicode)):
        pass # skip all rules for concatenation joins
    else:
        # joining space required for LM input here?
        start_node = _add_element(graph, start_node, None,
                                  [TextEquivType(Unicode=space, conf=1.0)])
        # LM output will not appear in annotation
        # (so conf cannot be combined to accurate perplexity from output)
    return start_node

def _process_glyphs_in_word(self, word, result_it):
    # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL)
    for glyph_no in range(0, MAX_ELEMENTS):
        if not result_it:
            log.error("No iterator at '%s'", word.id)
            break
        if result_it.Empty(RIL.SYMBOL):
            log.debug("No glyph here")
            break
        glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
        log.debug("Recognizing text in glyph '%s'", glyph_id)
        # glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice?
        glyph_conf = result_it.Confidence(RIL.SYMBOL) / 100 # equals first choice?
        #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
        glyph_bbox = result_it.BoundingBox(RIL.SYMBOL)
        glyph = GlyphType(id=glyph_id,
                          Coords=CoordsType(points_from_x0y0x1y1(glyph_bbox)))
        word.add_Glyph(glyph)
        choice_it = result_it.GetChoiceIterator()
        for (choice_no, choice) in enumerate(choice_it):
            alternative_text = choice.GetUTF8Text()
            alternative_conf = choice.Confidence() / 100
            #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
            if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                    choice_no > CHOICE_THRESHOLD_NUM):
                break
            # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
            glyph.add_TextEquiv(TextEquivType(index=choice_no,
                                              Unicode=alternative_text,
                                              conf=alternative_conf))
        if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
            break
        else:
            result_it.Next(RIL.SYMBOL)

def _merge_words(prev_, next_):
    merged = WordType(id=prev_.id + '.' + next_.id)
    merged.set_Coords(CoordsType(points=points_from_xywh(xywh_from_points(
        prev_.get_Coords().points + ' ' + next_.get_Coords().points))))
    if prev_.get_language():
        merged.set_language(prev_.get_language())
    if prev_.get_TextStyle():
        merged.set_TextStyle(prev_.get_TextStyle())
    if prev_.get_Glyph() or next_.get_Glyph():
        merged.set_Glyph(prev_.get_Glyph() + next_.get_Glyph())
    if prev_.get_TextEquiv():
        merged.set_TextEquiv(prev_.get_TextEquiv())
    else:
        merged.set_TextEquiv([TextEquivType(Unicode='', conf=1.0)])
    if next_.get_TextEquiv():
        textequiv = merged.get_TextEquiv()[0]
        textequiv2 = next_.get_TextEquiv()[0]
        textequiv.Unicode += textequiv2.Unicode
        if textequiv.conf and textequiv2.conf:
            textequiv.conf *= textequiv2.conf
    return merged

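# Usage sketch for _merge_words (made-up words, same OCR-D imports as above):
# the merged Coords become the bounding box over both point lists, the text is
# concatenated directly, and the confidences are multiplied, i.e. the two
# recognition results are treated as independent.
def _demo_merge_words():
    prev_ = WordType(id='w1', Coords=CoordsType(points='0,0 40,0 40,20 0,20'),
                     TextEquiv=[TextEquivType(Unicode='foo', conf=0.9)])
    next_ = WordType(id='w2', Coords=CoordsType(points='40,0 80,0 80,20 40,20'),
                     TextEquiv=[TextEquivType(Unicode='bar', conf=0.8)])
    merged = _merge_words(prev_, next_)
    assert merged.id == 'w1.w2'
    assert merged.get_TextEquiv()[0].Unicode == 'foobar'
    assert abs(merged.get_TextEquiv()[0].conf - 0.72) < 1e-6
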
def process(self):
    """
    Performs the recognition.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    self._init_calamari()

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))

        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)

        for region in pcgts.get_Page().get_TextRegion():
            region_image, region_xywh = self.workspace.image_from_segment(
                region, page_image, page_xywh)

            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            for (line_no, line) in enumerate(textlines):
                log.debug("Recognizing line '%s' in region '%s'",
                          line.id, region.id)

                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_xywh)
                line_image_np = np.array(line_image, dtype=np.uint8)

                raw_results = list(self.predictor.predict_raw(
                    [line_image_np], progress_bar=False))[0]
                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)

                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence,
                # while it does not do the same on prediction.positions.
                # Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [c for c in chars if c.char]
                    # XXX Note that omission probabilities are not normalized?!
                    chars = [c for c in chars
                             if c.probability >= self.parameter['glyph_conf_cutoff']]
                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    return list(itertools.dropwhile(
                        lambda p: _sort_chars(p)[0].char == " ", positions))

                def _drop_trailing_spaces(positions):
                    return list(reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(_sort_chars(p)[0].char for p in positions)
                if line_text != prediction.sentence:
                    log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
                                line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results", line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning("Line '%s' already contained word segmentation", line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word
                # positions from a. text segmentation and b. the glyph positions.
                # This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            if word:
                                yield word
                            word = c
                            spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    i = 0

                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end

                            polygon = polygon_from_x0y0x1y1(
                                [word_start, 0, word_end, line_image.height])
                            points = points_from_polygon(
                                coordinates_for_segment(polygon, None, line_coords))
                            # XXX Crop to line polygon?

                            word = WordType(id='%s_word%04d' % (line.id, word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(TextEquivType(Unicode=word_text))

                            if self.parameter['textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end

                                    polygon = polygon_from_x0y0x1y1(
                                        [glyph_start, 0, glyph_end, line_image.height])
                                    points = points_from_polygon(
                                        coordinates_for_segment(polygon, None, line_coords))

                                    glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no),
                                                      Coords=CoordsType(points))

                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(_sort_chars(p),
                                                                      start=char_index_start):
                                        glyph.add_TextEquiv(TextEquivType(
                                            Unicode=char.char,
                                            index=char_index,
                                            conf=char.probability))

                                    word.add_Glyph(glyph)

                            line.add_Word(word)
                            word_no += 1

                        i += word_length

        _page_update_higher_textequiv_levels('line', pcgts)

        # Add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata() # ensured by from_file()
        metadata.add_MetadataItem(MetadataItemType(
            type_="processingStep",
            name=self.ocrd_tool['steps'][0],
            value=TOOL,
            Labels=[LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[LabelType(type_=name,
                                                value=self.parameter[name])
                                      for name in self.parameter.keys()])]))

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts))

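# The nested _words helper above keeps runs of spaces as separate 'words', so
# that position indices stay aligned with the glyph positions list. A
# standalone copy for illustration:
def _demo_words(s):
    spaces = None
    word = ''
    for c in s:
        if c == ' ' and spaces is True:
            word += c
        elif c != ' ' and spaces is False:
            word += c
        else:
            if word:
                yield word
            word = c
            spaces = (c == ' ')
    yield word

assert list(_demo_words('ab  cd')) == ['ab', '  ', 'cd']
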
def _page_get_line_sequences_at(level, pcgts):
    '''Get TextEquiv sequences for PAGE-XML hierarchy level including whitespace.

    Return a list of lines from the document `pcgts`,
    where each line is a list of 3-tuples containing
    TextEquiv / Word / TextLine objects from the given
    hierarchy `level`. This includes artificial objects
    for implicit whitespace between elements (marked by
    `index=-1`, which is forbidden in the XML Schema).

    (If `level` is `glyph`, then the Word reference
    will be the Word that contains the Glyph which
    contains the TextEquiv.
    If `level` is `word`, then the Word reference
    will be the Word which contains the TextEquiv.
    If `level` is `line`, then the Word reference
    will be None.)
    '''
    LOG = getLogger('processor.ANNCorrection')
    sequences = list()
    word = None # make accessible after loop
    line = None # make accessible after loop
    regions = pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')
    if not regions:
        LOG.warning("Page contains no text regions")
    for region in regions:
        lines = region.get_TextLine()
        if not lines:
            LOG.warning("Region '%s' contains no text lines", region.id)
        for line in lines:
            sequences.append([])
            if level == 'line':
                #LOG.debug("Getting text in line '%s'", line.id)
                textequivs = line.get_TextEquiv()
                if textequivs:
                    sequences[-1].append((textequivs[0], word, line))
                else:
                    LOG.warning("Line '%s' contains no text results", line.id)
            else:
                words = line.get_Word()
                if not words:
                    LOG.warning("Line '%s' contains no word", line.id)
                for word in words:
                    if level == 'word':
                        #LOG.debug("Getting text in word '%s'", word.id)
                        textequivs = word.get_TextEquiv()
                        if textequivs:
                            sequences[-1].append((textequivs[0], word, line))
                        else:
                            LOG.warning("Word '%s' contains no text results", word.id)
                            continue # no inter-word
                    else:
                        glyphs = word.get_Glyph()
                        if not glyphs:
                            LOG.warning("Word '%s' contains no glyphs", word.id)
                            continue # no inter-word
                        for glyph in glyphs:
                            #LOG.debug("Getting text in glyph '%s'", glyph.id)
                            textequivs = glyph.get_TextEquiv()
                            if textequivs:
                                sequences[-1].append((textequivs[0], word, line))
                            else:
                                LOG.warning("Glyph '%s' contains no text results", glyph.id)
                                # treat as gap
                                textequivs = [TextEquivType(Unicode='', conf=1.0)]
                                glyph.set_TextEquiv(textequivs)
                                sequences[-1].append((textequivs[0], word, line))
                    sequences[-1].append((TextEquivType(Unicode=' ', conf=1.0, index=-1),
                                          word, line))
                if sequences[-1]:
                    sequences[-1].pop() # no inter-word
            sequences[-1].append((TextEquivType(Unicode='\n', conf=1.0, index=-1),
                                  word, line))
    # filter empty lines (containing only newline):
    return [line for line in sequences if len(line) > 1]

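# Sketch of consuming the returned sequences: each line is a list of
# (TextEquiv, Word, TextLine) triples, with artificial whitespace entries
# carrying index == -1, which can be filtered out if only genuine results
# are wanted.
def _demo_line_sequence_to_string(sequence, keep_whitespace=True):
    return ''.join(textequiv.Unicode
                   for textequiv, _word, _line in sequence
                   if keep_whitespace or textequiv.index != -1)
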
def page_update_higher_textequiv_levels(level, pcgts):
    """Update the TextEquivs of all PAGE-XML hierarchy levels above ``level`` for consistency.

    Starting with the hierarchy level chosen for processing,
    join all first TextEquiv.Unicode (by the rules governing the respective level)
    into TextEquiv.Unicode of the next higher level, replacing them.
    When two successive elements appear in a ``Relation`` of type ``join``,
    then join them directly (without their respective white space).

    Likewise, average all first TextEquiv.conf into TextEquiv.conf of the next higher level.

    In the process, traverse the words and lines in their respective ``readingDirection``,
    the (text) regions which contain lines in their respective ``textLineOrder``, and
    the (text) regions which contain text regions in their ``ReadingOrder``
    (if they appear there as an ``OrderedGroup``).
    Where no direction/order can be found, use XML ordering.

    Follow regions recursively, but make sure to traverse them in a depth-first strategy.
    """
    page = pcgts.get_Page()
    relations = page.get_Relations() # get RelationsType
    if relations:
        relations = relations.get_Relation() # get list of RelationType
    else:
        relations = []
    joins = list()
    for relation in relations:
        if relation.get_type() == 'join': # ignore 'link' type here
            joins.append((relation.get_SourceRegionRef().get_regionRef(),
                          relation.get_TargetRegionRef().get_regionRef()))
    reading_order = dict()
    ro = page.get_ReadingOrder()
    if ro:
        page_get_reading_order(reading_order,
                               ro.get_OrderedGroup() or ro.get_UnorderedGroup())
    if level != 'region':
        for region in itertools.chain.from_iterable(
                # order is important here, because regions can be recursive,
                # and we want to concatenate by depth first;
                # typical recursion structures would be:
                # - TextRegion/@type=paragraph inside TextRegion
                # - TextRegion/@type=drop-capital followed by TextRegion/@type=paragraph inside TextRegion
                # - any region (including TableRegion or TextRegion) inside a TextRegion/@type=footnote
                # - TextRegion inside TableRegion
                [subregion.get_TextRegion() for subregion in page.get_TextRegion()] +
                [subregion.get_TextRegion() for subregion in page.get_TableRegion()] +
                [page.get_TextRegion()]):
            subregions = region.get_TextRegion()
            if subregions: # already visited in earlier iterations
                # do we have a reading order for these?
                # TODO: what if at least some of the subregions are in reading_order?
                if (all(subregion.id in reading_order for subregion in subregions) and
                        isinstance(reading_order[subregions[0].id], # all have .index?
                                   (OrderedGroupType, OrderedGroupIndexedType))):
                    subregions = sorted(subregions, key=lambda subregion:
                                        reading_order[subregion.id].index)
                region_unicode = page_element_unicode0(subregions[0])
                for subregion, next_subregion in zip(subregions, subregions[1:]):
                    if not (subregion.id, next_subregion.id) in joins:
                        region_unicode += '\n' # or '\f'?
                    region_unicode += page_element_unicode0(next_subregion)
                region_conf = sum(page_element_conf0(subregion)
                                  for subregion in subregions)
                region_conf /= len(subregions)
            else: # TODO: what if a TextRegion has both TextLine and TextRegion children?
                lines = region.get_TextLine()
                if ((region.get_textLineOrder() or
                     page.get_textLineOrder()) ==
                        TextLineOrderSimpleType.BOTTOMTOTOP):
                    lines = list(reversed(lines))
                if level != 'line':
                    for line in lines:
                        words = line.get_Word()
                        if ((line.get_readingDirection() or
                             region.get_readingDirection() or
                             page.get_readingDirection()) ==
                                ReadingDirectionSimpleType.RIGHTTOLEFT):
                            words = list(reversed(words))
                        if level != 'word':
                            for word in words:
                                glyphs = word.get_Glyph()
                                if ((word.get_readingDirection() or
                                     line.get_readingDirection() or
                                     region.get_readingDirection() or
                                     page.get_readingDirection()) ==
                                        ReadingDirectionSimpleType.RIGHTTOLEFT):
                                    glyphs = list(reversed(glyphs))
                                word_unicode = ''.join(page_element_unicode0(glyph)
                                                       for glyph in glyphs)
                                word_conf = sum(page_element_conf0(glyph)
                                                for glyph in glyphs)
                                if glyphs:
                                    word_conf /= len(glyphs)
                                word.set_TextEquiv( # replace old, if any
                                    [TextEquivType(Unicode=word_unicode,
                                                   conf=word_conf)])
                        line_unicode = ' '.join(page_element_unicode0(word)
                                                for word in words)
                        line_conf = sum(page_element_conf0(word) for word in words)
                        if words:
                            line_conf /= len(words)
                        line.set_TextEquiv( # replace old, if any
                            [TextEquivType(Unicode=line_unicode, conf=line_conf)])
                region_unicode = ''
                region_conf = 0
                if lines:
                    region_unicode = page_element_unicode0(lines[0])
                    for line, next_line in zip(lines, lines[1:]):
                        words = line.get_Word()
                        next_words = next_line.get_Word()
                        if not (words and next_words and
                                (words[-1].id, next_words[0].id) in joins):
                            region_unicode += '\n'
                        region_unicode += page_element_unicode0(next_line)
                    region_conf = sum(page_element_conf0(line) for line in lines)
                    region_conf /= len(lines)
            region.set_TextEquiv( # replace old, if any
                [TextEquivType(Unicode=region_unicode, conf=region_conf)])

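# A minimal end-to-end sketch exercising the concatenation rules above
# (made-up segments; no Relations, ReadingOrder or RTL direction involved;
# assumes the helpers page_element_unicode0 / page_element_conf0 referenced
# above are defined in this module):
def _demo_update_higher_textequiv_levels():
    from ocrd_models.ocrd_page import (PcGtsType, PageType, TextRegionType,
                                       TextLineType, WordType, TextEquivType)
    word1 = WordType(id='w1', TextEquiv=[TextEquivType(Unicode='Hello', conf=0.8)])
    word2 = WordType(id='w2', TextEquiv=[TextEquivType(Unicode='world', conf=0.6)])
    line = TextLineType(id='l1', Word=[word1, word2])
    region = TextRegionType(id='r1', TextLine=[line])
    page = PageType(imageFilename='', imageWidth=0, imageHeight=0,
                    TextRegion=[region])
    page_update_higher_textequiv_levels('word', PcGtsType(Page=page))
    assert line.get_TextEquiv()[0].Unicode == 'Hello world'
    assert abs(line.get_TextEquiv()[0].conf - 0.7) < 1e-6  # averaged
    assert region.get_TextEquiv()[0].Unicode == 'Hello world'
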
def process_lines(self, textlines, predfiles, fgrp, regionid):
    for line in textlines:
        for file in predfiles:
            if file == '-'.join([fgrp, regionid, line.id]):
                self.log.info("Processing text in line '%s'", line.id)
                filepath = self.root + '/' + file + '.json'
                with open(filepath) as f:
                    data = json.load(f)
                linepred = data['predictions'][0]['sentence']
                line_conf = []
                line_pos = []
                w = ''
                word_conf = []
                words = []
                word_pos = []
                positions = data['predictions'][0]['positions']
                for i, d in enumerate(positions):
                    char = d['chars'][0]['char']
                    char_conf = d['chars'][0]['probability']
                    char_pos = (d['globalStart'], d['globalEnd'])
                    if char == ' ':
                        words.append(w)
                        w = ''
                        line_conf.append(word_conf)
                        word_conf = []
                        line_pos.append(word_pos)
                        word_pos = []
                    else:
                        w += char
                        word_conf.append(char_conf)
                        word_pos.append(char_pos)
                    if i == len(positions) - 1:
                        words.append(w)
                        line_conf.append(word_conf)
                        line_pos.append(word_pos)
                wconfs = [(min(conf) + max(conf)) / 2 for conf in line_conf]
                lineconf = (min(wconfs) + max(wconfs)) / 2
                line.replace_TextEquiv_at(
                    0, TextEquivType(Unicode=linepred, conf=str(lineconf)))
                if self.maxlevel in ('word', 'glyph'):
                    box = bounding_box(line.get_Coords().points)
                    line.Word = []
                    for w_no, w in enumerate(words):
                        # Coords of word
                        wordbounding = (line_pos[w_no][0][0],
                                        line_pos[w_no][-1][-1])
                        word_bbox = [box[0] + wordbounding[0], box[1],
                                     box[2] + wordbounding[1], box[3]]
                        word_id = '%s_word%04d' % (line.id, w_no)
                        word = WordType(id=word_id,
                                        Coords=CoordsType(points_from_x0y0x1y1(word_bbox)))
                        line.add_Word(word)
                        word.add_TextEquiv(
                            TextEquivType(Unicode=w, conf=str(wconfs[w_no])))
                        if self.maxlevel == 'glyph':
                            for glyph_no, g in enumerate(w):
                                glyphbounding = (line_pos[w_no][glyph_no][0],
                                                 line_pos[w_no][glyph_no][-1])
                                glyph_bbox = [box[0] + glyphbounding[0], box[1],
                                              box[2] + glyphbounding[1], box[3]]
                                glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
                                glyph = GlyphType(id=glyph_id,
                                                  Coords=CoordsType(points_from_x0y0x1y1(glyph_bbox)))
                                word.add_Glyph(glyph)
                                glyph.add_TextEquiv(
                                    TextEquivType(Unicode=g,
                                                  conf=str(line_conf[w_no][glyph_no])))

def process(self):
    """
    Perform text recognition with Calamari on the workspace.

    If ``textequiv_level`` is ``word`` or ``glyph``, then additionally create
    word / glyph level segments by splitting at white space characters / glyph
    boundaries. In the case of ``glyph``, add all alternative character
    hypotheses down to the ``glyph_conf_cutoff`` confidence threshold.
    """
    log = getLogger('processor.CalamariRecognize')

    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))

        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector=self.features)

        for region in page.get_AllRegions(classes=['Text']):
            region_image, region_coords = self.workspace.image_from_segment(
                region, page_image, page_coords, feature_selector=self.features)

            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            line_images_np = []
            line_coordss = []
            for line in textlines:
                log.debug("Recognizing line '%s' in region '%s'",
                          line.id, region.id)

                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_coords, feature_selector=self.features)
                if ('binarized' not in line_coords['features'] and
                        'grayscale_normalized' not in line_coords['features'] and
                        self.network_input_channels == 1):
                    # We cannot use a feature selector for this since we don't
                    # know whether the model expects (has been trained on)
                    # binarized or grayscale images; but raw images are likely
                    # always inadequate:
                    log.warning("Using raw image for line '%s' in region '%s'",
                                line.id, region.id)

                line_image = line_image if all(line_image.size) else [[0]]
                line_image_np = np.array(line_image, dtype=np.uint8)
                line_images_np.append(line_image_np)
                line_coordss.append(line_coords)
            raw_results_all = self.predictor.predict_raw(
                line_images_np, progress_bar=False)

            for line, line_coords, raw_results in zip(
                    textlines, line_coordss, raw_results_all):
                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)

                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence,
                # while it does not do the same on prediction.positions.
                # Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [c for c in chars if c.char]
                    # XXX Note that omission probabilities are not normalized?!
                    chars = [c for c in chars
                             if c.probability >= self.parameter['glyph_conf_cutoff']]
                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    return list(itertools.dropwhile(
                        lambda p: _sort_chars(p)[0].char == " ", positions))

                def _drop_trailing_spaces(positions):
                    return list(reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(_sort_chars(p)[0].char for p in positions)
                if line_text != prediction.sentence:
                    log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
                                line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results", line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning("Line '%s' already contained word segmentation", line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word
                # positions from a. text segmentation and b. the glyph positions.
                # This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            if word:
                                yield word
                            word = c
                            spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    i = 0

                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end

                            polygon = polygon_from_x0y0x1y1(
                                [word_start, 0, word_end, line_image.height])
                            points = points_from_polygon(
                                coordinates_for_segment(polygon, None, line_coords))
                            # XXX Crop to line polygon?

                            word = WordType(id='%s_word%04d' % (line.id, word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(TextEquivType(Unicode=word_text))

                            if self.parameter['textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end

                                    polygon = polygon_from_x0y0x1y1(
                                        [glyph_start, 0, glyph_end, line_image.height])
                                    points = points_from_polygon(
                                        coordinates_for_segment(polygon, None, line_coords))

                                    glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no),
                                                      Coords=CoordsType(points))

                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1 # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(_sort_chars(p),
                                                                      start=char_index_start):
                                        glyph.add_TextEquiv(TextEquivType(
                                            Unicode=char.char,
                                            index=char_index,
                                            conf=char.probability))

                                    word.add_Glyph(glyph)

                            line.add_Word(word)
                            word_no += 1

                        i += word_length

        _page_update_higher_textequiv_levels('line', pcgts)

        # Add metadata about this operation and its runtime parameters:
        self.add_metadata(pcgts)

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts))