Example #1
0
def _split_word_at_glyph(word, glyph):
    """Split `word` into two new words, left and right of `glyph`.

    Returns a pair of new WordType objects covering the glyphs before
    and after `glyph`, respectively; the glyph itself is discarded.
    Coordinates are derived by cutting the word's bounding box at the
    glyph's horizontal extent. Language and text style are inherited
    from `word`. TextEquiv is deliberately not set (it will be
    overwritten by page_update_higher_textequiv_levels).
    """
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh_glyph = xywh_from_points(glyph.get_Coords().points)
    xywh_word = xywh_from_points(word.get_Coords().points)
    xywh_prev = xywh_word.copy()
    # left part spans from the word's left edge up to the glyph:
    xywh_prev.update({'w': xywh_glyph['x'] - xywh_word['x']})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    xywh_next = xywh_word.copy()
    # FIX: the right part must start at the glyph's *right* edge
    # (x + w, not x - w), and its width must exclude both the left
    # part and the glyph itself, so prev|glyph|next tile the word:
    xywh_next.update({
        'x': xywh_glyph['x'] + xywh_glyph['w'],
        'w': xywh_word['w'] - xywh_prev['w'] - xywh_glyph['w']
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    # the split glyph itself belongs to neither side:
    glyphs = word.get_Glyph()
    pos = glyphs.index(glyph)
    prev_.set_Glyph(glyphs[0:pos])
    next_.set_Glyph(glyphs[pos + 1:])
    # TextEquiv: will be overwritten by page_update_higher_textequiv_levels
    return prev_, next_
Example #2
0
 def _process_existing_glyphs(self, glyphs, tessapi):
     """Recognize each given PAGE glyph with Tesseract.

     Restricts recognition to each glyph's bounding box
     (PSM.SINGLE_CHAR) and annotates one TextEquiv per symbol choice,
     stopping when a choice's confidence falls too far below the best
     one or too many alternatives have been collected. Any existing
     TextEquiv results are replaced (with a warning).
     """
     for glyph in glyphs:
         # FIX: the message claimed to name a word but passed the glyph id
         log.debug("Recognizing glyph '%s'", glyph.id)
         glyph_xywh = xywh_from_points(glyph.get_Coords().points)
         # restrict the API to the glyph's bounding box:
         tessapi.SetRectangle(glyph_xywh['x'], glyph_xywh['y'],
                              glyph_xywh['w'], glyph_xywh['h'])
         tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
         if glyph.get_TextEquiv():
             log.warning("Glyph '%s' already contained text results",
                         glyph.id)
             glyph.set_TextEquiv([])
         #glyph_text = tessapi.GetUTF8Text().rstrip("\n\f")
         # confidences come as percentages; normalize to [0,1]:
         glyph_conf = tessapi.AllWordConfidences()
         glyph_conf = glyph_conf[0] / 100.0 if glyph_conf else 0.0
         #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
         result_it = tessapi.GetIterator()
         if not result_it or result_it.Empty(RIL.SYMBOL):
             log.error("No glyph here")
             continue
         choice_it = result_it.GetChoiceIterator()
         for (choice_no, choice) in enumerate(choice_it):
             alternative_text = choice.GetUTF8Text()
             alternative_conf = choice.Confidence() / 100
             #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
             # stop annotating alternatives once they fall too far behind
             # the best confidence, or there are already too many:
             if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF
                     or choice_no > CHOICE_THRESHOLD_NUM):
                 break
             # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
             glyph.add_TextEquiv(
                 TextEquivType(index=choice_no,
                               Unicode=alternative_text,
                               conf=alternative_conf))
Example #3
0
 def _process_existing_words(self, words, maxlevel, tessapi):
     """Recognize text in the given PAGE words via Tesseract.

     Restricts recognition to each word's bounding box
     (PSM.SINGLE_WORD). If `maxlevel` is 'word', annotates a single
     TextEquiv per word (replacing any existing one, with a warning)
     and stops there; otherwise descends to the glyph level, re-using
     an existing glyph layout if present, or segmenting glyphs
     internally via Recognize().
     """
     for word in words:
         log.debug("Recognizing text in word '%s'", word.id)
         word_xywh = xywh_from_points(word.get_Coords().points)
         # restrict the API to the word's bounding box:
         tessapi.SetRectangle(word_xywh['x'], word_xywh['y'],
                              word_xywh['w'], word_xywh['h'])
         tessapi.SetPageSegMode(PSM.SINGLE_WORD)
         if maxlevel == 'word':
             word_text = tessapi.GetUTF8Text().rstrip("\n\f")
             # confidences come as percentages; normalize to [0,1]:
             word_conf = tessapi.AllWordConfidences()
             word_conf = word_conf[0] / 100.0 if word_conf else 0.0
             if word.get_TextEquiv():
                 log.warning("Word '%s' already contained text results",
                             word.id)
                 word.set_TextEquiv([])
             # todo: consider WordFontAttributes (TextStyle) etc (if not word.get_TextStyle())
             word.add_TextEquiv(
                 TextEquivType(Unicode=word_text, conf=word_conf))
             continue  # next word (to avoid indentation below)
         ## glyph level:
         glyphs = word.get_Glyph()
         if glyphs:
             ## external glyph layout:
             log.warning(
                 "Word '%s' contains glyphs already, recognition might be suboptimal",
                 word.id)
             self._process_existing_glyphs(glyphs, tessapi)
         else:
             ## internal glyph layout:
             tessapi.Recognize()
             self._process_glyphs_in_word(word, tessapi.GetIterator())
Example #4
0
 def _process_lines(self, textlines, maxlevel, tessapi):
     """Recognize text in the given PAGE text lines via Tesseract.

     Restricts recognition to each line's bounding box
     (PSM.SINGLE_LINE). If `maxlevel` is 'line', annotates a single
     TextEquiv per line (replacing any existing one, with a warning)
     and stops there; otherwise descends to the word level, re-using
     an existing word layout if present, or segmenting words (and
     glyphs) internally via Recognize().
     """
     for line in textlines:
         log.debug("Recognizing text in line '%s'", line.id)
         line_xywh = xywh_from_points(line.get_Coords().points)
         #  log.debug("xywh: %s", line_xywh)
         # restrict the API to the line's bounding box:
         tessapi.SetRectangle(line_xywh['x'], line_xywh['y'],
                              line_xywh['w'], line_xywh['h'])
         tessapi.SetPageSegMode(
             PSM.SINGLE_LINE
         )  # RAW_LINE fails with Tesseract 3 models and is worse with Tesseract 4 models
         if maxlevel == 'line':
             line_text = tessapi.GetUTF8Text().rstrip("\n\f")
             # MeanTextConf yields a percentage; normalize to [0,1]:
             line_conf = tessapi.MeanTextConf(
             ) / 100.0  # iterator scores are arithmetic averages, too
             if line.get_TextEquiv():
                 log.warning("Line '%s' already contained text results",
                             line.id)
                 line.set_TextEquiv([])
             # todo: consider BlankBeforeWord, SetLineSeparator
             line.add_TextEquiv(
                 TextEquivType(Unicode=line_text, conf=line_conf))
             continue  # next line (to avoid indentation below)
         ## word, or glyph level:
         words = line.get_Word()
         if words:
             ## external word layout:
             log.warning(
                 "Line '%s' contains words already, recognition might be suboptimal",
                 line.id)
             self._process_existing_words(words, maxlevel, tessapi)
         else:
             ## internal word and glyph layout:
             tessapi.Recognize()
             self._process_words_in_line(line, maxlevel,
                                         tessapi.GetIterator())
Example #5
0
File: test_utils.py  Project: b2m/core
 def test_xywh_from_points(self):
     """A clockwise square of side 100 at (100, 100) yields the expected bbox dict."""
     expected = {'x': 100, 'y': 100, 'w': 100, 'h': 100}
     actual = xywh_from_points('100,100 200,100 200,200 100,200')
     self.assertEqual(actual, expected)
Example #6
0
 def process(self):
     """
     Performs word segmentation on each text line of all input files:
     crops each line image, lets Tesseract detect connected components
     at word level, and adds one Word element (with absolute
     coordinates) per component, writing the resulting PAGE files to
     the output file group.
     """
     with PyTessBaseAPI(
         psm=PSM.SINGLE_LINE,
         path=TESSDATA_PREFIX,
     ) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 for line in region.get_TextLine():
                     log.debug("Detecting words in line '%s'", line.id)
                     image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                     tessapi.SetImage(image)
                     # components are relative to the cropped line image,
                     # so shift them by the line's absolute position:
                     offset = xywh_from_points(line.get_Coords().points)
                     for (word_no, component) in enumerate(tessapi.GetComponentImages(RIL.WORD, True)):
                         word_id = '%s_word%04d' % (line.id, word_no)
                         word_xywh = component[1]
                         word_xywh['x'] += offset['x']
                         word_xywh['y'] += offset['y']
                         line.add_Word(WordType(id=word_id, Coords=CoordsType(points_from_xywh(word_xywh))))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
Example #7
0
def _split_word_at_space(word):
    """Split `word` into two new words at the first space of its text.

    The word's bounding box is divided horizontally in proportion to
    the position of the first space character in its (first) TextEquiv.
    Language and text style are inherited; the TextEquiv text is split
    around the space, keeping the original confidence on both halves.
    Returns the (left, right) pair of new WordType objects.
    """
    left = WordType(id=word.id + '_l')
    right = WordType(id=word.id + '_r')
    box = xywh_from_points(word.get_Coords().points)
    equiv = word.get_TextEquiv()[0]
    cut = equiv.Unicode.index(" ")
    # fraction of the text (and hence the width) left of the space:
    ratio = cut / len(equiv.Unicode)
    box_left = box.copy()
    box_left.update({'w': box['w'] * ratio})
    left.set_Coords(CoordsType(points=points_from_xywh(box_left)))
    box_right = box.copy()
    box_right.update({
        'x': box['x'] + box['w'] * ratio,
        'w': box['w'] * (1 - ratio)
    })
    right.set_Coords(CoordsType(points=points_from_xywh(box_right)))
    if word.get_language():
        left.set_language(word.get_language())
        right.set_language(word.get_language())
    if word.get_TextStyle():
        left.set_TextStyle(word.get_TextStyle())
        right.set_TextStyle(word.get_TextStyle())
    # Glyphs: irrelevant at this processing level
    left.set_TextEquiv([
        TextEquivType(Unicode=equiv.Unicode[0:cut], conf=equiv.conf)])
    right.set_TextEquiv([
        TextEquivType(Unicode=equiv.Unicode[cut + 1:], conf=equiv.conf)])
    return left, right
Example #8
0
 def process(self):
     """
     Performs the line segmentation: for each text region of all input
     files, crops the region image, lets Tesseract detect connected
     components at text-line level, and adds one TextLine element
     (with absolute coordinates) per component, writing the resulting
     PAGE files to the output file group.
     """
     with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(
                 self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 log.debug("Detecting lines in %s with tesseract",
                           region.id)
                 image = self.workspace.resolve_image_as_pil(
                     image_url,
                     polygon_from_points(region.get_Coords().points))
                 tessapi.SetImage(image)
                 # components are relative to the cropped region image,
                 # so shift them by the region's absolute position:
                 offset = xywh_from_points(region.get_Coords().points)
                 for (line_no, component) in enumerate(
                         tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                     line_id = '%s_line%04d' % (region.id, line_no)
                     line_xywh = component[1]
                     line_xywh['x'] += offset['x']
                     line_xywh['y'] += offset['y']
                     line_points = points_from_xywh(line_xywh)
                     region.add_TextLine(
                         TextLineType(id=line_id,
                                      Coords=CoordsType(line_points)))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 mimetype=MIMETYPE_PAGE,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 content=to_xml(pcgts).encode('utf-8'),
             )
Example #9
0
File: test_utils.py  Project: b2m/core
 def test_xywh_from_points_unordered(self):
     """Points given in no particular order still yield the enclosing bbox."""
     expected = {'x': 100, 'y': 100, 'w': 400, 'h': 400}
     actual = xywh_from_points('500,500 100,100 200,100 200,200 100,200')
     self.assertEqual(actual, expected)
Example #10
0
 def _process_regions(self, regions, maxlevel, tessapi):
     """Recognize text in the given PAGE text regions via Tesseract.

     If `maxlevel` is 'region', restricts recognition to each region's
     bounding box (PSM.SINGLE_BLOCK) and annotates a single TextEquiv
     per region (replacing any existing one, with a warning);
     otherwise descends into the region's text lines (if any).
     """
     for region in regions:
         log.debug("Recognizing text in region '%s'", region.id)
         # todo: determine if and how this can still be used for region classification:
         # result_it = tessapi.GetIterator()
         # if not result_it or result_it.Empty(RIL.BLOCK)
         # ptype = result_it.BlockType()
         # PT.UNKNOWN
         # PT.FLOWING_TEXT
         # PT.HEADING_TEXT
         # PT.PULLOUT_TEXT
         # PT.EQUATION
         # PT.TABLE
         # PT.VERTICAL_TEXT
         # PT.CAPTION_TEXT
         # PT.HORZ_LINE
         # PT.VERT_LINE
         # PT.NOISE
         # PT.COUNT
         # ...
         if maxlevel == 'region':
             region_xywh = xywh_from_points(region.get_Coords().points)
             # restrict the API to the region's bounding box:
             tessapi.SetRectangle(region_xywh['x'], region_xywh['y'],
                                  region_xywh['w'], region_xywh['h'])
             tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
             region_text = tessapi.GetUTF8Text().rstrip("\n\f")
             # MeanTextConf yields a percentage; normalize to [0,1]:
             region_conf = tessapi.MeanTextConf(
             ) / 100.0  # iterator scores are arithmetic averages, too
             if region.get_TextEquiv():
                 log.warning("Region '%s' already contained text results",
                             region.id)
                 region.set_TextEquiv([])
             # todo: consider SetParagraphSeparator
             region.add_TextEquiv(
                 TextEquivType(Unicode=region_text, conf=region_conf))
             continue  # next region (to avoid indentation below)
         ## line, word, or glyph level:
         textlines = region.get_TextLine()
         if not textlines:
             log.warning("Region '%s' contains no text lines", region.id)
         else:
             self._process_lines(textlines, maxlevel, tessapi)
Example #11
0
def _merge_words(prev_, next_):
    """Merge two words into a single new word.

    The merged word covers the union bounding box of both inputs,
    inherits language and text style from `prev_`, concatenates the
    glyph lists, and joins the (first) TextEquiv texts, multiplying
    their confidences when both are non-zero. Returns the new
    WordType object.
    """
    merged = WordType(id=prev_.id + '.' + next_.id)
    combined_points = (prev_.get_Coords().points + ' ' +
                       next_.get_Coords().points)
    merged.set_Coords(CoordsType(
        points=points_from_xywh(xywh_from_points(combined_points))))
    if prev_.get_language():
        merged.set_language(prev_.get_language())
    if prev_.get_TextStyle():
        merged.set_TextStyle(prev_.get_TextStyle())
    glyphs = prev_.get_Glyph() + next_.get_Glyph()
    if glyphs:
        merged.set_Glyph(glyphs)
    if prev_.get_TextEquiv():
        merged.set_TextEquiv(prev_.get_TextEquiv())
    else:
        # fall back to an empty text with full confidence:
        merged.set_TextEquiv([TextEquivType(Unicode='', conf=1.0)])
    if next_.get_TextEquiv():
        target = merged.get_TextEquiv()[0]
        source = next_.get_TextEquiv()[0]
        target.Unicode += source.Unicode
        if target.conf and source.conf:
            target.conf *= source.conf
    return merged
def image_from_segment(workspace, segment, parent_image, parent_xywh):
    """Extract a segment image from its parent's image.
    
    Given a PIL.Image of the parent, `parent_image`, and
    its absolute coordinates, `parent_xywh`, and a PAGE
    segment (TextRegion / TextLine / Word / Glyph) object
    logically contained in it, `segment`, extract its PIL.Image
    from AlternativeImage (if it exists), or via cropping from
    `parent_image`.
    
    When cropping, respect any orientation angle annotated for
    the parent (from parent-level deskewing) by compensating the
    segment coordinates in an inverse transformation (translation
    to center, rotation, re-translation).
    Also, mind the difference between annotated and actual size
    of the parent (usually from deskewing), by a respective offset
    into the image. Cropping uses a polygon mask (not just the
    rectangle).
    
    When cropping, respect any orientation angle annotated for
    the segment (from segment-level deskewing) by rotating the
    cropped image, respectively.
    
    If the resulting segment image is larger than the bounding box of
    `segment`, pass down the segment's box coordinates with an offset
    of half the width/height difference.
    
    Return the extracted image, and the absolute coordinates of
    the segment's bounding box (for passing down).
    """
    segment_xywh = xywh_from_points(segment.get_Coords().points)
    # NOTE(review): peeking at the instance dict presumably distinguishes
    # segment types that carry an orientation attribute (e.g. TextRegion)
    # from those that don't — confirm against the PAGE API classes
    if 'orientation' in segment.__dict__:
        # angle: PAGE orientation is defined clockwise,
        # whereas PIL/ndimage rotation is in mathematical direction:
        segment_xywh['angle'] = -(segment.get_orientation() or 0)
    alternative_image = segment.get_AlternativeImage()
    if alternative_image:
        # (e.g. from segment-level cropping, binarization, deskewing or despeckling)
        LOG.debug("Using AlternativeImage %d (%s) for segment '%s'",
                  len(alternative_image), alternative_image[-1].get_comments(),
                  segment.id)
        # always prefer the most recent AlternativeImage:
        segment_image = workspace.resolve_image_as_pil(
            alternative_image[-1].get_filename())
    else:
        # get polygon outline of segment relative to parent image:
        segment_polygon = coordinates_of_segment(segment, parent_image,
                                                 parent_xywh)
        # create a mask from the segment polygon:
        segment_image = image_from_polygon(parent_image, segment_polygon)
        # recrop into segment rectangle:
        segment_image = crop_image(
            segment_image,
            box=(segment_xywh['x'] - parent_xywh['x'],
                 segment_xywh['y'] - parent_xywh['y'],
                 segment_xywh['x'] - parent_xywh['x'] + segment_xywh['w'],
                 segment_xywh['y'] - parent_xywh['y'] + segment_xywh['h']))
        # note: We should mask overlapping neighbouring segments here,
        # but finding the right clipping rules can be difficult if operating
        # on the raw (non-binary) image data alone: for each intersection, it
        # must be decided which one of either segment or neighbour to assign,
        # e.g. an ImageRegion which properly contains our TextRegion should be
        # completely ignored, but an ImageRegion which is properly contained
        # in our TextRegion should be completely masked, while partial overlap
        # may be more difficult to decide. On the other hand, on the binary image,
        # we can use connected component analysis to mask foreground areas which
        # originate in the neighbouring regions. But that would introduce either
        # the assumption that the input has already been binarized, or a dependency
        # on some ad-hoc binarization method. Thus, it is preferable to use
        # a dedicated processor for this (which produces clipped AlternativeImage
        # or reduced polygon coordinates).
        if 'angle' in segment_xywh and segment_xywh['angle']:
            LOG.info("About to rotate segment '%s' by %.2f°", segment.id,
                     segment_xywh['angle'])
            segment_image = segment_image.rotate(
                segment_xywh['angle'],
                expand=True,
                #resample=Image.BILINEAR,
                fillcolor='white')
    # subtract offset from any increase in binary region size over source:
    segment_xywh['x'] -= round(0.5 *
                               max(0, segment_image.width - segment_xywh['w']))
    segment_xywh['y'] -= round(
        0.5 * max(0, segment_image.height - segment_xywh['h']))
    return segment_image, segment_xywh
def image_from_page(workspace, page, page_id):
    """Extract the Page image from the workspace.

    Given a PageType object, `page`, extract its PIL.Image from
    AlternativeImage if it exists. Otherwise extract the PIL.Image
    from imageFilename and crop it if a Border exists. Otherwise
    just return it.

    When cropping, respect any orientation angle annotated for
    the page (from page-level deskewing) by rotating the
    cropped image, respectively.

    If the resulting page image is larger than the bounding box of
    `page`, pass down the page's box coordinates with an offset of
    half the width/height difference.

    Return the extracted image, and the absolute coordinates of
    the page's bounding box / border (for passing down), and
    an OcrdExif instance associated with the original image.
    """
    page_image = workspace.resolve_image_as_pil(page.imageFilename)
    page_image_info = OcrdExif(page_image)
    # default to the full image extent if no border is annotated:
    page_xywh = {'x': 0, 'y': 0, 'w': page_image.width, 'h': page_image.height}
    # region angle: PAGE orientation is defined clockwise,
    # whereas PIL/ndimage rotation is in mathematical direction:
    page_xywh['angle'] = -(page.get_orientation() or 0)
    # FIXME: remove PrintSpace here as soon as GT abides by the PAGE standard:
    border = page.get_Border() or page.get_PrintSpace()
    if border:
        page_points = border.get_Coords().points
        # FIX: corrected typo ("explictly") in the log message
        LOG.debug("Using explicitly set page border '%s' for page '%s'",
                  page_points, page_id)
        page_xywh = xywh_from_points(page_points)

    alternative_image = page.get_AlternativeImage()
    if alternative_image:
        # (e.g. from page-level cropping, binarization, deskewing or despeckling)
        # assumes implicit cropping (i.e. page_xywh has been applied already)
        LOG.debug("Using AlternativeImage %d (%s) for page '%s'",
                  len(alternative_image), alternative_image[-1].get_comments(),
                  page_id)
        # always prefer the most recent AlternativeImage:
        page_image = workspace.resolve_image_as_pil(
            alternative_image[-1].get_filename())
    elif border:
        # get polygon outline of page border:
        page_polygon = np.array(polygon_from_points(page_points))
        # create a mask from the page polygon:
        page_image = image_from_polygon(page_image, page_polygon)
        # recrop into page rectangle:
        page_image = crop_image(page_image,
                                box=(page_xywh['x'], page_xywh['y'],
                                     page_xywh['x'] + page_xywh['w'],
                                     page_xywh['y'] + page_xywh['h']))
        if 'angle' in page_xywh and page_xywh['angle']:
            LOG.info("About to rotate page '%s' by %.2f°", page_id,
                     page_xywh['angle'])
            page_image = page_image.rotate(
                page_xywh['angle'],
                expand=True,
                #resample=Image.BILINEAR,
                fillcolor='white')
    # subtract offset from any increase in binary region size over source:
    page_xywh['x'] -= round(0.5 * max(0, page_image.width - page_xywh['w']))
    page_xywh['y'] -= round(0.5 * max(0, page_image.height - page_xywh['h']))
    return page_image, page_xywh, page_image_info