def _split_word_at_glyph(word, glyph):
    """Split `word` into two new words, left and right of `glyph`.

    Returns a pair ``(prev_, next_)`` of WordType elements whose Coords
    partition the word's bounding box around the glyph. Language,
    TextStyle and the glyph lists on either side are carried over.
    TextEquiv is left unset (it will be overwritten by
    page_update_higher_textequiv_levels).
    """
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh_glyph = xywh_from_points(glyph.get_Coords().points)
    xywh_word = xywh_from_points(word.get_Coords().points)
    # left part: from the word's start up to the glyph's left edge
    xywh_prev = xywh_word.copy()
    xywh_prev.update({'w': xywh_glyph['x'] - xywh_word['x']})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    # right part: from the glyph's right edge to the word's end
    # (BUGFIX: was `xywh_glyph['x'] - xywh_glyph['w']`, i.e. *left* of the
    # glyph, and the width did not subtract the glyph's own extent)
    xywh_next = xywh_word.copy()
    xywh_next.update({
        'x': xywh_glyph['x'] + xywh_glyph['w'],
        'w': xywh_word['w'] - xywh_prev['w'] - xywh_glyph['w']
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    # distribute the glyphs before/after the split glyph
    glyphs = word.get_Glyph()
    pos = glyphs.index(glyph)
    prev_.set_Glyph(glyphs[0:pos])
    next_.set_Glyph(glyphs[pos + 1:])
    # TextEquiv: will be overwritten by page_update_higher_textequiv_levels
    return prev_, next_
def _split_word_at_space(word):
    """Split `word` into two words at the first space of its TextEquiv.

    The bounding box is divided proportionally to the character position
    of the space; language and TextStyle are inherited, and the two text
    halves (excluding the space itself) become the new words' TextEquivs.
    """
    left = WordType(id=word.id + '_l')
    right = WordType(id=word.id + '_r')
    box = xywh_from_points(word.get_Coords().points)
    textequiv = word.get_TextEquiv()[0]
    text = textequiv.Unicode
    pos = text.index(" ")
    fract = pos / len(text)
    # left box keeps the origin and gets a proportionally scaled width
    box_left = box.copy()
    box_left['w'] = box['w'] * fract
    left.set_Coords(CoordsType(points=points_from_xywh(box_left)))
    # right box starts at the split point and takes the remaining width
    box_right = box.copy()
    box_right['x'] = box['x'] + box['w'] * fract
    box_right['w'] = box['w'] * (1 - fract)
    right.set_Coords(CoordsType(points=points_from_xywh(box_right)))
    language = word.get_language()
    if language:
        left.set_language(language)
        right.set_language(language)
    style = word.get_TextStyle()
    if style:
        left.set_TextStyle(style)
        right.set_TextStyle(style)
    # Glyphs: irrelevant at this processing level
    left.set_TextEquiv([TextEquivType(Unicode=text[0:pos], conf=textequiv.conf)])
    right.set_TextEquiv([TextEquivType(Unicode=text[pos + 1:], conf=textequiv.conf)])
    return left, right
def process(self):
    """Segment each input page into text lines with ocropy.

    For every input file: download and parse the PAGE-XML, binarize the
    page image, run ocropy's segmentation, wrap all detected lines in a
    single page-sized dummy TextRegion, and write the result back as a
    new PAGE-XML file in the output file group.
    """
    for (n, input_file) in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        downloaded_file = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", downloaded_file)
        pcgts = page_from_file(downloaded_file)
        page_width = pcgts.get_Page().get_imageWidth()
        page_height = pcgts.get_Page().get_imageHeight()
        # TODO binarized variant from get_AlternativeImage()
        image_url = pcgts.get_Page().imageFilename
        log.info("pcgts %s", pcgts)
        binary = ocrolib.read_image_binary(
            self.workspace.download_url(image_url))
        # ocropy convention: 1 = foreground (ink), so invert
        binary = 1 - binary
        # parameter 'scale' == 0 means "estimate from the image"
        scale = self.parameter['scale'] if self.parameter[
            'scale'] != 0 else psegutils.estimate_scale(binary)
        log.debug(binary)
        pseg = self.compute_segmentation(binary, scale)
        log.debug("pseg=%s", pseg)
        # TODO reading order / enumber
        # log.debug("finding reading order")
        # lines = psegutils.compute_lines(pseg, scale)
        # order = psegutils.reading_order([l.bounds for l in lines])
        # lsort = psegutils.topsort(order)
        regions = ocrolib.RegionExtractor()
        regions.setPageLines(pseg)
        # PAGE requires lines to live in a region; use one covering the
        # whole page since ocropy does not provide region structure here
        dummyRegion = TextRegionType(
            id="dummy",
            Coords=CoordsType(
                points="0,0 %s,0 %s,%s 0,%s" %
                (page_width, page_width, page_height, page_height)))
        pcgts.get_Page().add_TextRegion(dummyRegion)
        # label 0 is background, so line labels start at 1
        for lineno in range(1, regions.length()):
            log.debug("id=%s bbox=%s", regions.id(lineno),
                      regions.bbox(lineno))
            textline = TextLineType(
                id=concat_padded("line", lineno),
                Coords=CoordsType(
                    points=points_from_y0x0y1x1(regions.bbox(lineno))))
            dummyRegion.add_TextLine(textline)
        ID = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=ID,
                                file_grp=self.output_file_grp,
                                mimetype=MIMETYPE_PAGE,
                                local_filename="%s/%s.xml" % (self.output_file_grp, ID),
                                content=to_xml(pcgts))
def process(self):
    """Segment each text line of every input page into words with tesseract.

    For every TextLine of every TextRegion: crop the line image, let
    tesseract find word components (PSM.SINGLE_LINE), shift the word
    boxes back into page coordinates and add them as Word elements,
    then write the updated PAGE-XML to the output file group.
    """
    with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX,
    ) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            for region in pcgts.get_Page().get_TextRegion():
                for line in region.get_TextLine():
                    log.debug("Detecting words in line '%s'", line.id)
                    image = self.workspace.resolve_image_as_pil(
                        image_url, polygon_from_points(line.get_Coords().points))
                    tessapi.SetImage(image)
                    # word boxes are relative to the line crop — translate
                    # them by the line's own offset into page coordinates
                    offset = xywh_from_points(line.get_Coords().points)
                    for (word_no, component) in enumerate(tessapi.GetComponentImages(RIL.WORD, True)):
                        word_id = '%s_word%04d' % (line.id, word_no)
                        word_xywh = component[1]
                        word_xywh['x'] += offset['x']
                        word_xywh['y'] += offset['y']
                        line.add_Word(WordType(id=word_id, Coords=CoordsType(points_from_xywh(word_xywh))))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """Segment each text region of every input page into lines with tesseract.

    For every TextRegion: crop the region image, let tesseract find
    text line components, shift the line boxes back into page
    coordinates and add them as TextLine elements, then write the
    updated PAGE-XML to the output file group.
    """
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            for region in pcgts.get_Page().get_TextRegion():
                log.debug("Detecting lines in %s with tesseract", region.id)
                image = self.workspace.resolve_image_as_pil(
                    image_url, polygon_from_points(region.get_Coords().points))
                tessapi.SetImage(image)
                # line boxes are relative to the region crop — translate
                # them by the region's own offset into page coordinates
                offset = xywh_from_points(region.get_Coords().points)
                for (line_no, component) in enumerate(
                        tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                    line_id = '%s_line%04d' % (region.id, line_no)
                    line_xywh = component[1]
                    line_xywh['x'] += offset['x']
                    line_xywh['y'] += offset['y']
                    line_points = points_from_xywh(line_xywh)
                    region.add_TextLine(
                        TextLineType(id=line_id,
                                     Coords=CoordsType(line_points)))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def test_create_with_warning(self):
    """A self-intersecting region polygon is created but logs a warning
    and records the problem in ``region.warnings``."""
    ds = SeparatorRegionType(id='r6', Coords=CoordsType(points="239,1303 508,1303 899,1302 1626,1307 2441,1307 2444,1319 2414,1322 1664,1319 619,1317 235,1317 237,1302 235,1302"))
    with self.assertLogs('ocrd_browser.model.page_xml_renderer', level='WARNING') as log_watch:
        region = self.factory.create(ds)
    self.assertIsNotNone(region)
    self.assertRegex(log_watch.output[0], r'WARNING:ocrd_browser\.model\.page_xml_renderer\.RegionFactory:Page "DUMMY_0001" @ SeparatorRegion#r6 Self-intersection.+')
    self.assertRegex(region.warnings[0], r'Self-intersection.+')
def process(self):
    """Crop each input page to the extent of its content.

    Runs tesseract block detection on the full page image, computes the
    bounding box of all detected block polygons, stores it as the page
    Border, and writes the updated PAGE-XML to the output file group.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # print(self.input_file_grp)
        for (n, input_file) in enumerate(self.input_files):
            # print(input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
            log.debug("Cropping with tesseract")
            tessapi.SetImage(image)
            #
            # helper variables for saving the box coordinates
            #
            min_x = image.width
            min_y = image.height
            max_x = 0
            max_y = 0
            # iterate over all boxes and compare their extent
            # to the min and max values
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                points, index = points_from_xywh(component[1]), component[2]
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                for pair in points.split(' '):
                    x, y = (int(pair.split(',')[0]), int(pair.split(',')[1]))
                    # BUGFIX: these four comparisons must be independent;
                    # the previous if/elif chain skipped the max updates
                    # whenever a point also set a new minimum, yielding a
                    # too-small border.
                    if x < min_x:
                        min_x = x
                    if y < min_y:
                        min_y = y
                    if x > max_x:
                        max_x = x
                    if y > max_y:
                        max_y = y
                log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))
            #
            # set the identified page border
            #
            brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)))
            pcgts.get_Page().set_Border(brd)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def _process_region(self, it, region, rogroup, region_image, region_coords): LOG = getLogger('processor.TesserocrSegmentTable') # equivalent to GetComponentImages with raw_image=True, # (which would also give raw coordinates), # except we are also interested in the iterator's BlockType() here, index = 0 if rogroup: for elem in (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + rogroup.get_UnorderedGroupIndexed()): if elem.index >= index: index = elem.index + 1 while it and not it.Empty(RIL.BLOCK): bbox = it.BoundingBox(RIL.BLOCK) polygon = polygon_from_x0y0x1y1(bbox) polygon = coordinates_for_segment(polygon, region_image, region_coords) points = points_from_polygon(polygon) coords = CoordsType(points=points) # if xywh['w'] < 30 or xywh['h'] < 30: # LOG.info('Ignoring too small region: %s', points) # it.Next(RIL.BLOCK) # continue # # add the region reference in the reading order element # (but ignore non-text regions entirely) ID = region.id + "_%04d" % index subregion = TextRegionType(id=ID, Coords=coords, type=TextTypeSimpleType.PARAGRAPH) block_type = it.BlockType() if block_type == PT.FLOWING_TEXT: pass elif block_type == PT.HEADING_TEXT: subregion.set_type(TextTypeSimpleType.HEADING) elif block_type == PT.PULLOUT_TEXT: subregion.set_type(TextTypeSimpleType.FLOATING) elif block_type == PT.CAPTION_TEXT: subregion.set_type(TextTypeSimpleType.CAPTION) elif block_type == PT.VERTICAL_TEXT: subregion.set_orientation(90.0) else: it.Next(RIL.BLOCK) continue LOG.info("Detected cell '%s': %s (%s)", ID, points, membername(PT, block_type)) region.add_TextRegion(subregion) if rogroup: rogroup.add_RegionRefIndexed( RegionRefIndexedType(regionRef=ID, index=index)) # # iterator increment # index += 1 it.Next(RIL.BLOCK)
def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n): img_array = ocrolib.pil2array(page_image) # Check if image is RGB or not #FIXME: check not needed anymore? if len(img_array.shape) == 2: img_array = np.stack((img_array,)*3, axis=-1) img_array_bin = np.array( img_array > ocrolib.midrange(img_array), 'i') lineDetectH = [] lineDetectV = [] img_array_rr = self.remove_rular(img_array) textarea, img_array_rr_ta, height, width = self.detect_textarea( img_array_rr) colSeparator = int( width * self.parameter['colSeparator']) if len(textarea) > 1: textarea = self.crop_area( textarea, img_array_bin, img_array_rr_ta, colSeparator) if len(textarea) == 0: min_x, min_y, max_x, max_y = self.select_borderLine( img_array_rr, lineDetectH, lineDetectV) else: min_x, min_y, max_x, max_y = textarea[0] elif len(textarea) == 1 and (height*width*0.5 < (abs(textarea[0][2]-textarea[0][0]) * abs(textarea[0][3]-textarea[0][1]))): x1, y1, x2, y2 = textarea[0] x1 = x1-20 if x1 > 20 else 0 x2 = x2+20 if x2 < width-20 else width y1 = y1-40 if y1 > 40 else 0 y2 = y2+40 if y2 < height-40 else height min_x, min_y, max_x, max_y = textarea[0] else: min_x, min_y, max_x, max_y = self.select_borderLine( img_array_rr, lineDetectH, lineDetectV) border_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]] border_polygon = coordinates_for_segment(border_polygon, page_image, page_xywh) border_points = points_from_polygon(border_polygon) brd = BorderType(Coords=CoordsType(border_points)) page.set_Border(brd) page_image = crop_image(page_image, box=(min_x, min_y, max_x, max_y)) page_xywh['features'] += ',cropped' file_id = make_file_id(input_file, self.output_file_grp) file_path = self.workspace.save_image_file(page_image, file_id + '-IMG', page_id=page_id, file_grp=self.output_file_grp) page.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=page_xywh['features']))
def test_create(self):
    """A valid region polygon is created without any warnings being logged.

    assertLogs fails when nothing is logged, so a dummy ValueError is
    raised inside the context to emulate the (unavailable) assertNoLogs.
    """
    ds = SeparatorRegionType(id='r6', Coords=CoordsType(points="0,0 0,1 1,1 1,0"))
    try:
        with self.assertLogs('ocrd_browser.model.page_xml_renderer', level='WARNING') as log_watch:
            region = self.factory.create(ds)
            raise ValueError('Dummy instead of assertNoLogs')
    except ValueError:
        pass
    self.assertEqual(len(log_watch.output), 0, '{:d} Warning(s) logged "{:s}'.format(len(log_watch.output), '\n'.join(log_watch.output)))
    self.assertIsInstance(region, Region)
    self.assertGreater(region.poly.area, 0)
def _process_words_in_line(self, result_it, line, line_xywh):
    """Walk the tesseract result iterator over one text line, adding a
    Word element (with Coords, optional TextStyle, and TextEquiv) for
    every recognized word; recurse into glyphs when configured.

    :param result_it: tesseract ResultIterator positioned at RIL.WORD
    :param line: the parent TextLineType to attach words to
    :param line_xywh: coordinate metadata of the line crop (for conversion)
    """
    LOG = getLogger('processor.TesserocrRecognize')
    if not result_it or result_it.Empty(RIL.WORD):
        LOG.warning("No text in line '%s'", line.id)
        return
    # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD):
    word_no = 0
    while result_it and not result_it.Empty(RIL.WORD):
        word_id = '%s_word%04d' % (line.id, word_no)
        LOG.debug("Decoding text in word '%s'", word_id)
        bbox = result_it.BoundingBox(RIL.WORD)
        # convert to absolute coordinates:
        polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                          None, line_xywh) - self.parameter['padding']
        # clip to the parent line; None means no overlap at all
        polygon2 = polygon_for_parent(polygon, line)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        word = WordType(id=word_id, Coords=CoordsType(points))
        if polygon2 is None:
            # could happen due to rotation
            LOG.info('Ignoring extant word: %s', points)
        else:
            line.add_Word(word)
        # todo: determine if font attributes available for word level will work with LSTM models
        word_attributes = result_it.WordFontAttributes()
        if word_attributes:
            word_style = TextStyleType(
                fontSize=word_attributes['pointsize']
                if 'pointsize' in word_attributes else None,
                fontFamily=word_attributes['font_name']
                if 'font_name' in word_attributes else None,
                bold=word_attributes['bold']
                if 'bold' in word_attributes else None,
                italic=word_attributes['italic']
                if 'italic' in word_attributes else None,
                underlined=word_attributes['underlined']
                if 'underlined' in word_attributes else None,
                monospace=word_attributes['monospace']
                if 'monospace' in word_attributes else None,
                serif=word_attributes['serif']
                if 'serif' in word_attributes else None)
            word.set_TextStyle(word_style)  # (or somewhere in custom attribute?)
        # add word annotation unconditionally (i.e. even for glyph level):
        word.add_TextEquiv(TextEquivType(
            Unicode=result_it.GetUTF8Text(RIL.WORD),
            conf=result_it.Confidence(RIL.WORD)/100))
        if self.parameter['textequiv_level'] != 'word':
            self._process_glyphs_in_word(result_it, word, line_xywh)
        if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
            break
        else:
            word_no += 1
            result_it.Next(RIL.WORD)
def _process_words_in_line(self, line, maxlevel, result_it):
    """Walk the tesseract result iterator over one text line, adding a
    Word element (with Coords, optional TextStyle, and TextEquiv) per
    recognized word; recurse into glyphs unless maxlevel is 'word'.

    The loop is bounded by MAX_ELEMENTS as a safety net against a
    non-terminating iterator.
    """
    for word_no in range(
            0, MAX_ELEMENTS
    ):  # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD)
        if not result_it:
            log.error("No iterator at '%s'", line.id)
            break
        if result_it.Empty(RIL.WORD):
            log.debug("No word here")
            break
        word_id = '%s_word%04d' % (line.id, word_no)
        log.debug("Recognizing text in word '%s'", word_id)
        word_bbox = result_it.BoundingBox(RIL.WORD)
        word = WordType(id=word_id,
                        Coords=CoordsType(points_from_x0y0x1y1(word_bbox)))
        line.add_Word(word)
        # todo: determine if font attributes available for word level will work with LSTM models
        word_attributes = result_it.WordFontAttributes()
        if word_attributes:
            word_style = TextStyleType(
                fontSize=word_attributes['pointsize']
                if 'pointsize' in word_attributes else None,
                fontFamily=word_attributes['font_name']
                if 'font_name' in word_attributes else None,
                bold=None if 'bold' not in word_attributes else
                word_attributes['bold'],
                italic=None if 'italic' not in word_attributes else
                word_attributes['italic'],
                underlined=None if 'underlined' not in word_attributes else
                word_attributes['underlined'],
                monospace=None if 'monospace' not in word_attributes else
                word_attributes['monospace'],
                serif=None if 'serif' not in word_attributes else
                word_attributes['serif'])
            word.set_TextStyle(
                word_style)  # (or somewhere in custom attribute?)
        # add word annotation unconditionally (i.e. even for glyph level):
        word.add_TextEquiv(
            TextEquivType(Unicode=result_it.GetUTF8Text(RIL.WORD),
                          conf=result_it.Confidence(RIL.WORD) / 100))
        if maxlevel == 'word':
            pass
        else:
            self._process_glyphs_in_word(word, result_it)
        if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
            break
        else:
            result_it.Next(RIL.WORD)
def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, class_names):
    """Detect block regions on one page with a Mask-RCNN model.

    Runs ``mrcnn_model`` over the page raster; for every detected ROI
    crops and saves the sub-image as a derived image file and adds a
    TextRegionType (typed by ``class_names``) with the ROI's bounding
    box to ``page``.
    """
    img_array = ocrolib.pil2array(page_image)
    results = mrcnn_model.detect([img_array], verbose=1)
    r = results[0]
    page_xywh['features'] += ',blksegmented'
    for i in range(len(r['rois'])):
        width, height, _ = img_array.shape
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        # small post-processing in case of paragraph (class id 2) to not
        # cut last alphabets: pad the box by 5px/10px where there is room.
        # BUGFIX: was `(min_x - 5) > width` (never true inside the image)
        # and `min_x += 10` (shrinking instead of growing the far edge).
        if (min_x - 5) > 0 and r['class_ids'][i] == 2:
            min_x -= 5
        if (max_x + 10) < width and r['class_ids'][i] == 2:
            max_x += 10
        # this can be tested, provided whether we need previous comments or not?
        region_img = img_array[min_x:max_x,
                               min_y:max_y]  # extract from points and img_array
        region_img = ocrolib.array2pil(region_img)
        # derive a file ID from the input file, falling back to numbering
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)
        file_path = self.workspace.save_image_file(region_img,
                                                   file_id + "_" + str(i),
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        ai = AlternativeImageType(filename=file_path,
                                  comments=page_xywh['features'])
        coords = CoordsType(
            "%i,%i %i,%i %i,%i %i,%i" %
            (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))
        textregion = TextRegionType(Coords=coords,
                                    type_=class_names[r['class_ids'][i]])
        textregion.add_AlternativeImage(ai)
        page.add_TextRegion(textregion)
def add_region(region: RectSegment, index: int, region_type: str):
    """Convert `region` to page coordinates and attach it to the page as
    a Text-, Image- or Noise-Region (any unknown type maps to noise)."""
    from ocrd_utils import coordinates_for_segment, points_from_polygon
    poly = polygon_from_segment(region)
    poly = coordinates_for_segment(poly, page_image, page_coords)
    coords = CoordsType(points=points_from_polygon(poly))
    region_id = "region%04d" % index
    if region_type == "text":
        new_region = TextRegionType(id=region_id, Coords=coords)
        page.add_TextRegion(new_region)
    elif region_type == "image":
        new_region = ImageRegionType(id=region_id, Coords=coords)
        page.add_ImageRegion(new_region)
    else:
        new_region = NoiseRegionType(id=region_id, Coords=coords)
        page.add_NoiseRegion(new_region)
def process(self):
    """Segment each input page into regions with tesseract.

    Runs block detection on the full page image, adds one TextRegion
    per detected block plus a matching RegionRefIndexed entry in the
    page ReadingOrder, and writes the result to the output file group.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # print(self.input_file_grp)
        for (n, input_file) in enumerate(self.input_files):
            # print(input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
            log.debug("Detecting regions with tesseract")
            tessapi.SetImage(image)
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                points, index = points_from_xywh(component[1]), component[2]
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                # <pg:ReadingOrder> — create lazily on first region
                ro = pcgts.get_Page().get_ReadingOrder()
                if ro is None:
                    ro = ReadingOrderType()
                    pcgts.get_Page().set_ReadingOrder(ro)
                # <pg:OrderedGroup> — create lazily on first region
                og = ro.get_OrderedGroup()
                if og is None:
                    og = OrderedGroupType(id="reading-order")
                    ro.set_OrderedGroup(og)
                # <pg:RegionRefIndexed>
                og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index))
                #
                # text region
                #
                pcgts.get_Page().add_TextRegion(TextRegionType(id=ID, Coords=CoordsType(points=points)))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def add_region(region: Segment, index: int, type: str):
    """Attach `region` to the page as the PAGE region kind named by `type`
    ("text", "image", anything else -> noise).

    Coordinates are serialized in the PAGE points format
    ("x,y x,y ...") with vertices in polygon order (TL, TR, BR, BL).
    """
    indexed_id = "region%04d" % index
    # BUGFIX: was str([...]) of (x, y) tuples, which yields a Python list
    # repr — not a valid PAGE points string — and listed the corners in
    # crossing ("bowtie") order instead of around the rectangle.
    points = "%s,%s %s,%s %s,%s %s,%s" % (
        region.x_start, region.y_start,
        region.x_end, region.y_start,
        region.x_end, region.y_end,
        region.x_start, region.y_end,
    )
    coords = CoordsType(points=points)
    if type == "text":
        page.add_TextRegion(
            TextRegionType(id=indexed_id, Coords=coords))
    elif type == "image":
        page.add_ImageRegion(
            ImageRegionType(id=indexed_id, Coords=coords))
    else:
        page.add_NoiseRegion(
            NoiseRegionType(id=indexed_id, Coords=coords))
def _process_glyphs_in_word(self, result_it, word, word_xywh):
    """Walk the tesseract result iterator over one word, adding a Glyph
    element per symbol with TextEquiv alternatives from the choice
    iterator (cut off by confidence gap / count thresholds).

    :param result_it: tesseract ResultIterator positioned at RIL.SYMBOL
    :param word: the parent WordType to attach glyphs to
    :param word_xywh: coordinate metadata of the word crop (for conversion)
    """
    LOG = getLogger('processor.TesserocrRecognize')
    if not result_it or result_it.Empty(RIL.SYMBOL):
        LOG.debug("No glyph in word '%s'", word.id)
        return
    # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
    glyph_no = 0
    while result_it and not result_it.Empty(RIL.SYMBOL):
        glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
        LOG.debug("Decoding text in glyph '%s'", glyph_id)
        # glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice?
        glyph_conf = result_it.Confidence(RIL.SYMBOL)/100 # equals first choice?
        #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
        bbox = result_it.BoundingBox(RIL.SYMBOL)
        # convert to absolute coordinates:
        polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                          None, word_xywh) - self.parameter['padding']
        # clip to the parent word; None means no overlap at all
        polygon2 = polygon_for_parent(polygon, word)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        glyph = GlyphType(id=glyph_id, Coords=CoordsType(points))
        if polygon2 is None:
            # could happen due to rotation
            LOG.info('Ignoring extant glyph: %s', points)
        else:
            word.add_Glyph(glyph)
        choice_it = result_it.GetChoiceIterator()
        for (choice_no, choice) in enumerate(choice_it):
            alternative_text = choice.GetUTF8Text()
            alternative_conf = choice.Confidence()/100
            #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
            # stop once the alternative falls too far below the best
            # choice, or too many alternatives have been added
            if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                    choice_no > CHOICE_THRESHOLD_NUM):
                break
            # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
            glyph.add_TextEquiv(TextEquivType(index=choice_no,
                                              Unicode=alternative_text,
                                              conf=alternative_conf))
        if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
            break
        else:
            glyph_no += 1
            result_it.Next(RIL.SYMBOL)
def process(self):
    """Segment each input page into text lines with kraken.

    Runs kraken's ``segment`` (optionally followed by script detection)
    on the page image, wraps the resulting line boxes in a single dummy
    TextRegion, and writes the updated PAGE-XML to the output file group.
    """
    log = getLogger('processor.KrakenSegment')
    for (n, input_file) in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        downloaded_file = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", downloaded_file)
        pcgts = page_from_file(downloaded_file)
        # TODO binarized variant from get_AlternativeImage()
        image_url = pcgts.get_Page().imageFilename
        log.info("pcgts %s", pcgts)
        im = self.workspace.resolve_image_as_pil(image_url)
        log.info('Segmenting')
        log.info('Params %s', self.parameter)
        res = segment(im,
                      self.parameter['text_direction'],
                      self.parameter['scale'],
                      self.parameter['maxcolseps'],
                      self.parameter['black_colseps'])
        if self.parameter['script_detect']:
            res = detect_scripts(im, res)
        # PAGE requires lines to live in a region; kraken gives none
        dummyRegion = TextRegionType()
        pcgts.get_Page().add_TextRegion(dummyRegion)
        # print(res)
        for lineno, box in enumerate(res['boxes']):
            textline = TextLineType(
                id=concat_padded("line", lineno),
                Coords=CoordsType(points=points_from_x0y0x1y1(box)))
            dummyRegion.add_TextLine(textline)
        ID = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(self.output_file_grp,
                                pageId=input_file.pageId,
                                ID=ID,
                                mimetype=MIMETYPE_PAGE,
                                local_filename="%s/%s.xml" % (self.output_file_grp, ID),
                                content=to_xml(pcgts).encode('utf-8'))
def _process_page(page, page_image, page_xywh, pageId, file_id):
    """Segment one page image and annotate the detected regions on `page`.

    Runs the Segmentator over the page raster and adds one Text-, Image-
    or Noise-Region per detected polygon, depending on its predicted
    class (1 = text, 2 = image, anything else = noise).
    """
    settings = SegmentationSettings(debug=False, enable_preprocessing=False)
    # TODO: does this still need to be cropped or do we not need page_xywh?
    # Same for points below
    # page_image[page_xywh["x"]:page_xywh["w"], page_xywh["y"]:page_xywh["h"]]
    regions, classification = Segmentator(settings).segmentate_image(
        np.asarray(page_image))
    count = 0
    for region, prediction in zip(regions, classification):
        ID = "region%04d" % count
        # BUGFIX: was str(list(region.exterior.coords)), which produces a
        # Python list repr — not the PAGE points format "x,y x,y ...".
        points = " ".join("%i,%i" % (x, y)
                          for x, y in region.exterior.coords)
        coords = CoordsType(points=points)
        # FIXME: these are not all types in the model, also check if they match
        if prediction == 1:
            page.add_TextRegion(TextRegionType(id=ID, Coords=coords))
        elif prediction == 2:
            page.add_ImageRegion(ImageRegionType(id=ID, Coords=coords))
        else:
            page.add_NoiseRegion(NoiseRegionType(id=ID, Coords=coords))
        count += 1
def _process_glyphs_in_word(self, word, result_it):
    """Walk the tesseract result iterator over one word, adding a Glyph
    element per symbol with TextEquiv alternatives from the choice
    iterator (cut off by confidence gap / count thresholds).

    The loop is bounded by MAX_ELEMENTS as a safety net against a
    non-terminating iterator.
    """
    for glyph_no in range(
            0, MAX_ELEMENTS
    ):  # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL)
        if not result_it:
            log.error("No iterator at '%s'", word.id)
            break
        if result_it.Empty(RIL.SYMBOL):
            log.debug("No glyph here")
            break
        glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
        log.debug("Recognizing text in glyph '%s'", glyph_id)
        # glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice?
        glyph_conf = result_it.Confidence(
            RIL.SYMBOL) / 100  # equals first choice?
        #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
        glyph_bbox = result_it.BoundingBox(RIL.SYMBOL)
        glyph = GlyphType(id=glyph_id,
                          Coords=CoordsType(
                              points_from_x0y0x1y1(glyph_bbox)))
        word.add_Glyph(glyph)
        choice_it = result_it.GetChoiceIterator()
        for (choice_no, choice) in enumerate(choice_it):
            alternative_text = choice.GetUTF8Text()
            alternative_conf = choice.Confidence() / 100
            #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
            # stop once the alternative falls too far below the best
            # choice, or too many alternatives have been added
            if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF
                    or choice_no > CHOICE_THRESHOLD_NUM):
                break
            # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
            glyph.add_TextEquiv(
                TextEquivType(index=choice_no,
                              Unicode=alternative_text,
                              conf=alternative_conf))
        if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
            break
        else:
            result_it.Next(RIL.SYMBOL)
def _merge_words(prev_, next_):
    """Merge two adjacent words into one.

    The merged word spans both bounding boxes, inherits language and
    TextStyle from `prev_`, concatenates the glyph lists, and joins the
    first TextEquivs (concatenated Unicode, multiplied confidence).
    """
    merged = WordType(id=prev_.id + '.' + next_.id)
    # bounding box of both words: concatenate the point lists and take
    # the enclosing rectangle
    combined = prev_.get_Coords().points + ' ' + next_.get_Coords().points
    merged.set_Coords(
        CoordsType(points=points_from_xywh(xywh_from_points(combined))))
    if prev_.get_language():
        merged.set_language(prev_.get_language())
    if prev_.get_TextStyle():
        merged.set_TextStyle(prev_.get_TextStyle())
    if prev_.get_Glyph() or next_.get_Glyph():
        merged.set_Glyph(prev_.get_Glyph() + next_.get_Glyph())
    # seed the TextEquiv from prev_ (or an empty one), then fold in next_
    if prev_.get_TextEquiv():
        merged.set_TextEquiv(prev_.get_TextEquiv())
    else:
        merged.set_TextEquiv([TextEquivType(Unicode='', conf=1.0)])
    if next_.get_TextEquiv():
        head = merged.get_TextEquiv()[0]
        tail = next_.get_TextEquiv()[0]
        head.Unicode += tail.Unicode
        if head.conf and tail.conf:
            head.conf *= tail.conf
    return merged
def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id):
    """Set the identified page border, if valid, and save the cropped image.

    :param bounds: (left, top, right, bottom) extent of the page content
        in the coordinates of `page_image`; rejected if degenerate.
    """
    LOG = getLogger('processor.TesserocrCrop')
    left, top, right, bottom = bounds
    if left >= right or top >= bottom:
        LOG.error("Cannot find valid extent for page '%s'", page_id)
        return
    padding = self.parameter['padding']
    # add padding (clamped to the image dimensions):
    left = max(left - padding, 0)
    right = min(right + padding, page_image.width)
    top = max(top - padding, 0)
    bottom = min(bottom + padding, page_image.height)
    LOG.info("Padded page border: %i:%i,%i:%i", left, right, top, bottom)
    polygon = polygon_from_bbox(left, top, right, bottom)
    polygon = coordinates_for_segment(polygon, page_image, page_xywh)
    # clip to the page; None means the border lies outside entirely
    polygon = polygon_for_parent(polygon, page)
    if polygon is None:
        LOG.error("Ignoring extant border")
        return
    border = BorderType(Coords=CoordsType(
        points_from_polygon(polygon)))
    # intersection with parent could have changed bbox,
    # so recalculate:
    bbox = bbox_from_polygon(coordinates_of_segment(border, page_image, page_xywh))
    # update PAGE (annotate border):
    page.set_Border(border)
    # update METS (add the image file):
    page_image = crop_image(page_image, box=bbox)
    page_xywh['features'] += ',cropped'
    file_path = self.workspace.save_image_file(
        page_image, file_id + '.IMG-CROP',
        page_id=page_id, file_grp=self.output_file_grp)
    # update PAGE (reference the image file):
    page.add_AlternativeImage(AlternativeImageType(
        filename=file_path, comments=page_xywh['features']))
def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, class_names, mask):
    """Post-process Mask-RCNN detections on one page into PAGE regions.

    Pipeline: (1) optionally refuse to overwrite existing TextRegions,
    (2) run the model, (3) if a segmentation `mask` is given, grow the
    detected boxes to absorb leftover mask pixels, (4) resolve vertical
    overlaps between boxes of the same class, (5) derive a reading order
    from the box coordinates, (6) clip each box against the page Border
    and emit Image-/Table-/Graphic-/Text-Regions with readingOrder
    annotations and cropped derived images.

    NOTE(review): assumes r['rois'] rows follow the Mask-RCNN
    (y0, x0, y1, x1) convention even though locals are named
    min_x/min_y etc. — verify against the model wrapper.
    """
    LOG = getLogger('OcrdAnybaseocrBlockSegmenter')
    # check for existing text regions and whether to overwrite them
    border = None
    if page.get_TextRegion():
        if self.parameter['overwrite']:
            LOG.info('removing existing TextRegions in page "%s"', page_id)
            page.set_TextRegion([])
        else:
            LOG.warning('keeping existing TextRegions in page "%s"', page_id)
            return
    # check if border exists
    if page.get_Border():
        border_coords = page.get_Border().get_Coords()
        border_points = polygon_from_points(border_coords.get_points())
        border = Polygon(border_points)
        # page_image, page_xy = self.workspace.image_from_segment(page.get_Border(), page_image, page_xywh)
    img_array = ocrolib.pil2array(page_image)
    # NOTE(review): debug artifact — writes into the current directory
    page_image.save('./checkthis.png')
    # ensure a 3-channel image for the model
    if len(img_array.shape) <= 2:
        img_array = np.stack((img_array, ) * 3, axis=-1)
    results = mrcnn_model.detect([img_array], verbose=1)
    r = results[0]
    th = self.parameter['th']
    # check for existing semgentation mask
    # this code executes only when use_deeplr is set to True in ocrd-tool.json file
    if mask:
        mask = ocrolib.pil2array(mask)
        mask = mask // 255
        mask = 1 - mask
        # multiply all the bounding box part with 2
        # (label each detected box region i with value i+2; leftover
        # foreground keeps value 1)
        for i in range(len(r['rois'])):
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]
            mask[min_x:max_x, min_y:max_y] *= i + 2
        cv2.imwrite('mask_check.png', mask * (255 / (len(r['rois']) + 2)))
        # check for left over pixels and add them to the bounding boxes
        pixel_added = True
        while pixel_added:
            pixel_added = False
            left_over = np.where(mask == 1)
            for x, y in zip(left_over[0], left_over[1]):
                # look at a (2*th)^2 neighbourhood for an already-labelled box
                local_mask = mask[x - th:x + th, y - th:y + th]
                candidates = np.where(local_mask > 1)
                candidates = [k for k in zip(candidates[0], candidates[1])]
                if len(candidates) > 0:
                    pixel_added = True
                    # find closest pixel with x>1
                    candidates.sort(key=lambda j: np.sqrt((j[0] - th)**2 + (j[1] - th)**2))
                    index = local_mask[candidates[0]] - 2
                    # add pixel to mask/bbox
                    # x,y to bbox with index
                    if x < r['rois'][index][0]:
                        r['rois'][index][0] = x
                    elif x > r['rois'][index][2]:
                        r['rois'][index][2] = x
                    if y < r['rois'][index][1]:
                        r['rois'][index][1] = y
                    elif y > r['rois'][index][3]:
                        r['rois'][index][3] = y
                    # update the mask
                    mask[x, y] = index + 2
    # resolving overlapping problem
    bbox_dict = {}  # to check any overlapping bbox
    class_id_check = []
    for i in range(len(r['rois'])):
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        region_bbox = [min_y, min_x, max_y, max_x]
        # shrink this box away from any earlier box it overlaps
        # vertically (class id 5 is exempt)
        for key in bbox_dict:
            for bbox in bbox_dict[key]:
                # checking for ymax case with vertical overlapping
                # along with y, check both for xmax and xmin
                if (region_bbox[3] <= bbox[3] and region_bbox[3] >= bbox[1] and
                        ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2]) or
                         (region_bbox[2] >= bbox[0] and region_bbox[2] <= bbox[2]) or
                         (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2])) and
                        r['class_ids'][i] != 5):
                    r['rois'][i][2] = bbox[1] - 1
                # checking for ymin now
                # along with y, check both for xmax and xmin
                if (region_bbox[1] <= bbox[3] and region_bbox[1] >= bbox[1] and
                        ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2]) or
                         (region_bbox[2] >= bbox[0] and region_bbox[2] <= bbox[2]) or
                         (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2])) and
                        r['class_ids'][i] != 5):
                    r['rois'][i][0] = bbox[3] + 1
        if r['class_ids'][i] not in class_id_check:
            bbox_dict[r['class_ids'][i]] = []
            class_id_check.append(r['class_ids'][i])
        bbox_dict[r['class_ids'][i]].append(region_bbox)
    # resolving overlapping problem code
    # define reading order on basis of coordinates
    reading_order = []
    for i in range(len(r['rois'])):
        width, height, _ = img_array.shape
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        # NOTE(review): `(min_y - 5) > width` can never hold for an
        # in-image coordinate, and the second branch modifies min_y
        # where max_y seems intended — same pattern repeated below;
        # confirm against the original ocrd_anybaseocr sources
        if (min_y - 5) > width and r['class_ids'][i] == 2:
            min_y -= 5
        if (max_y + 10) < width and r['class_ids'][i] == 2:
            min_y += 10
        reading_order.append((min_y, min_x, max_y, max_x))
    reading_order = sorted(reading_order,
                           key=lambda reading_order: (reading_order[1], reading_order[0]))
    for i in range(len(reading_order)):
        min_y, min_x, max_y, max_x = reading_order[i]
        # NOTE(review): min_y is zeroed before building the comparison
        # polygon — presumably to compare full-height columns; verify
        min_y = 0
        i_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]])
        for j in range(i + 1, len(reading_order)):
            min_y, min_x, max_y, max_x = reading_order[j]
            j_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]])
            inter = i_poly.intersection(j_poly)
            if inter:
                # demote the intersecting entry past j
                reading_order.insert(j + 1, reading_order[i])
                del reading_order[i]
    # Creating Reading Order object in PageXML
    order_group = OrderedGroupType(caption="Regions reading order", id=page_id)
    for i in range(len(r['rois'])):
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        if (min_y - 5) > width and r['class_ids'][i] == 2:
            min_y -= 5
        if (max_y + 10) < width and r['class_ids'][i] == 2:
            min_y += 10
        region_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]
        # skip regions entirely outside the page border
        if border:
            cut_region_polygon = border.intersection(
                Polygon(region_polygon))
            if cut_region_polygon.is_empty:
                continue
        else:
            cut_region_polygon = Polygon(region_polygon)
        order_index = reading_order.index((min_y, min_x, max_y, max_x))
        region_id = '%s_region%04d' % (page_id, i)
        regionRefIndex = RegionRefIndexedType(index=order_index, regionRef=region_id)
        order_group.add_RegionRefIndexed(regionRefIndex)
    reading_order_object = ReadingOrderType()
    reading_order_object.set_OrderedGroup(order_group)
    page.set_ReadingOrder(reading_order_object)
    for i in range(len(r['rois'])):
        width, height, _ = img_array.shape
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        if (min_y - 5) > width and r['class_ids'][i] == 2:
            min_y -= 5
        if (max_y + 10) < width and r['class_ids'][i] == 2:
            min_y += 10
        # one change here to resolve flipped coordinates
        region_polygon = [[min_y, min_x], [max_y, min_x],
                          [max_y, max_x], [min_y, max_x]]
        # NOTE(review): unlike the loop above, this intersection is not
        # guarded by `if border:` — raises AttributeError when the page
        # has no Border; verify
        cut_region_polygon = border.intersection(Polygon(region_polygon))
        if cut_region_polygon.is_empty:
            continue
        # exterior ring as (x, y) pairs, dropping the closing vertex
        cut_region_polygon = [
            j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]),
                           list(cut_region_polygon.exterior.coords.xy[1]))
        ][:-1]
        # checking whether coordinates are flipped
        region_polygon = coordinates_for_segment(cut_region_polygon, page_image, page_xywh)
        region_points = points_from_polygon(region_polygon)
        read_order = reading_order.index((min_y, min_x, max_y, max_x))
        # this can be tested, provided whether we need previous comments or not?
        # resolving overlapping problem
        region_img = img_array[min_x:max_x,
                               min_y:max_y]  # extract from points and img_array
        region_img = ocrolib.array2pil(region_img)
        file_id = make_file_id(input_file, self.output_file_grp)
        file_path = self.workspace.save_image_file(
            region_img,
            file_id + "_" + str(i),
            page_id=page_id,
            file_grp=self.output_file_grp)
        # ai = AlternativeImageType(filename=file_path, comments=page_xywh['features'])
        region_id = '%s_region%04d' % (page_id, i)
        coords = CoordsType(region_points)
        # incase of imageRegion
        if r['class_ids'][i] == 15:
            image_region = ImageRegionType(
                custom='readingOrder {index:' + str(read_order) + ';}',
                id=region_id,
                Coords=coords,
                type_=class_names[r['class_ids'][i]])
            # image_region.add_AlternativeImage(ai)
            page.add_ImageRegion(image_region)
            continue
        if r['class_ids'][i] == 16:
            table_region = TableRegionType(
                custom='readingOrder {index:' + str(read_order) + ';}',
                id=region_id,
                Coords=coords,
                type_=class_names[r['class_ids'][i]])
            # table_region.add_AlternativeImage(ai)
            page.add_TableRegion(table_region)
            continue
        if r['class_ids'][i] == 17:
            graphic_region = GraphicRegionType(
                custom='readingOrder {index:' + str(read_order) + ';}',
                id=region_id,
                Coords=coords,
                type_=class_names[r['class_ids'][i]])
            # graphic_region.add_AlternativeImage(ai)
            page.add_GraphicRegion(graphic_region)
            continue
        textregion = TextRegionType(custom='readingOrder {index:' +
                                    str(read_order) + ';}',
                                    id=region_id,
                                    Coords=coords,
                                    type_=class_names[r['class_ids'][i]])
        # textregion.add_AlternativeImage(ai)
        #border = page.get_Border()
        # if border:
        #     border.add_TextRegion(textregion)
        # else:
        page.add_TextRegion(textregion)
def process(self):
    """Perform OCR with Calamari on the workspace, one line image at a time.

    For each input PAGE file: extract each TextLine's image, run the
    Calamari predictor on it, vote over the ensemble folds, and write the
    voted text back as line-level TextEquiv. If ``textequiv_level`` is
    ``word`` or ``glyph``, additionally derive Word (and Glyph) segments
    from character positions. Finally serialize the result to the output
    file group.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    self._init_calamari()
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)
        for region in pcgts.get_Page().get_TextRegion():
            region_image, region_xywh = self.workspace.image_from_segment(
                region, page_image, page_xywh)
            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            for (line_no, line) in enumerate(textlines):
                log.debug("Recognizing line '%s' in region '%s'",
                          line.id, region.id)
                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_xywh)
                line_image_np = np.array(line_image, dtype=np.uint8)
                # predict_raw yields one result list per input image; we pass
                # a single image, so take the first (and only) element
                raw_results = list(
                    self.predictor.predict_raw([line_image_np],
                                               progress_bar=False))[0]
                # tag each fold's prediction so the voter can distinguish them
                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)
                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                # on prediction.positions. Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
                    chars = [
                        c for c in chars
                        if c.probability >= self.parameter['glyph_conf_cutoff']
                    ]
                    # most probable alternative first
                    chars = sorted(chars, key=lambda k: k.probability,
                                   reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    # strip positions whose best char is a space, from the left
                    return list(
                        itertools.dropwhile(
                            lambda p: _sort_chars(p)[0].char == " ",
                            positions))

                def _drop_trailing_spaces(positions):
                    # reuse the leading-space logic on the reversed sequence
                    return list(
                        reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    # collapse runs of space positions into a single one
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(
                    _sort_chars(p)[0].char for p in positions)
                # sanity check: our reconstruction should match Calamari's
                # own post-processed sentence
                if line_text != prediction.sentence:
                    log.warning(
                        "Our own line text is not the same as Calamari's: '%s' != '%s'",
                        line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results",
                                line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning(
                        "Line '%s' already contained word segmentation",
                        line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv(
                    [TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            # character class changed: flush current word
                            if word:
                                yield word
                            word = c
                            spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    i = 0  # running index into positions
                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        # space runs are skipped (no Word element), but still
                        # advance the position index below
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end
                            # word box spans the full line height
                            polygon = polygon_from_x0y0x1y1([
                                word_start, 0, word_end, line_image.height
                            ])
                            points = points_from_polygon(
                                coordinates_for_segment(
                                    polygon, None, line_coords))
                            # XXX Crop to line polygon?
                            word = WordType(id='%s_word%04d' % (line.id,
                                                                word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(
                                TextEquivType(Unicode=word_text))
                            if self.parameter[
                                    'textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(
                                        word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end
                                    polygon = polygon_from_x0y0x1y1([
                                        glyph_start, 0, glyph_end,
                                        line_image.height
                                    ])
                                    points = points_from_polygon(
                                        coordinates_for_segment(
                                            polygon, None, line_coords))
                                    glyph = GlyphType(
                                        id='%s_glyph%04d' % (word.id,
                                                             glyph_no),
                                        Coords=CoordsType(points))
                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(
                                            _sort_chars(p),
                                            start=char_index_start):
                                        glyph.add_TextEquiv(
                                            TextEquivType(
                                                Unicode=char.char,
                                                index=char_index,
                                                conf=char.probability))
                                    word.add_Glyph(glyph)
                            line.add_Word(word)
                            word_no += 1
                        i += word_length

        # propagate line texts upwards to region/page level
        _page_update_higher_textequiv_levels('line', pcgts)

        # Add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter.keys()
                               ])
                ]))
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Performs word segmentation with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the textline level,
    and remove any existing Word elements (unless ``overwrite_words``
    is False).

    Set up Tesseract to detect words, and add each one to the line
    at the detected coordinates.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrSegmentWord')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    overwrite_words = self.parameter['overwrite_words']

    # SINGLE_LINE: each image handed to Tesseract is exactly one text line
    with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # DPI resolution: parameter override > image metadata > let
            # Tesseract estimate from segmentation (dpi == 0)
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override",
                         page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert dots-per-cm to dots-per-inch
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data",
                         page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation",
                         page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            for region in page.get_TextRegion():
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                for line in region.get_TextLine():
                    if line.get_Word():
                        if overwrite_words:
                            LOG.info('removing existing Words in line "%s"', line.id)
                            line.set_Word([])
                        else:
                            LOG.warning('keeping existing Words in line "%s"', line.id)
                    LOG.debug("Detecting words in line '%s'", line.id)
                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_coords)
                    tessapi.SetImage(line_image)
                    for word_no, component in enumerate(
                            tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)):
                        word_id = '%s_word%04d' % (line.id, word_no)
                        # component[1] is the word's bounding box dict
                        word_polygon = polygon_from_xywh(component[1])
                        word_polygon = coordinates_for_segment(
                            word_polygon, line_image, line_coords)
                        # clip to the parent line's polygon, if possible
                        word_polygon2 = polygon_for_parent(word_polygon, line)
                        if word_polygon2 is not None:
                            word_polygon = word_polygon2
                        word_points = points_from_polygon(word_polygon)
                        if word_polygon2 is None:
                            # could happen due to rotation
                            LOG.info('Ignoring extant word: %s', word_points)
                            continue
                        line.add_Word(WordType(
                            id=word_id, Coords=CoordsType(word_points)))
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts))
def convert(cocofile, directory):
    """Convert MS-COCO JSON to METS/PAGE XML files.

    Load JSON ``cocofile`` (in MS-COCO format) and chdir to ``directory``
    (which it refers to).

    Start a METS file mets.xml with references to the image files (under
    fileGrp ``OCR-D-IMG``) and their corresponding PAGE-XML annotations
    (under fileGrp ``OCR-D-GT-SEG-BLOCK``), as parsed from ``cocofile``
    and written using the same basename.
    """
    resolver = Resolver()
    with pushd_popd(directory):
        workspace = resolver.workspace_from_nothing('.')
        # https://github.com/ibm-aur-nlp/PubLayNet
        workspace.mets.unique_identifier = 'ocrd_PubLayNet_' + directory
        coco = json.load(cocofile)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']),
                 len(coco['categories']))
        # map category id -> name
        categories = dict()
        for cat in coco['categories']:
            categories[cat['id']] = cat['name']
        # map image id -> image record
        images = dict()
        for image in coco['images']:
            images[image['id']] = image
        # attach each annotation to its image under a 'regions' key
        for annotation in coco['annotations']:
            image = images[annotation['image_id']]
            regions = image.setdefault('regions', list())
            regions.append(annotation)
        del coco  # free the (potentially large) parsed JSON
        LOG.info('Parsing annotations into PAGE-XML')
        for image in images.values():
            page_id = 'p' + str(image['id'])
            file_base, file_ext = os.path.splitext(image['file_name'])
            filename = file_base + '.xml'
            image_file = workspace.add_file('OCR-D-IMG',
                                            ID='OCR-D-IMG_' + page_id,
                                            pageId=page_id,
                                            mimetype=EXT_TO_MIME[file_ext],
                                            local_filename=image['file_name'])
            LOG.info('Added page %s file %s of type %s',
                     image_file.pageId, image_file.local_filename,
                     image_file.mimetype)
            pcgts = page_from_image(image_file)
            pcgts.set_pcGtsId(page_id)
            page = pcgts.get_Page()
            # the COCO metadata must agree with the actual image size
            assert page.imageWidth == image['width']
            assert page.imageHeight == image['height']
            for region in image['regions']:
                # flat [x0,y0,x1,y1,...] list -> (N,2) point array
                polygon = np.array(region['segmentation'])
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                region_id = 'r' + str(region['id'])
                # map PubLayNet categories onto PAGE region types
                if category == 'text':
                    region_obj = TextRegionType(
                        id=region_id, Coords=coords,
                        type_=TextTypeSimpleType.PARAGRAPH)
                    page.add_TextRegion(region_obj)
                elif category == 'title':
                    region_obj = TextRegionType(
                        id=region_id, Coords=coords,
                        type_=TextTypeSimpleType.HEADING)  # CAPTION?
                    page.add_TextRegion(region_obj)
                elif category == 'list':
                    region_obj = TextRegionType(
                        id=region_id, Coords=coords,
                        type_=TextTypeSimpleType.LISTLABEL)  # OTHER?
                    page.add_TextRegion(region_obj)
                elif category == 'table':
                    region_obj = TableRegionType(id=region_id, Coords=coords)
                    page.add_TableRegion(region_obj)
                elif category == 'figure':
                    region_obj = ImageRegionType(id=region_id, Coords=coords)
                    page.add_ImageRegion(region_obj)
                else:
                    raise Exception('unknown image category: %s' % category)
            page_file = workspace.add_file('OCR-D-GT-SEG-BLOCK',
                                           ID='OCR-D-GT-SEG-BLOCK_' + page_id,
                                           pageId=page_id,
                                           mimetype=MIMETYPE_PAGE,
                                           local_filename=filename,
                                           content=to_xml(pcgts))
            LOG.info('Added page %s file %s with %d regions',
                     page_file.pageId, page_file.local_filename,
                     len(image['regions']))
        LOG.info('All done')
        workspace.save_mets()
def process(self):
    """Detect the page's text area and set it as the PAGE Border.

    For each input file: binarize, remove the measuring rule, detect
    candidate text areas (falling back to border-line detection when
    that fails), and store the resulting bounding box as a Border
    element in the serialized output PAGE file.
    """
    for (n, input_file) in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        fname = pcgts.get_Page().imageFilename
        img = self.workspace.resolve_image_as_pil(fname)
        #fname = str(fname)
        print("Process file: ", fname)
        base, _ = ocrolib.allsplitext(fname)
        img_array = ocrolib.pil2array(img)
        # binarize by thresholding at the mid-range intensity
        img_array_bin = np.array(
            img_array > ocrolib.midrange(img_array), 'i')
        lineDetectH = []
        lineDetectV = []
        # remove the measuring rule often placed next to the page
        img_array_rr = self.remove_rular(img_array)
        textarea, img_array_rr_ta, height, width = self.detect_textarea(
            img_array_rr)
        # NOTE: overwrites the parameter in place — scales the relative
        # colSeparator fraction to absolute pixels for this page
        self.parameter['colSeparator'] = int(
            width * self.parameter['colSeparator'])
        if len(textarea) > 1:
            # multiple candidates: crop to the most plausible one
            textarea = self.crop_area(textarea, img_array_bin,
                                      img_array_rr_ta)
            if len(textarea) == 0:
                # cropping rejected everything: fall back to line detection
                min_x, min_y, max_x, max_y = self.select_borderLine(
                    img_array_rr, lineDetectH, lineDetectV)
            else:
                min_x, min_y, max_x, max_y = textarea[0]
        elif len(textarea) == 1 and (
                height * width * 0.5 <
                (abs(textarea[0][2] - textarea[0][0]) *
                 abs(textarea[0][3] - textarea[0][1]))):
            # single candidate covering at least half the page: accept it
            # (x1..y2 below compute a padded box but are not used further)
            x1, y1, x2, y2 = textarea[0]
            x1 = x1 - 20 if x1 > 20 else 0
            x2 = x2 + 20 if x2 < width - 20 else width
            y1 = y1 - 40 if y1 > 40 else 0
            y2 = y2 + 40 if y2 < height - 40 else height
            #self.save_pf(base, [x1, y1, x2, y2])
            min_x, min_y, max_x, max_y = textarea[0]
        else:
            # no usable text area: fall back to border-line detection
            min_x, min_y, max_x, max_y = self.select_borderLine(
                img_array_rr, lineDetectH, lineDetectV)
        # clockwise rectangle from the bounding box
        brd = BorderType(Coords=CoordsType(
            "%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y,
                                         max_x, max_y, min_x, max_y)))
        pcgts.get_Page().set_Border(brd)
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process_lines(self, textlines, predfiles, fgrp, regionid):
    """Attach text results from prediction JSON files to PAGE text lines.

    For each line in ``textlines``, look for a matching prediction file
    (named ``<fgrp>-<regionid>-<line.id>.json`` under ``self.root``),
    read the predicted sentence plus per-character confidences and
    positions, and write them back as TextEquiv on the line. Depending on
    ``self.maxlevel``, also (re)create Word and Glyph segments whose
    bounding boxes are derived from the line's bounding box plus the
    characters' global start/end offsets.

    :param textlines: iterable of PAGE TextLineType elements to annotate
    :param predfiles: basenames (without extension) of available
        prediction files
    :param fgrp: file group prefix used in the prediction file names
    :param regionid: id of the parent region, also part of the file names
    """
    for line in textlines:
        for file in predfiles:
            if file == '-'.join([fgrp, regionid, line.id]):
                self.log.info("Processing text in line '%s'", line.id)
                filepath = self.root + '/' + file + '.json'
                with open(filepath) as f:
                    data = json.load(f)
                linepred = data['predictions'][0]['sentence']
                line_conf = []   # list of per-word confidence lists
                line_pos = []    # list of per-word (start, end) position lists
                w = ''
                word_conf = []
                words = []
                word_pos = []
                positions = data['predictions'][0]['positions']
                # split the character stream into words at spaces,
                # collecting confidences and positions per word
                for i, d in enumerate(positions):
                    char = d['chars'][0]['char']
                    char_conf = d['chars'][0]['probability']
                    char_pos = (d['globalStart'], d['globalEnd'])
                    if char == ' ':
                        words.append(w)
                        w = ''
                        line_conf.append(word_conf)
                        word_conf = []
                        line_pos.append(word_pos)
                        word_pos = []
                    else:
                        w += char
                        word_conf.append(char_conf)
                        word_pos.append(char_pos)
                    if i == len(positions) - 1:
                        # flush the last word (no trailing space)
                        words.append(w)
                        line_conf.append(word_conf)
                        line_pos.append(word_pos)
                # word confidence: mid-range of its char confidences;
                # line confidence: mid-range of its word confidences
                wconfs = [(min(conf) + max(conf)) / 2 for conf in line_conf]
                lineconf = (min(wconfs) + max(wconfs)) / 2
                line.replace_TextEquiv_at(
                    0, TextEquivType(Unicode=linepred, conf=str(lineconf)))
                # BUGFIX: was `self.maxlevel == 'word' or 'glyph'`, which is
                # always truthy ('glyph' is a non-empty string), so word
                # segments were built even at line level.
                if self.maxlevel in ('word', 'glyph'):
                    box = bounding_box(line.get_Coords().points)
                    line.Word = []
                    for w_no, w in enumerate(words):
                        # Coords of word: offset the line's box horizontally
                        # by the word's first/last character positions
                        wordbounding = (line_pos[w_no][0][0],
                                        line_pos[w_no][-1][-1])
                        word_bbox = [
                            box[0] + wordbounding[0], box[1],
                            box[2] + wordbounding[1], box[3]
                        ]
                        word_id = '%s_word%04d' % (line.id, w_no)
                        word = WordType(
                            id=word_id,
                            Coords=CoordsType(
                                points_from_x0y0x1y1(word_bbox)))
                        line.add_Word(word)
                        word.add_TextEquiv(
                            TextEquivType(Unicode=w,
                                          conf=str(wconfs[w_no])))
                        if self.maxlevel == 'glyph':
                            for glyph_no, g in enumerate(w):
                                glyphbounding = (
                                    line_pos[w_no][glyph_no][0],
                                    line_pos[w_no][glyph_no][-1])
                                glyph_bbox = [
                                    box[0] + glyphbounding[0], box[1],
                                    box[2] + glyphbounding[1], box[3]
                                ]
                                glyph_id = '%s_glyph%04d' % (word.id,
                                                             glyph_no)
                                glyph = GlyphType(
                                    id=glyph_id,
                                    Coords=CoordsType(
                                        points_from_x0y0x1y1(glyph_bbox)))
                                word.add_Glyph(glyph)
                                glyph.add_TextEquiv(
                                    TextEquivType(
                                        Unicode=g,
                                        conf=str(line_conf[w_no]
                                                 [glyph_no])))
def process(self):
    """
    Perform text recognition with Calamari on the workspace.

    If ``texequiv_level`` is ``word`` or ``glyph``, then additionally
    create word / glyph level segments by splitting at white space
    characters / glyph boundaries. In the case of ``glyph``, add all
    alternative character hypotheses down to ``glyph_conf_cutoff``
    confidence threshold.
    """
    log = getLogger('processor.CalamariRecognize')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector=self.features)
        for region in page.get_AllRegions(classes=['Text']):
            region_image, region_coords = self.workspace.image_from_segment(
                region, page_image, page_coords,
                feature_selector=self.features)
            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            # collect all line images of the region for one batched
            # predictor call (faster than per-line prediction)
            line_images_np = []
            line_coordss = []
            for line in textlines:
                log.debug("Recognizing line '%s' in region '%s'",
                          line.id, region.id)
                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_coords,
                    feature_selector=self.features)
                if ('binarized' not in line_coords['features']
                        and 'grayscale_normalized' not in line_coords['features']
                        and self.network_input_channels == 1):
                    # We cannot use a feature selector for this since we don't
                    # know whether the model expects (has been trained on)
                    # binarized or grayscale images; but raw images are likely
                    # always inadequate:
                    log.warning(
                        "Using raw image for line '%s' in region '%s'",
                        line.id, region.id)
                # degenerate (zero-sized) line images get a 1-pixel dummy
                line_image = line_image if all(line_image.size) else [[0]]
                line_image_np = np.array(line_image, dtype=np.uint8)
                line_images_np.append(line_image_np)
                line_coordss.append(line_coords)
            raw_results_all = self.predictor.predict_raw(
                line_images_np, progress_bar=False)
            for line, line_coords, raw_results in zip(
                    textlines, line_coordss, raw_results_all):
                # tag each fold's prediction so the voter can distinguish them
                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)
                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                # on prediction.positions. Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
                    chars = [
                        c for c in chars
                        if c.probability >= self.parameter['glyph_conf_cutoff']
                    ]
                    # most probable alternative first
                    chars = sorted(chars, key=lambda k: k.probability,
                                   reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    # strip positions whose best char is a space, from the left
                    return list(
                        itertools.dropwhile(
                            lambda p: _sort_chars(p)[0].char == " ",
                            positions))

                def _drop_trailing_spaces(positions):
                    # reuse the leading-space logic on the reversed sequence
                    return list(
                        reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    # collapse runs of space positions into a single one
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(
                    _sort_chars(p)[0].char for p in positions)
                # sanity check: our reconstruction should match Calamari's
                # own post-processed sentence
                if line_text != prediction.sentence:
                    log.warning(
                        "Our own line text is not the same as Calamari's: '%s' != '%s'",
                        line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results",
                                line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning(
                        "Line '%s' already contained word segmentation",
                        line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv(
                    [TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            # character class changed: flush current word
                            if word:
                                yield word
                            word = c
                            spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    i = 0  # running index into positions
                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        # space runs are skipped (no Word element), but still
                        # advance the position index below
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end
                            # word box spans the full line height
                            polygon = polygon_from_x0y0x1y1([
                                word_start, 0, word_end, line_image.height
                            ])
                            points = points_from_polygon(
                                coordinates_for_segment(
                                    polygon, None, line_coords))
                            # XXX Crop to line polygon?
                            word = WordType(id='%s_word%04d' % (line.id,
                                                                word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(
                                TextEquivType(Unicode=word_text))
                            if self.parameter[
                                    'textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(
                                        word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end
                                    polygon = polygon_from_x0y0x1y1([
                                        glyph_start, 0, glyph_end,
                                        line_image.height
                                    ])
                                    points = points_from_polygon(
                                        coordinates_for_segment(
                                            polygon, None, line_coords))
                                    glyph = GlyphType(
                                        id='%s_glyph%04d' % (word.id,
                                                             glyph_no),
                                        Coords=CoordsType(points))
                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(
                                            _sort_chars(p),
                                            start=char_index_start):
                                        glyph.add_TextEquiv(
                                            TextEquivType(
                                                Unicode=char.char,
                                                index=char_index,
                                                conf=char.probability))
                                    word.add_Glyph(glyph)
                            line.add_Word(word)
                            word_no += 1
                        i += word_length

        # propagate line texts upwards to region/page level
        _page_update_higher_textequiv_levels('line', pcgts)

        # Add metadata about this operation and its runtime parameters:
        self.add_metadata(pcgts)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def _process_page(self, page, page_image, page_xywh, input_file, zoom=1.0):
    """Detect the page frame and set it as the PAGE Border.

    Detects the ruler and the page border (via line segments, with a
    text-box fallback), pads the resulting polygon, clips it to the
    image, stores it as the page's Border, and saves the cropped page
    image as an AlternativeImage in the output file group.

    :param page: PAGE PageType element to annotate
    :param page_image: PIL image of the page
    :param page_xywh: coordinate metadata of ``page_image``
    :param input_file: workspace input file (for page id / file naming)
    :param zoom: scale factor applied before detection (e.g. to reach
        300 DPI); detection results are scaled back by 1/zoom
    """
    padding = self.parameter['padding']
    img_array = pil2array(page_image)
    # ensure RGB image
    if len(img_array.shape) == 2:
        img_array = np.stack((img_array, ) * 3, axis=-1)
    height, width, _ = img_array.shape
    size = height * width
    # zoom to 300 DPI (larger density: faster; most fixed parameters here expect 300)
    if zoom != 1.0:
        self.logger.info("scaling %dx%d image by %.2f", width, height, zoom)
        img_array = cv2.resize(img_array, None, fx=zoom, fy=zoom,
                               interpolation=cv2.INTER_CUBIC)
    # detect rule placed in image next to page for scale reference:
    mask_array, mask_box = self.detect_ruler(img_array)
    # detect page frame via line segment detector:
    border_polygon, prefer_border = self.select_borderLine(
        img_array, mask_box)
    border_polygon = np.array(border_polygon) / zoom  # unzoom
    # pad inwards:
    border_polygon = Polygon(border_polygon).buffer(
        -padding).exterior.coords[:-1]
    # get the bounding box from the border polygon:
    # min_x, min_y = border_polygon.min(axis=0)
    # max_x, max_y = border_polygon.max(axis=0)
    # get the inner rectangle from the border polygon:
    # _, min_x, max_x, _ = np.sort(border_polygon[:,0])
    # _, min_y, max_y, _ = np.sort(border_polygon[:,1])
    if prefer_border:
        self.logger.info("Preferring line detector")
    else:
        self.logger.info("Falling back to text detector")
        textboxes = self.detect_textboxes(img_array, mask_array)
        if len(textboxes) > 1:
            textboxes = self.merge_boxes(textboxes, img_array)
        textboxes = np.array(textboxes) / zoom  # unzoom
        # only trust the text detector when it found exactly one box
        # covering at least columnAreaMin of the page
        if (len(textboxes) == 1 and self.parameter['columnAreaMin'] *
                size < self.get_area(textboxes[0])):
            self.logger.info("Using text area (%d%% area)",
                             100 * self.get_area(textboxes[0]) / size)
            min_x, min_y, max_x, max_y = textboxes[0]
            # pad outwards
            border_polygon = polygon_from_bbox(min_x - padding,
                                               min_y - padding,
                                               max_x + padding,
                                               max_y + padding)

    def clip(point):
        # keep the point inside the page image bounds
        x, y = point
        x = max(0, min(page_image.width, x))
        y = max(0, min(page_image.height, y))
        return x, y

    border_polygon = coordinates_for_segment(border_polygon, page_image,
                                             page_xywh)
    border_polygon = list(map(clip, border_polygon))
    border_points = points_from_polygon(border_polygon)
    border = BorderType(Coords=CoordsType(border_points))
    page.set_Border(border)
    # get clipped relative coordinates for current image
    page_image, page_xywh, _ = self.workspace.image_from_page(
        page, input_file.pageId, fill='background', transparency=True)
    file_id = make_file_id(input_file, self.output_file_grp)
    file_path = self.workspace.save_image_file(
        page_image,
        file_id + '.IMG-CROP',
        page_id=input_file.pageId,
        file_grp=self.output_file_grp)
    page.add_AlternativeImage(
        AlternativeImageType(filename=file_path,
                             comments=page_xywh['features']))