def process(self):
    """
    Segment with ocropy.

    For each input file: reads the page image (binarized on the fly),
    estimates the text scale, computes a line segmentation, wraps all
    detected lines in a single page-sized dummy TextRegion, and writes
    the resulting PAGE XML to the output file group.
    """
    for (n, input_file) in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        downloaded_file = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", downloaded_file)
        pcgts = page_from_file(downloaded_file)
        # page dimensions are needed below to build the dummy region polygon
        page_width = pcgts.get_Page().get_imageWidth()
        page_height = pcgts.get_Page().get_imageHeight()
        # TODO binarized variant from get_AlternativeImage()
        image_url = pcgts.get_Page().imageFilename
        log.info("pcgts %s", pcgts)
        binary = ocrolib.read_image_binary(
            self.workspace.download_url(image_url))
        # ocrolib convention: 1 = ink, 0 = background
        binary = 1 - binary
        # scale parameter of 0 means "estimate from the image"
        scale = self.parameter['scale'] if self.parameter[
            'scale'] != 0 else psegutils.estimate_scale(binary)
        log.debug(binary)
        pseg = self.compute_segmentation(binary, scale)
        log.debug("pseg=%s", pseg)
        # TODO reading order / enumber
        # log.debug("finding reading order")
        # lines = psegutils.compute_lines(pseg, scale)
        # order = psegutils.reading_order([l.bounds for l in lines])
        # lsort = psegutils.topsort(order)
        regions = ocrolib.RegionExtractor()
        regions.setPageLines(pseg)
        # one dummy region spanning the whole page holds all text lines
        dummyRegion = TextRegionType(
            id="dummy",
            Coords=CoordsType(
                points="0,0 %s,0 %s,%s 0,%s" % (page_width, page_width,
                                                page_height, page_height)))
        pcgts.get_Page().add_TextRegion(dummyRegion)
        # region 0 is skipped — presumably the background label; confirm
        for lineno in range(1, regions.length()):
            log.debug("id=%s bbox=%s", regions.id(lineno),
                      regions.bbox(lineno))
            textline = TextLineType(
                id=concat_padded("line", lineno),
                Coords=CoordsType(
                    points=points_from_y0x0y1x1(regions.bbox(lineno))))
            dummyRegion.add_TextLine(textline)
        ID = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=ID,
                                file_grp=self.output_file_grp,
                                mimetype=MIMETYPE_PAGE,
                                local_filename="%s/%s.xml" %
                                (self.output_file_grp, ID),
                                content=to_xml(pcgts))
def process(self):
    """
    Performs the binarization.

    Depending on the ``level-of-operation`` parameter, binarizes the page
    image, each text region image, or each text line image with kraken's
    nlbin, and stores the result(s) as PNG files in the output file group.
    """
    log = getLogger('processor.KrakenBinarize')
    log.debug('Level of operation: "%s"', self.parameter['level-of-operation'])
    log.debug('Input file group %s', self.input_file_grp)
    log.debug('Input files %s', [str(f) for f in self.input_files])
    for (n, input_file) in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        image_url = pcgts.get_Page().imageFilename
        log.info("pcgts %s", pcgts)
        if self.parameter['level-of-operation'] == 'page':
            # page level: binarize the full page image, emit one PNG per page
            log.info("About to binarize page '%s'", pcgts.pcGtsId)
            image = self.workspace.resolve_image_as_pil(image_url)
            bin_image = kraken.binarization.nlbin(image)
            bin_image_bytes = io.BytesIO()
            bin_image.save(bin_image_bytes, format='PNG')
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(self.output_file_grp,
                                    pageId=input_file.pageId,
                                    ID=ID,
                                    mimetype='image/png',
                                    local_filename="%s/%s" %
                                    (self.output_file_grp, ID),
                                    content=bin_image_bytes.getvalue())
        else:
            for region in pcgts.get_Page().get_TextRegion():
                if self.parameter['level-of-operation'] == 'block':
                    # block level: crop the region polygon from the page image
                    # NOTE(review): this branch only resolves the region image;
                    # the binarize/save code below sits inside the line loop,
                    # so block-level output appears never to be written —
                    # confirm intended placement.
                    log.info("About to binarize region '%s'", region.id)
                    image = self.workspace.resolve_image_as_pil(
                        image_url,
                        polygon_from_points(region.get_Coords().points))
                else:
                    # line level: binarize every text line of the region
                    textlines = region.get_TextLine()
                    log.info("About to binarize %i lines of region '%s'",
                             len(textlines), region.id)
                    for (line_no, line) in enumerate(textlines):
                        log.debug("Binarizing line '%s' in region '%s'",
                                  line_no, region.id)
                        image = self.workspace.resolve_image_as_pil(
                            image_url,
                            polygon_from_points(line.get_Coords().points))
                        bin_image = kraken.binarization.nlbin(image)
                        bin_image_bytes = io.BytesIO()
                        bin_image.save(bin_image_bytes, format='PNG')
                        # file ID carries page index, region and line number
                        ID = concat_padded(self.output_file_grp, n,
                                           region.id, line_no)
                        self.workspace.add_file(
                            self.output_file_grp,
                            pageId=input_file.pageId,
                            ID=ID,
                            local_filename="%s/%s" %
                            (self.output_file_grp, ID),
                            mimetype='image/png',
                            content=bin_image_bytes.getvalue())
def file_id(self, file_grp):
    """Derive an output file ID in *file_grp* from the current input file.

    Prefers the input file's ID with the input group name swapped for
    *file_grp*; if that substitution changes nothing, falls back to a
    zero-padded ID built from the group name and the page number.
    """
    candidate = self.input_file.ID.replace(self.input_file_grp, file_grp)
    if candidate != self.input_file.ID:
        return candidate
    return concat_padded(file_grp, self.page_num)
def process(self):
    """
    Performs the line segmentation.

    Crops each text region from the page image, runs tesseract textline
    detection on it, translates the line boxes back into page coordinates,
    and appends them as TextLine elements before writing the PAGE XML.
    """
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            for region in pcgts.get_Page().get_TextRegion():
                log.debug("Detecting lines in %s with tesseract", region.id)
                image = self.workspace.resolve_image_as_pil(
                    image_url,
                    polygon_from_points(region.get_Coords().points))
                tessapi.SetImage(image)
                # region origin, used to shift line boxes from region-local
                # back to absolute page coordinates
                offset = xywh_from_points(region.get_Coords().points)
                for (line_no, component) in enumerate(
                        tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                    line_id = '%s_line%04d' % (region.id, line_no)
                    line_xywh = component[1]
                    line_xywh['x'] += offset['x']
                    line_xywh['y'] += offset['y']
                    line_points = points_from_xywh(line_xywh)
                    # NOTE(review): points passed positionally here, while
                    # sibling processors use Coords=CoordsType(points=...) —
                    # confirm the first positional arg of CoordsType is points
                    region.add_TextLine(
                        TextLineType(id=line_id,
                                     Coords=CoordsType(line_points)))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """
    Performs the binarization.

    Binarizes every text line of every region with kraken's nlbin and
    stores one PNG per line in the output file group.

    Fixes over the previous revision:
    - removed a leftover ``print(dir(kraken.binarization))`` debug call;
    - the output file ID used to be ``concat_padded(grp, n)`` for *every*
      line of a page, so each line overwrote the previous one under the
      same ID/basename; the ID now includes region id and line number
      (matching the sibling kraken binarizer).
    """
    for (n, input_file) in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        image_url = pcgts.get_Page().imageFilename
        log.info("pcgts %s", pcgts)
        for region in pcgts.get_Page().get_TextRegion():
            textlines = region.get_TextLine()
            log.info("About to binarize %i lines of region '%s'",
                     len(textlines), region.id)
            for (line_no, line) in enumerate(textlines):
                log.debug("Binarizing line '%s' in region '%s'", line_no,
                          region.id)
                # crop the line polygon out of the page image
                image = self.workspace.resolve_image_as_pil(
                    image_url, polygon_from_points(line.get_Coords().points))
                bin_image = kraken.binarization.nlbin(image)
                bin_image_bytes = io.BytesIO()
                bin_image.save(bin_image_bytes, format='PNG')
                # unique ID per line, so lines no longer overwrite each other
                ID = concat_padded(self.output_file_grp, n, region.id, line_no)
                self.workspace.add_file(
                    self.output_file_grp,
                    pageId=input_file.pageId,
                    ID=ID,
                    basename="%s.bin.png" % ID,
                    mimetype='image/png',
                    content=bin_image_bytes.getvalue())
def process(self):
    """
    Performs the line segmentation.

    Crops each text line from the page image, runs tesseract word
    detection (PSM.SINGLE_LINE) on it, shifts the word boxes back to page
    coordinates, and appends them as Word elements to the line.
    """
    with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX,
    ) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            for region in pcgts.get_Page().get_TextRegion():
                for line in region.get_TextLine():
                    log.debug("Detecting words in line '%s'", line.id)
                    image = self.workspace.resolve_image_as_pil(
                        image_url,
                        polygon_from_points(line.get_Coords().points))
                    tessapi.SetImage(image)
                    # line origin, to translate word boxes from line-local
                    # into absolute page coordinates
                    offset = xywh_from_points(line.get_Coords().points)
                    for (word_no, component) in enumerate(
                            tessapi.GetComponentImages(RIL.WORD, True)):
                        word_id = '%s_word%04d' % (line.id, word_no)
                        word_xywh = component[1]
                        word_xywh['x'] += offset['x']
                        word_xywh['y'] += offset['y']
                        line.add_Word(
                            WordType(id=word_id,
                                     Coords=CoordsType(
                                         points_from_xywh(word_xywh))))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def _process_segment(self, model, dataset, page, page_xywh, page_id,
                     input_file, orig_img_size, n):
    """Run pix2pixHD inference on each dataset item and attach the
    dewarped, binarized result to the page as an AlternativeImage.

    ``orig_img_size`` is the (width, height) the output is resized back to.
    """
    for i, data in enumerate(dataset):
        w, h = orig_img_size
        generated = model.inference(data['label'], data['inst'],
                                    data['image'])
        # CHW tensor -> HWC numpy array
        dewarped = array(generated.data[0].permute(1, 2, 0).detach().cpu())
        # threshold at midrange to get a binary image
        bin_array = array(255 * (dewarped > ocrolib.midrange(dewarped)), 'B')
        dewarped = ocrolib.array2pil(bin_array)
        dewarped = dewarped.resize((w, h))
        page_xywh['features'] += ',dewarped'
        # derive the image file ID from the input file ID; fall back to a
        # padded counter when the group substitution changes nothing
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)
        file_path = self.workspace.save_image_file(
            dewarped,
            file_id,
            page_id=page_id,
            file_grp=self.image_grp,
            force=self.parameter['force'])
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
def process(self):
    """Page-level deskewing driver.

    Splits the output group into PAGE and image groups (falling back for
    the latter), records a processingStep metadata item, and delegates the
    actual deskewing of each page image to ``_process_segment``.
    """
    try:
        self.page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        self.page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        # record this processing step (with all parameters) in the metadata
        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        #externalRef="parameters",
                        Label=[
                            LabelType(type_=name, value=self.parameter[name])
                            for name in self.parameter.keys()
                        ])
                ]))
        page = pcgts.get_Page()
        angle = page.get_orientation()
        if angle:
            LOG.warning('Overwriting existing deskewing angle: %i', angle)
        # request an image that has not already been deskewed
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_filter='deskewed')
        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n)
        else:
            # only page-level operation is implemented
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            break
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Classify the typegroup (font family) of each page image.

    Runs the typegroups classifier on each page image, normalizes the
    per-typegroup scores, and writes the result into the page's TextStyle
    ``fontFamily`` (or clears primaryScript when only noise classes score).

    Fixes over the previous revision:
    - scored typegroups were collected in a dict keyed by *score*, so two
      typegroups with an equal score silently collapsed into one; a list of
      (score, typegroup) pairs preserves all of them;
    - guards against ZeroDivisionError when all non-noise scores are <= 0.
    """
    network_file = self.parameter['network']
    stride = self.parameter['stride']
    classifier = TypegroupsClassifier.load(network_file)
    # classes that indicate "not text" content
    ignore_type = ('Adornment', 'Book covers and other irrelevant data',
                   'Empty Pages', 'Woodcuts - Engravings')
    self.log.debug('Processing: %s', self.input_files)
    for (_, input_file) in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        image_url = pcgts.get_Page().imageFilename
        pil_image = self.workspace.resolve_image_as_pil(image_url)
        result = classifier.run(pil_image, stride)
        # normalization denominator: sum of positive non-noise scores
        score_sum = 0
        for typegroup in classifier.classMap.cl2id:
            if typegroup not in ignore_type:
                score_sum += max(0, result[typegroup])
        if score_sum <= 0:
            # avoid division by zero below; all scores normalize to <= 0
            # and are filtered out by the intk <= 0 check anyway
            score_sum = 1
        script_highscore = 0
        noise_highscore = 0
        # list of (normalised_score, typegroup) — a dict keyed by score
        # would drop typegroups whose scores collide
        scored_types = []
        output = ''
        for typegroup in classifier.classMap.cl2id:
            score = result[typegroup]
            if typegroup in ignore_type:
                noise_highscore = max(noise_highscore, score)
            else:
                script_highscore = max(script_highscore, score)
                normalised_score = max(0, score / score_sum)
                scored_types.append((normalised_score, typegroup))
        if noise_highscore > script_highscore:
            # page is dominated by a noise class: no script to report
            pcgts.get_Page().set_primaryScript(None)
            self.log.debug(
                'Detected only noise (such as empty page or book cover). noise_highscore=%s > script_highscore=%s',
                noise_highscore, script_highscore)
        else:
            # build "Type:score, Type:score" string, best first
            for normalised_score, typegroup in sorted(scored_types,
                                                      reverse=True):
                intk = round(100 * normalised_score)
                if intk <= 0:
                    continue
                if output != '':
                    output = '%s, ' % output
                output = '%s%s:%d' % (output, typegroup, intk)
            self.log.debug('Detected %s' % output)
            page = pcgts.get_Page()
            textStyle = page.get_TextStyle()
            if not textStyle:
                textStyle = TextStyleType()
                page.set_TextStyle(textStyle)
            textStyle.set_fontFamily(output)
            ID = concat_padded(self.output_file_grp, input_file.ID)
            self.workspace.add_file(ID=ID,
                                    file_grp=self.output_file_grp,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s" %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts))
def process(self):
    """
    Performs the cropping.

    Detects all text blocks with tesseract and sets the page Border to the
    union (bounding box) of their polygons.

    Fixes over the previous revision: the bounds update used an ``if/elif``
    chain, so whenever a min-coordinate update fired the corresponding
    max-coordinate checks were skipped entirely, yielding too-small page
    borders. The four bounds are independent and are now updated
    independently.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # print(self.input_file_grp)
        for (n, input_file) in enumerate(self.input_files):
            # print(input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)
            log.debug("Cropping with tesseract")
            tessapi.SetImage(image)
            #
            # helper variables for saving the box coordinates
            # (start inverted so any detected point shrinks/grows them)
            #
            min_x = image.width
            min_y = image.height
            max_x = 0
            max_y = 0
            # iterate over all boxes and compare their extent
            # to the min and max values
            for component in tessapi.GetComponentImages(
                    tesserocr.RIL.BLOCK, True):
                points, index = points_from_xywh(component[1]), component[2]
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                for pair in points.split(' '):
                    x, y = (int(pair.split(',')[0]), int(pair.split(',')[1]))
                    # all four bounds must be checked independently
                    min_x = min(min_x, x)
                    min_y = min(min_y, y)
                    max_x = max(max_x, x)
                    max_y = max(max_y, y)
                log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" %
                          (min_x, min_y, max_x, min_y, max_x, max_y, min_x,
                           max_y))
            #
            # set the identified page border
            #
            brd = BorderType(
                Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                  (min_x, min_y, max_x, min_y, max_x, max_y,
                                   min_x, max_y)))
            pcgts.get_Page().set_Border(brd)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def file_id(self):
    """Derive the output file ID from the current input file.

    Swaps the input group name for the output group name inside the input
    file's ID; if the substitution leaves the ID unchanged, falls back to
    a zero-padded ID built from the output group and the page number.
    """
    derived = self.input_file.ID.replace(self.input_file_grp,
                                         self.output_file_grp)
    if derived != self.input_file.ID:
        return derived
    return ocrd_utils.concat_padded(self.output_file_grp, self.page_num)
def process(self): """ Performs the (text) recognition. """ # print(self.parameter) self.maxlevel = self.parameter['textequiv_level'] linesdir = self.parameter['linesdir'] if self.maxlevel not in ['line', 'word', 'glyph']: raise Exception( "currently only implemented at the line/glyph level") root, _, files = os.walk(linesdir).__next__() self.root = root predfiles = [] for file in files: if '.pred' in file: predfiles.append(file[:-9]) ######################################################################################## # self.log.info("Using model %s in %s for recognition", model) for (n, input_file) in enumerate(self.input_files): # self.log.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) self.log.info("Processing text in page '%s'", pcgts.get_pcGtsId()) page = pcgts.get_Page() index = input_file.url.rfind('/') + 1 fgrp = input_file.url[index:-4] # region, line, word, or glyph level: regions = page.get_TextRegion() if not regions: self.log.warning("Page contains no text regions") self.process_regions(regions, predfiles, fgrp) ID = concat_padded(self.output_file_grp, n) self.log.info('creating file id: %s, name: %s, file_grp: %s', ID, input_file.basename, self.output_file_grp) # Use the input file's basename for the new file # this way the files retain the same basenames. out = self.workspace.add_file( ID=ID, file_grp=self.output_file_grp, pageId=input_file.pageId, basename=self.output_file_grp + '-' + input_file.basename, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts), ) self.log.info('created file %s', out)
def process(self):
    """
    Segment with kraken.

    Runs kraken's page segmenter (optionally followed by script
    detection) on each page image, collects the detected line boxes into
    a single dummy TextRegion, and writes the PAGE XML result.
    """
    log = getLogger('processor.KrakenSegment')
    for (n, input_file) in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        downloaded_file = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", downloaded_file)
        pcgts = page_from_file(downloaded_file)
        # TODO binarized variant from get_AlternativeImage()
        image_url = pcgts.get_Page().imageFilename
        log.info("pcgts %s", pcgts)
        im = self.workspace.resolve_image_as_pil(image_url)
        log.info('Segmenting')
        log.info('Params %s', self.parameter)
        res = segment(im, self.parameter['text_direction'],
                      self.parameter['scale'],
                      self.parameter['maxcolseps'],
                      self.parameter['black_colseps'])
        if self.parameter['script_detect']:
            # augment the segmentation result with per-line script info
            res = detect_scripts(im, res)
        # all detected lines go into one dummy region
        dummyRegion = TextRegionType()
        pcgts.get_Page().add_TextRegion(dummyRegion)
        # print(res)
        for lineno, box in enumerate(res['boxes']):
            textline = TextLineType(
                id=concat_padded("line", lineno),
                Coords=CoordsType(points=points_from_x0y0x1y1(box)))
            dummyRegion.add_TextLine(textline)
        ID = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(self.output_file_grp,
                                pageId=input_file.pageId,
                                ID=ID,
                                mimetype=MIMETYPE_PAGE,
                                local_filename="%s/%s.xml" %
                                (self.output_file_grp, ID),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Page-level processing driver.

    Splits the output group into PAGE and image groups (with fallback),
    records a processingStep metadata item per page, and delegates the
    page-level work to ``_process_segment`` before writing the PAGE XML.
    """
    try:
        self.page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        self.page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        # image file ID derived from the input ID (used below with ".ds")
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        # record this processing step (with all parameters) in the metadata
        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        #externalRef="parameters",
                        Label=[
                            LabelType(type_=name, value=self.parameter[name])
                            for name in self.parameter.keys()
                        ])
                ]))
        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)
        if oplevel == "page":
            self._process_segment(page, page_image.filename, page_id,
                                  file_id + ".ds")
        # rebind file_id for the PAGE output (falls back to padded counter)
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """
    Performs the (text) recognition.

    For every word, copies the TextEquiv entry at ``mainIndex`` into a new
    TextEquiv, then aggregates the word texts into line-, region- and
    page-level TextEquivs, and writes the result as a new PAGE file.

    Fix over the previous revision: the line-level TextEquiv was filled
    with ``regioncontent`` (the accumulated text of *preceding* lines)
    instead of the line's own ``linecontent``.
    """
    mainIndex = self.parameter['mainIndex']
    for (n, input_file) in enumerate(self.input_files):
        alignurl = input_file.url
        pcgts = parse(alignurl, True)
        page = pcgts.get_Page()
        regions = page.get_TextRegion()
        pagecontent = ''
        for region in regions:
            regioncontent = ''
            lines = region.get_TextLine()
            for line in lines:
                linecontent = ''
                words = line.get_Word()
                for word in words:
                    wordunicode = word.get_TextEquiv()[mainIndex].Unicode
                    word.add_TextEquiv(TextEquivType(Unicode=wordunicode))
                    linecontent += ' ' + wordunicode
                # the line gets its own text, not the region accumulator
                line.add_TextEquiv(TextEquivType(Unicode=linecontent))
                regioncontent += '\n' + linecontent
            region.add_TextEquiv(TextEquivType(Unicode=regioncontent))
            pagecontent += '\n' + regioncontent
        page.add_TextEquiv(TextEquivType(Unicode=pagecontent))
        ID = concat_padded(self.output_file_grp, n)
        self.log.info('creating file id: %s, name: %s, file_grp: %s', ID,
                      input_file.basename, self.output_file_grp)
        # Use the input file's basename for the new file
        # this way the files retain the same basenames.
        out = self.workspace.add_file(
            ID=ID,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            basename=self.output_file_grp + '-' + input_file.basename,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
        )
        self.log.info('created file %s', out)
def _process_segment(self, page_image, page, page_xywh, page_id, input_file,
                     n, mrcnn_model, class_names):
    """Detect block regions with Mask R-CNN, crop them out of the page
    image, save each crop as an AlternativeImage and add a TextRegion.

    Fixes over the previous revision (paragraph box expansion):
    - ``if (min_x - 5) > width`` could never be true (a coordinate is
      always smaller than the dimension), so the box never grew upward;
      the intended check is that there is room above, i.e. ``> 0``;
    - the second branch incremented ``min_x`` instead of ``max_x``.
    """
    img_array = ocrolib.pil2array(page_image)
    results = mrcnn_model.detect([img_array], verbose=1)
    r = results[0]
    page_xywh['features'] += ',blksegmented'
    # NOTE: numpy shape is (rows, cols, channels), so "width" here is
    # actually the row count; rois appear to be (y1, x1, y2, x2), making
    # min_x/max_x row indices — internally consistent, but confirm.
    width, height, _ = img_array.shape
    for i in range(len(r['rois'])):
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        # small post-processing in case of paragraph (class id 2) so the
        # last letters are not cut off: grow the box where room permits
        if (min_x - 5) > 0 and r['class_ids'][i] == 2:
            min_x -= 5
        if (max_x + 10) < width and r['class_ids'][i] == 2:
            max_x += 10
        # extract the region crop from the page array
        region_img = img_array[min_x:max_x, min_y:max_y]
        region_img = ocrolib.array2pil(region_img)
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)
        file_path = self.workspace.save_image_file(region_img,
                                                   file_id + "_" + str(i),
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        ai = AlternativeImageType(filename=file_path,
                                  comments=page_xywh['features'])
        # NOTE(review): PAGE Coords expect x,y = column,row; with min_x as a
        # row index this polygon looks transposed — confirm upstream usage.
        coords = CoordsType(
            "%i,%i %i,%i %i,%i %i,%i" %
            (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))
        textregion = TextRegionType(Coords=coords,
                                    type_=class_names[r['class_ids'][i]])
        textregion.add_AlternativeImage(ai)
        page.add_TextRegion(textregion)
def process(self):
    """
    Performs the region segmentation.

    Detects text blocks with tesseract, registers each one both as a
    TextRegion and as an entry in the page's ReadingOrder (creating the
    ReadingOrder/OrderedGroup containers on first use).
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # print(self.input_file_grp)
        for (n, input_file) in enumerate(self.input_files):
            # print(input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)
            log.debug("Detecting regions with tesseract")
            tessapi.SetImage(image)
            for component in tessapi.GetComponentImages(
                    tesserocr.RIL.BLOCK, True):
                points, index = points_from_xywh(component[1]), component[2]
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                # <pg:ReadingOrder> (create once per page, on demand)
                ro = pcgts.get_Page().get_ReadingOrder()
                if ro is None:
                    ro = ReadingOrderType()
                    pcgts.get_Page().set_ReadingOrder(ro)
                # <pg:OrderedGroup> (create once per page, on demand)
                og = ro.get_OrderedGroup()
                if og is None:
                    og = OrderedGroupType(id="reading-order")
                    ro.set_OrderedGroup(og)
                # <pg:RegionRefIndexed>
                og.add_RegionRefIndexed(
                    RegionRefIndexedType(regionRef=ID, index=index))
                #
                # text region
                #
                pcgts.get_Page().add_TextRegion(
                    TextRegionType(id=ID, Coords=CoordsType(points=points)))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """Score the text of every input page and write the result back.

    Parses each downloaded PAGE file, delegates the actual scoring to
    ``_process_page``, and stores the updated document under a derived
    file ID in the output group.
    """
    for idx, in_file in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", idx, in_file)
        local_file = self.workspace.download_file(in_file)
        pcgts = parse(local_file.url, silence=True)
        LOG.info("Scoring text in page '%s' at the %s level",
                 pcgts.get_pcGtsId(), self.parameter['textequiv_level'])
        self._process_page(pcgts)
        # persist the scored document
        out_id = concat_padded(self.output_file_grp, idx)
        self.workspace.add_file(
            ID=out_id,
            file_grp=self.output_file_grp,
            local_filename=os.path.join(self.output_file_grp, out_id),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
        )
def _process_segment(self, page_image, page, page_xywh, page_id, input_file,
                     n):
    """Text/image segmentation (tiseg) of a page image.

    Builds a seed-fill mask separating text from non-text pixels,
    binarizes the text part, saves it as an image file, and attaches it
    to the page as an AlternativeImage tagged 'tiseged'.
    """
    I = ocrolib.pil2array(page_image)
    # collapse color images to a single channel
    if len(I.shape) > 2:
        I = np.mean(I, 2)
    # normalize and invert: 1 = ink, 0 = background
    I = 1 - I / I.max()
    rows, cols = I.shape
    # Generate Mask and Seed Images
    Imask, Iseed = self.pixMorphSequence_mask_seed_fill_holes(I)
    # Iseedfill: Union of Mask and Seed Images
    Iseedfill = self.pixSeedfillBinary(Imask, Iseed)
    # Dilation of Iseedfill
    mask = ones((3, 3))
    Iseedfill = ndimage.binary_dilation(Iseedfill, mask)
    # Expansion of Iseedfill to become equal in size of I
    Iseedfill = self.expansion(Iseedfill, (rows, cols))
    # Write Text and Non-Text images
    image_part = array((1 - I * Iseedfill), dtype=int)
    image_part[0, 0] = 0  # only for visualisation purpose
    text_part = array((1 - I * (1 - Iseedfill)), dtype=int)
    text_part[0, 0] = 0  # only for visualisation purpose
    page_xywh['features'] += ',tiseged'
    # threshold the text part at midrange to get a binary image
    bin_array = array(255 * (text_part > ocrolib.midrange(text_part)), 'B')
    bin_image = ocrolib.array2pil(bin_array)
    # derive the image file ID; fall back to padded counter when the
    # group substitution changes nothing
    file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
    if file_id == input_file.ID:
        file_id = concat_padded(self.image_grp, n)
    file_path = self.workspace.save_image_file(bin_image,
                                               file_id,
                                               page_id=page_id,
                                               file_grp=self.image_grp)
    page.add_AlternativeImage(
        AlternativeImageType(filename=file_path,
                             comments=page_xywh['features']))
def process(self):
    """Repair inconsistent text results on each page.

    Walks regions -> lines -> words of every page and applies the
    ``_fix_*`` helpers, skipping regions whose reading direction or line
    order would make the repair unsafe.
    """
    for idx, in_file in enumerate(self.input_files):
        page_id = in_file.pageId or in_file.ID
        LOG.info("INPUT FILE %i / %s", idx, page_id)
        pcgts = page_from_file(self.workspace.download_file(in_file))
        page = pcgts.get_Page()
        for region in page.get_TextRegion():
            # only left-to-right regions can be repaired safely
            if region.readingDirection != 'left-to-right':
                LOG.info('Not processing region "%s" (not left-to-right)',
                         region.id)
                continue
            # multi-line regions additionally require top-to-bottom order
            multiline = len(region.get_TextLine()) > 1
            if multiline and region.textLineOrder != 'top-to-bottom':
                LOG.info('Not processing region "%s" (not top-to-bottom)',
                         region.id)
                continue
            _fix_lines(region)
            for line in region.get_TextLine():
                _fix_words(line)
                for word in line.get_Word():
                    _fix_glyphs(word)
        # derive the output file ID; fall back to a padded counter
        out_id = in_file.ID.replace(self.input_file_grp,
                                    self.output_file_grp)
        if out_id == in_file.ID:
            out_id = concat_padded(self.output_file_grp, idx)
        self.workspace.add_file(ID=out_id,
                                file_grp=self.output_file_grp,
                                pageId=in_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, out_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Crop each page to its detected text area and set the page Border.

    Removes ruler artifacts, detects candidate text areas, and picks the
    final border either from the detected areas or from border-line
    detection as a fallback.
    """
    for (n, input_file) in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        fname = pcgts.get_Page().imageFilename
        img = self.workspace.resolve_image_as_pil(fname)
        #fname = str(fname)
        print("Process file: ", fname)
        base, _ = ocrolib.allsplitext(fname)
        img_array = ocrolib.pil2array(img)
        # binarize via midrange threshold (integer array)
        img_array_bin = np.array(img_array > ocrolib.midrange(img_array),
                                 'i')
        lineDetectH = []
        lineDetectV = []
        img_array_rr = self.remove_rular(img_array)
        textarea, img_array_rr_ta, height, width = self.detect_textarea(
            img_array_rr)
        # colSeparator is a fraction of the page width; convert to pixels
        self.parameter['colSeparator'] = int(
            width * self.parameter['colSeparator'])
        if len(textarea) > 1:
            # several candidates: crop to the most plausible one
            textarea = self.crop_area(textarea, img_array_bin,
                                      img_array_rr_ta)
            if len(textarea) == 0:
                # cropping rejected everything: fall back to line detection
                min_x, min_y, max_x, max_y = self.select_borderLine(
                    img_array_rr, lineDetectH, lineDetectV)
            else:
                min_x, min_y, max_x, max_y = textarea[0]
        elif len(textarea) == 1 and (
                height * width * 0.5 <
            (abs(textarea[0][2] - textarea[0][0]) *
             abs(textarea[0][3] - textarea[0][1]))):
            # single candidate covering more than half the page: pad it
            x1, y1, x2, y2 = textarea[0]
            x1 = x1 - 20 if x1 > 20 else 0
            x2 = x2 + 20 if x2 < width - 20 else width
            y1 = y1 - 40 if y1 > 40 else 0
            y2 = y2 + 40 if y2 < height - 40 else height
            #self.save_pf(base, [x1, y1, x2, y2])
            min_x, min_y, max_x, max_y = textarea[0]
        else:
            # no usable candidate: fall back to border-line detection
            min_x, min_y, max_x, max_y = self.select_borderLine(
                img_array_rr, lineDetectH, lineDetectV)
        brd = BorderType(
            Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                              (min_x, min_y, max_x, min_y, max_x, max_y,
                               min_x, max_y)))
        pcgts.get_Page().set_Border(brd)
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def _process_segment(self, page_image, page, page_xywh, page_id, input_file,
                     n):
    """Estimate skew, deskew, normalize and binarize a page image
    (ocropus-nlbin style), then attach the result as an AlternativeImage
    tagged 'deskewed' and record the angle as the page orientation.
    """
    raw = ocrolib.pil2array(page_image)
    flat = raw.astype("float64")
    # estimate skew angle and rotate
    if self.parameter['maxskew'] > 0:
        if self.parameter['parallel'] < 2:
            LOG.info("Estimating Skew Angle")
        d0, d1 = flat.shape
        # bignore: fraction of the border ignored during estimation
        o0, o1 = int(self.parameter['bignore'] * d0), int(
            self.parameter['bignore'] * d1)
        flat = amax(flat) - flat
        flat -= amin(flat)
        est = flat[o0:d0 - o0, o1:d1 - o1]
        ma = self.parameter['maxskew']
        ms = int(2 * self.parameter['maxskew'] *
                 self.parameter['skewsteps'])
        # search the best angle in [-maxskew, +maxskew]
        angle = self.estimate_skew_angle(est, linspace(-ma, ma, ms + 1))
        flat = interpolation.rotate(flat,
                                    angle,
                                    mode='constant',
                                    reshape=0)
        flat = amax(flat) - flat
    else:
        angle = 0
    # self.write_angles_to_pageXML(base,angle)
    # estimate low and high thresholds
    if self.parameter['parallel'] < 2:
        LOG.info("Estimating Thresholds")
    d0, d1 = flat.shape
    o0, o1 = int(self.parameter['bignore'] * d0), int(
        self.parameter['bignore'] * d1)
    est = flat[o0:d0 - o0, o1:d1 - o1]
    if self.parameter['escale'] > 0:
        # by default, we use only regions that contain
        # significant variance; this makes the percentile
        # based low and high estimates more reliable
        e = self.parameter['escale']
        v = est - filters.gaussian_filter(est, e * 20.0)
        v = filters.gaussian_filter(v**2, e * 20.0)**0.5
        v = (v > 0.3 * amax(v))
        v = morphology.binary_dilation(v,
                                       structure=ones((int(e * 50), 1)))
        v = morphology.binary_dilation(v,
                                       structure=ones((1, int(e * 50))))
        if self.parameter['debug'] > 0:
            imshow(v)
            ginput(1, self.parameter['debug'])
        est = est[v]
    lo = stats.scoreatpercentile(est.ravel(), self.parameter['lo'])
    hi = stats.scoreatpercentile(est.ravel(), self.parameter['hi'])
    # rescale the image to get the gray scale image
    if self.parameter['parallel'] < 2:
        LOG.info("Rescaling")
    flat -= lo
    flat /= (hi - lo)
    flat = clip(flat, 0, 1)
    if self.parameter['debug'] > 0:
        imshow(flat, vmin=0, vmax=1)
        ginput(1, self.parameter['debug'])
    # final binarization at the configured threshold
    deskewed = 1 * (flat > self.parameter['threshold'])
    # output the normalized grayscale and the thresholded images
    #LOG.info("%s lo-hi (%.2f %.2f) angle %4.1f" %(lo, hi, angle))
    #TODO: Need some clarification as the results effect the following pre-processing steps.
    #orientation = -angle
    #orientation = 180 - ((180 - orientation) % 360)
    if angle is None:  # FIXME: quick fix to prevent angle of "none"
        angle = 0
    page.set_orientation(angle)
    page_xywh['features'] += ',deskewed'
    bin_array = array(255 * (deskewed > ocrolib.midrange(deskewed)), 'B')
    page_image = ocrolib.array2pil(bin_array)
    # derive the image file ID; fall back to a padded counter
    file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
    if file_id == input_file.ID:
        file_id = concat_padded(self.image_grp, n)
    file_path = self.workspace.save_image_file(page_image,
                                               file_id,
                                               page_id=page_id,
                                               file_grp=self.image_grp)
    page.add_AlternativeImage(
        AlternativeImageType(filename=file_path,
                             comments=page_xywh['features']))
def process(self): """ Performs the (text) recognition. """ # print(self.parameter) log.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages()) maxlevel = self.parameter['textequiv_level'] model = get_languages()[1][-1] # last installed model if 'model' in self.parameter: model = self.parameter['model'] if model not in get_languages()[1]: raise Exception("configured model " + model + " is not installed") with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi: log.info("Using model '%s' in %s for recognition at the %s level", model, get_languages()[0], maxlevel) # todo: populate GetChoiceIterator() with LSTM models, too: #tessapi.SetVariable("lstm_choice_mode", "2") # todo: determine relevancy of these variables: # tessapi.SetVariable("tessedit_single_match", "0") # # tessedit_load_sublangs # tessedit_preserve_min_wd_len 2 # tessedit_prefer_joined_punct 0 # tessedit_write_rep_codes 0 # tessedit_parallelize 0 # tessedit_zero_rejection 0 # tessedit_zero_kelvin_rejection 0 # tessedit_reject_mode 0 # tessedit_use_reject_spaces 1 # tessedit_fix_fuzzy_spaces 1 # tessedit_char_blacklist # tessedit_char_whitelist # chs_leading_punct ('`" # chs_trailing_punct1 ).,;:?! 
# chs_trailing_punct2 )'`" # numeric_punctuation ., # unrecognised_char | # ok_repeated_ch_non_alphanum_wds -?*= # conflict_set_I_l_1 Il1[] # preserve_interword_spaces 0 # tessedit_enable_dict_correction 0 # tessedit_enable_bigram_correction 1 # stopper_smallword_size 2 # wordrec_max_join_chunks 4 # suspect_space_level 100 # suspect_short_words 2 # language_model_ngram_on 0 # language_model_ngram_order 8 # language_model_min_compound_length 3 # language_model_penalty_non_freq_dict_word 0.1 # language_model_penalty_non_dict_word 0.15 # language_model_penalty_punc 0.2 # language_model_penalty_case 0.1 # language_model_penalty_script 0.5 # language_model_penalty_chartype 0.3 # language_model_penalty_spacing 0.05 # textord_max_noise_size 7 # enable_noise_removal 1 # classify_bln_numeric_mode 0 # lstm_use_matrix 1 # user_words_file # user_patterns_file for (n, input_file) in enumerate(self.input_files): log.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file( self.workspace.download_file(input_file)) # TODO use binarized / gray pil_image = self.workspace.resolve_image_as_pil( pcgts.get_Page().imageFilename) tessapi.SetImage(pil_image) metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType( type_="processingStep", name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize'] ['steps'][0], value='ocrd-tesserocr-recognize', Labels=[ LabelsType(externalRef="parameters", Label=[ LabelType( type_=name, value=self.parameter[name]) for name in self.parameter.keys() ]) ])) log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId()) regions = pcgts.get_Page().get_TextRegion() if not regions: log.warning("Page contains no text regions") self._process_regions(regions, maxlevel, tessapi) ID = concat_padded(self.output_file_grp, n) self.workspace.add_file( ID=ID, file_grp=self.output_file_grp, mimetype=MIMETYPE_PAGE, local_filename='%s/%s' % (self.output_file_grp, ID), content=to_xml(pcgts), )
def process(self): try: page_grp, self.image_grp = self.output_file_grp.split(',') except ValueError: page_grp = self.output_file_grp self.image_grp = FALLBACK_IMAGE_GRP LOG.info( "No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP) if not torch.cuda.is_available(): LOG.error("Your system has no CUDA installed. No GPU detected.") sys.exit(1) path = self.parameter['pix2pixHD'] if not Path(path).is_dir(): LOG.error("""\ NVIDIA's pix2pixHD was not found at '%s'. Make sure the `pix2pixHD` parameter in ocrd-tools.json points to the local path to the cloned pix2pixHD repository. pix2pixHD can be downloaded from https://github.com/NVIDIA/pix2pixHD """ % path) sys.exit(1) model_file_path = os.path.join(path, 'checkpoints/latest_net_G.pth') if not Path(model_file_path).is_file(): LOG.error("""\ pix2pixHD model file was not found at '%s'. Make sure the this file exists. """ % model_file_path) sys.exit(1) opt, model = self.prepare_options(path) oplevel = self.parameter['operation_level'] for (n, input_file) in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %s", page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) metadata = pcgts.get_Metadata() metadata.add_MetadataItem( MetadataItemType( type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, Labels=[ LabelsType( #externalRef="parameters", Label=[ LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys() ]) ])) page = pcgts.get_Page() page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_filter='dewarped', feature_selector='binarized' ) # images should be deskewed and cropped if oplevel == 'page': dataset = self.prepare_data(opt, page_image, path) orig_img_size = page_image.size self._process_segment(model, dataset, page, page_xywh, page_id, input_file, orig_img_size, n) else: regions = page.get_TextRegion() + page.get_TableRegion( ) #get all 
regions? if not regions: LOG.warning("Page '%s' contains no text regions", page_id) for (k, region) in enumerate(regions): region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh) # TODO: not tested on regions # TODO: region has to exist as a physical file to be processed by pix2pixHD dataset = self.prepare_data(opt, region_image, path) orig_img_size = region_image.size self._process_segment(model, dataset, page, region_xywh, region.id, input_file, orig_img_size, n) # Use input_file's basename for the new file - # this way the files retain the same basenames: file_id = input_file.ID.replace(self.input_file_grp, page_grp) if file_id == input_file.ID: file_id = concat_padded(page_grp, n) self.workspace.add_file(ID=file_id, file_grp=page_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=os.path.join( page_grp, file_id + '.xml'), content=to_xml(pcgts).encode('utf-8'), force=self.parameter['force']) os.rmdir(self.input_file_grp + "/test_A/") #FIXME: better way of deleting a temp_dir?
def process(self): """Extract region images from the workspace. Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the region level. Extract an image for each region (which depending on the workflow can already be deskewed, dewarped, binarized etc.), cropped to its minimal bounding box, and masked by the coordinate polygon outline. If ``transparency`` is true, then also add an alpha channel which is fully transparent outside of the mask. Create a JSON file with: * the IDs of the region and its parents, * the region's coordinates relative to the region image, * the region's absolute coordinates, * the (text) region's text content (if any), * the (text) region's TextStyle (if any), * the (text) region's @production (if any), * the (text) region's @readingDirection (if any), * the (text) region's @textLineOrder (if any), * the (text) region's @primaryScript (if any), * the (text) region's @primaryLanguage (if any), * the region's AlternativeImage/@comments (features), * the region's element class, * the region's @type, * the page's @type, * the page's DPI value. Write all files in the directory of the output file group, named like so: * ID + '.raw.png': region image (if the workflow provides raw images) * ID + '.bin.png': region image (if the workflow provides binarized images) * ID + '.nrm.png': region image (if the workflow provides grayscale-normalized images) * ID + '.json': region metadata. 
""" # pylint: disable=attribute-defined-outside-init for n, input_file in enumerate(self.input_files): file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: file_id = concat_padded(self.output_file_grp, n) page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) page = pcgts.get_Page() metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, Labels=[LabelsType( externalModel="ocrd-tool", externalId="parameters", Label=[LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys()])])) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, transparency=self.parameter['transparency']) if page_image_info.resolution != 1: dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi = round(dpi * 2.54) else: dpi = None ptype = page.get_type() regions = { 'advert': page.get_AdvertRegion(), 'text': page.get_TextRegion(), 'table': page.get_TableRegion(), 'chart': page.get_ChartRegion(), 'chem': page.get_ChemRegion(), 'graphic': page.get_GraphicRegion(), 'image': page.get_ImageRegion(), 'linedrawing': page.get_LineDrawingRegion(), 'maths': page.get_MathsRegion(), 'music': page.get_MusicRegion(), 'noise': page.get_NoiseRegion(), 'separator': page.get_SeparatorRegion(), 'unknown': page.get_UnknownRegion() } for rtype, rlist in regions.items(): for region in rlist: description = { 'region.ID': region.id, 'region.type': rtype } region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, transparency=self.parameter['transparency']) description['subtype'] = region.get_type() if rtype in ['text', 'chart', 'graphic'] else None description['coords_rel'] = coordinates_of_segment( region, region_image, 
region_coords).tolist() description['coords_abs'] = polygon_from_points(region.get_Coords().points) if rtype == 'text': rtext = region.get_TextEquiv() if rtext: description['region.text'] = rtext[0].Unicode else: description['region.text'] = '' rstyle = region.get_TextStyle() or page.get_TextStyle() if rstyle: description['region.style'] = { 'fontFamily': rstyle.fontFamily, 'fontSize': rstyle.fontSize, 'xHeight': rstyle.xHeight, 'kerning': rstyle.kerning, 'serif': rstyle.serif, 'monospace': rstyle.monospace, 'bold': rstyle.bold, 'italic': rstyle.italic, 'smallCaps': rstyle.smallCaps, 'letterSpaced': rstyle.letterSpaced, 'strikethrough': rstyle.strikethrough, 'underlined': rstyle.underlined, 'underlineStyle': rstyle.underlineStyle, 'subscript': rstyle.subscript, 'superscript': rstyle.superscript } description['production'] = region.get_production() description['readingDirection'] = ( region.get_readingDirection() or page.get_readingDirection()) description['textLineOrder'] = ( region.get_textLineOrder() or page.get_textLineOrder()) description['primaryScript'] = ( region.get_primaryScript() or page.get_primaryScript()) description['primaryLanguage'] = ( region.get_primaryLanguage() or page.get_primaryLanguage()) description['features'] = region_coords['features'] description['DPI']= dpi description['page.ID'] = page_id description['page.type'] = ptype description['file_grp'] = self.input_file_grp description['METS.UID'] = self.workspace.mets.unique_identifier if 'binarized' in region_coords['features']: extension = '.bin' elif 'grayscale_normalized' in region_coords['features']: extension = '.nrm' else: extension = '.raw' file_path = self.workspace.save_image_file( region_image, file_id + '_' + region.id + extension, self.output_file_grp, page_id=page_id, format='PNG') file_path = file_path.replace(extension + '.png', '.json') json.dump(description, open(file_path, 'w'))
def test_concat_padded(self): self.assertEqual(concat_padded('x', 1), 'x_0001') self.assertEqual(concat_padded('x', 1, 2, 3), 'x_0001_0002_0003') self.assertEqual(concat_padded('x', 1, '2', 3), 'x_0001_2_0003')
def process(self): for (n, input_file) in enumerate(self.input_files): pcgts = page_from_file(self.workspace.download_file(input_file)) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID page = pcgts.get_Page() # why does it save the image ?? page_image, page_xywh, _ = self.workspace.image_from_page( page, page_id) if self.parameter['parallel'] < 2: LOG.info("INPUT FILE %s ", input_file.pageId or input_file.ID) raw = ocrolib.read_image_gray(page_image.filename) flat = raw #flat = np.array(binImg) # estimate skew angle and rotate if self.parameter['maxskew'] > 0: if self.parameter['parallel'] < 2: LOG.info("Estimating Skew Angle") d0, d1 = flat.shape o0, o1 = int(self.parameter['bignore'] * d0), int( self.parameter['bignore'] * d1) flat = amax(flat) - flat flat -= amin(flat) est = flat[o0:d0 - o0, o1:d1 - o1] ma = self.parameter['maxskew'] ms = int(2 * self.parameter['maxskew'] * self.parameter['skewsteps']) angle = self.estimate_skew_angle(est, linspace(-ma, ma, ms + 1)) flat = interpolation.rotate(flat, angle, mode='constant', reshape=0) flat = amax(flat) - flat else: angle = 0 # self.write_angles_to_pageXML(base,angle) # estimate low and high thresholds if self.parameter['parallel'] < 2: LOG.info("Estimating Thresholds") d0, d1 = flat.shape o0, o1 = int(self.parameter['bignore'] * d0), int( self.parameter['bignore'] * d1) est = flat[o0:d0 - o0, o1:d1 - o1] if self.parameter['escale'] > 0: # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable e = self.parameter['escale'] v = est - filters.gaussian_filter(est, e * 20.0) v = filters.gaussian_filter(v**2, e * 20.0)**0.5 v = (v > 0.3 * amax(v)) v = morphology.binary_dilation(v, structure=ones( (int(e * 50), 1))) v = morphology.binary_dilation(v, structure=ones( (1, int(e * 50)))) if self.parameter['debug'] > 0: imshow(v) ginput(1, self.parameter['debug']) est = est[v] lo = stats.scoreatpercentile(est.ravel(), 
self.parameter['lo']) hi = stats.scoreatpercentile(est.ravel(), self.parameter['hi']) # rescale the image to get the gray scale image if self.parameter['parallel'] < 2: LOG.info("Rescaling") flat -= lo flat /= (hi - lo) flat = clip(flat, 0, 1) if self.parameter['debug'] > 0: imshow(flat, vmin=0, vmax=1) ginput(1, self.parameter['debug']) deskewed = 1 * (flat > self.parameter['threshold']) # output the normalized grayscale and the thresholded images LOG.info("%s lo-hi (%.2f %.2f) angle %4.1f" % (pcgts.get_Page().imageFilename, lo, hi, angle)) if self.parameter['parallel'] < 2: LOG.info("Writing") #ocrolib.write_image_binary(base+".ds.png", deskewed) #TODO: Need some clarification as the results effect the following pre-processing steps. #orientation = -angle #orientation = 180 - ((180 - orientation) % 360) pcgts.get_Page().set_orientation(angle) #print(orientation, angle) file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file(ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=os.path.join( self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts).encode('utf-8'))
def process(self): """Performs segmentation evaluation with Shapely on the workspace. Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the region level. Return information on the plausibility of the segmentation into regions on the logging level. """ plausibilize = self.parameter['plausibilize'] for (n, input_file) in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType( type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this # what we want here is `externalModel="ocrd-tool" externalId="parameters"` Labels=[ LabelsType( #externalRef="parameters", Label=[ LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys() ]) ])) page = pcgts.get_Page() regions = page.get_TextRegion() mark_for_deletion = set() mark_for_merging = set() for i in range(0, len(regions)): for j in range(i + 1, len(regions)): LOG.info('Comparing regions "%s" and "%s"', regions[i].id, regions[j].id) region_poly1 = Polygon( polygon_from_points(regions[i].get_Coords().points)) region_poly2 = Polygon( polygon_from_points(regions[j].get_Coords().points)) LOG.debug('Checking for equality ...') equality = region_poly1.almost_equals(region_poly2) if equality: LOG.warn( 'Warning: regions %s and %s cover the same area.' 
% (regions[i].id, regions[j].id)) mark_for_deletion.add(j) LOG.debug('Checking for containment ...') containment_r = region_poly1.contains(region_poly2) containment_l = region_poly2.contains(region_poly1) if containment_r: LOG.warn('Warning: %s contains %s' % (regions[i].id, regions[j].id)) mark_for_deletion.add(j) if containment_l: LOG.warn('Warning: %s contains %s' % (regions[j].id, regions[i].id)) mark_for_deletion.add(i) if plausibilize: new_regions = [] for i in range(0, len(regions)): if not i in mark_for_deletion: new_regions.append(regions[i]) page.set_TextRegion(new_regions) #LOG.info('Intersection %i', region_poly1.intersects(region_poly2)) #LOG.info('Containment %i', region_poly1.contains(region_poly2)) #if region_poly1.intersects(region_poly2): # LOG.info('Area 1 %d', region_poly1.area) # LOG.info('Area 2 %d', region_poly2.area) # LOG.info('Area intersect %d', region_poly1.intersection(region_poly2).area) # Use input_file's basename for the new file - # this way the files retain the same basenames: file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file(ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=os.path.join( self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts))
def test_concat_padded(self): self.assertEqual(concat_padded('x', 0), 'x_0001') self.assertEqual(concat_padded('x', 0, 1, 2), 'x_0001_0002_0003') self.assertEqual(concat_padded('x', 0, '1', 2), 'x_0001_1_0003')
def process(self): """Rates textual annotation of PAGE input files, producing output files with LM scores (and choices). ... explain incremental page-wise processing here ... """ level = self.parameter['textequiv_level'] beam_width = self.parameter['beam_width'] lm_weight = self.parameter['lm_weight'] prev_traceback = None prev_pcgts = None prev_file_id = None prev_page_id = None for (n, input_file) in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) LOG.info("Scoring text in page '%s' at the %s level", pcgts.get_pcGtsId(), level) # annotate processing metadata: metadata = pcgts.get_Metadata() # ensured by page_from_file() metadata.add_MetadataItem( MetadataItemType( type_="processingStep", name=OCRD_TOOL['tools']['ocrd-keraslm-rate']['steps'][0], value='ocrd-keraslm-rate', Labels=[ LabelsType(externalRef="parameters", Label=[ LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys() ]) ])) # context preprocessing: # todo: as soon as we have true MODS meta-data in METS (dmdSec/mdWrap/xmlData/mods), # get global context variables from there (e.g. 
originInfo/dateIssued/@text for year) ident = self.workspace.mets.unique_identifier # at least try to get purl context = [0] if ident: name = ident.split('/')[-1] year = name.split('_')[-1] if year.isnumeric(): year = ceil(int(year) / 10) context = [year] # todo: author etc # create a graph for the linear sequence of elements at the given level: graph, start_node, end_node = page_get_linear_graph_at( level, pcgts) # apply language model to (TextEquiv path in) graph, # remove non-path TextEquivs, modify confidences: if not self.parameter['alternative_decoding']: text = [(edge['element'], edge['alternatives']) for edge in _get_edges(graph, 0)] # graph's path textstring = u''.join( textequivs[0].Unicode for element, textequivs in text) # same length as text LOG.info("Rating %d elements with a total of %d characters", len(text), len(textstring)) confidences = self.rater.rate(textstring, context) # much faster i = 0 for element, textequivs in text: textequiv = textequivs[0] # 1st choice only if element: element.set_TextEquiv([textequiv]) # delete others textequiv_len = len(textequiv.Unicode) conf = sum(confidences[i:i + textequiv_len] ) / textequiv_len # mean probability conf2 = textequiv.conf textequiv.set_conf(conf * lm_weight + conf2 * (1. - lm_weight)) i += textequiv_len if i != len(confidences): LOG.critical( "Input text length and output scores length are off by %d characters", i - len(confidences)) avg = sum(confidences) / len(confidences) ent = sum([-log(max(p, 1e-99), 2) for p in confidences]) / len(confidences) ppl = pow(2.0, ent) # character level ppll = pow( 2.0, ent * len(confidences) / len(text)) # textequiv level (including spaces/newlines) LOG.info("avg: %.3f, char ppl: %.3f, %s ppl: %.3f", avg, ppl, level, ppll) # character need not always equal glyph! 
# ensure parent textequivs are up to date: page_update_higher_textequiv_levels(level, pcgts) # write back result file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( ID=file_id, pageId=input_file.pageId, file_grp=self.output_file_grp, local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(pcgts), ) else: LOG.info("Rating %d elements including its alternatives", end_node - start_node) path, entropy, traceback = self.rater.rate_best( graph, start_node, end_node, start_traceback=prev_traceback, context=context, lm_weight=lm_weight, beam_width=beam_width, beam_clustering_dist=BEAM_CLUSTERING_DIST if BEAM_CLUSTERING_ENABLE else 0) if prev_pcgts: _page_update_from_path(level, path, entropy) # ensure parent textequivs are up to date: page_update_higher_textequiv_levels(level, prev_pcgts) # write back result file_id = prev_file_id.replace(self.input_file_grp, self.output_file_grp) if file_id == prev_file_id: file_id = concat_padded(self.output_file_grp, n - 1) self.workspace.add_file( ID=file_id, pageId=prev_page_id, file_grp=self.output_file_grp, local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(prev_pcgts), ) prev_page_id = input_file.pageId prev_file_id = input_file.ID prev_pcgts = pcgts prev_traceback = traceback if prev_pcgts: path, entropy, _ = self.rater.next_path(prev_traceback[0], ([], prev_traceback[1])) _page_update_from_path(level, path, entropy) # ensure parent textequivs are up to date: page_update_higher_textequiv_levels(level, prev_pcgts) # write back result file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( ID=file_id, pageId=input_file.pageId, file_grp=self.output_file_grp, 
local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(prev_pcgts), )