def test_copies_ok(self):
    """Run DummyProcessor twice on a workspace copy and check the files it adds.

    The first run copies the three images of ``OCR-D-IMG`` to ``OUTPUT`` and
    is expected to add one PAGE file per image; the second run processes
    ``OUTPUT`` into ``OUTPUT2``.
    """
    with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
        workspace = Workspace(Resolver(), wsdir)

        # Precondition: three input images and an empty output group.
        self.assertEqual(len(workspace.mets.find_files(fileGrp='OCR-D-IMG')), 3)
        self.assertEqual(len(workspace.mets.find_files(fileGrp='OUTPUT')), 0)

        run_processor(
            DummyProcessor,
            input_file_grp='OCR-D-IMG',
            output_file_grp='OUTPUT',
            workspace=workspace,
        )

        copied = workspace.mets.find_files(fileGrp='OUTPUT')
        copied.sort(key=lambda ocrd_file: ocrd_file.url)
        print([str(ocrd_file) for ocrd_file in copied])

        # After sorting by URL the image precedes its PAGE representation.
        image_file = copied[0]
        self.assertEqual(image_file.url, 'OUTPUT/OUTPUT_0001.tif')
        page_file = copied[1]
        self.assertEqual(page_file.url, 'OUTPUT/OUTPUT_0001.xml')
        self.assertEqual(page_from_file(page_file).pcGtsId, page_file.ID)
        self.assertEqual(
            page_from_file(page_file).get_Page().imageFilename,
            image_file.url,
        )

        self.assertEqual(len(copied), 6)
        self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*')), 6)
        self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*_PAGE')), 3)
        self.assertEqual(
            len(workspace.mets.find_files(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE)),
            3,
        )

        run_processor(
            DummyProcessor,
            input_file_grp='OUTPUT',
            output_file_grp='OUTPUT2',
            workspace=workspace,
        )

        second_pass = workspace.mets.find_files(fileGrp='OUTPUT2')
        second_pass.sort(key=lambda ocrd_file: ocrd_file.url)
        self.assertEqual(len(second_pass), 3)
def test_page_from_file_no_existe(self):
    """page_from_file must raise FileNotFoundError for a missing local file."""
    expected_message = "File not found: 'no-existe'"
    with self.assertRaisesRegex(FileNotFoundError, expected_message):
        empty_mets = OcrdMets.empty_mets()
        missing_file = empty_mets.add_file(
            'FOO',
            ID='foo',
            local_filename='no-existe',
            mimetype='foo/bar',
        )
        page_from_file(missing_file)
def test_rename_file_group(self):
    """Renaming a file group must move its files and update PAGE imageFilename."""
    data_dir = assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data')
    with copy_of_directory(data_dir) as tempdir:
        workspace = Workspace(self.resolver, directory=tempdir)

        def load_word_page():
            # Re-parse the PAGE file on every call so changes on disk are seen.
            word_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
            return page_from_file(word_file)

        # Relative paths below are resolved against the workspace directory.
        with pushd_popd(tempdir):
            pcgts_before = load_word_page()
            assert pcgts_before.get_Page().imageFilename == 'OCR-D-IMG/OCR-D-IMG_0001.tif'

            workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')

            pcgts_after = load_word_page()
            assert pcgts_after.get_Page().imageFilename == 'FOOBAR/OCR-D-IMG_0001.tif'
            assert Path('FOOBAR/OCR-D-IMG_0001.tif').exists()
            assert not Path('OCR-D-IMG/OCR-D-IMG_0001.tif').exists()
def process(self):
    """
    Performs the line segmentation.

    For every text region of every input PAGE file, the region image is
    handed to Tesseract, each detected text line component is added to the
    region as a ``TextLine`` (shifted by the region offset), and the
    annotated PAGE document is added to the output file group.
    """
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            image_url = page.imageFilename
            for region in page.get_TextRegion():
                log.debug("Detecting lines in %s with tesseract", region.id)
                region_points = region.get_Coords().points
                image = self.workspace.resolve_image_as_pil(
                    image_url, polygon_from_points(region_points))
                tessapi.SetImage(image)
                # Component boxes are relative to the region image, so they
                # are translated by the region's top-left corner.
                offset = xywh_from_points(region_points)
                components = tessapi.GetComponentImages(RIL.TEXTLINE, True)
                for line_no, component in enumerate(components):
                    line_xywh = component[1]
                    line_xywh['x'] += offset['x']
                    line_xywh['y'] += offset['y']
                    text_line = TextLineType(
                        id='%s_line%04d' % (region.id, line_no),
                        Coords=CoordsType(points_from_xywh(line_xywh)))
                    region.add_TextLine(text_line)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """Classify the layout of every input page and record the result in the METS.

    Exits the interpreter with status 1 when TensorFlow reports no GPU or
    when the file named by the ``model_path`` parameter does not exist.

    For each input PAGE file a ``processingStep`` metadata item listing all
    parameters is added, the binarized page image is resized to 500x600,
    divided by 255 and passed to ``self.start_test``; the result is written
    through ``self.create_logmap_smlink`` and ``self.write_to_mets``.
    """
    if not tf.test.is_gpu_available():
        LOG.error("Your system has no CUDA installed. No GPU detected.")
        sys.exit(1)

    model_path = Path(self.parameter['model_path'])
    class_mapper_path = Path(self.parameter['class_mapping_path'])
    if not model_path.is_file():
        LOG.error(
            "Layout Classfication model was not found at '%s'. "
            "Make sure the `model_path` parameter points to the local model path.\n"
            "model can be downloaded from http://url",
            model_path)
        sys.exit(1)

    LOG.info('Loading model from file %s', model_path)
    model = self.create_model(str(model_path))

    # Load the class-index mapping. The context manager closes the file even
    # when unpickling fails.
    # NOTE: pickle.load can execute arbitrary code; the mapping file must
    # come from a trusted source.
    with open(str(class_mapper_path), "rb") as pickle_in:
        class_indices = pickle.load(pickle_in)
    # Invert label -> index into index -> label.
    label_mapping = {index: label for label, index in class_indices.items()}

    for n, input_file in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        fname = pcgts.get_Page().imageFilename
        page_id = input_file.pageId or input_file.ID

        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        Label=[
                            LabelType(type_=name, value=self.parameter[name])
                            for name in self.parameter.keys()
                        ])
                ]))

        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", page_id)
        page_image, _, _ = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')

        # NOTE(review): Image.ANTIALIAS looks like the PIL alias that newer
        # Pillow releases removed in favour of LANCZOS - confirm against the
        # pinned Pillow version.
        img_array = ocrolib.pil2array(
            page_image.resize((500, 600), Image.ANTIALIAS))
        img_array = img_array * 1. / 255.
        # Add leading and trailing singleton axes around the 2-D image.
        img_array = img_array[np.newaxis, :, :, np.newaxis]

        results = self.start_test(model, img_array, fname, label_mapping)
        LOG.info(results)

        phys_id = "PHYS_000" + str(n)
        self.workspace.mets.set_physical_page_for_file(phys_id, input_file)
        self.create_logmap_smlink(pcgts)
        self.write_to_mets(results, phys_id)
def zip_input_files(self, ifgs):
    """Group the files of several input file groups page by page.

    Arguments:
        ifgs (list): names of the input file groups

    Returns:
        A list with one tuple per physical page, holding for each group in
        ``ifgs`` the matching file or ``None``. Pages without a file in the
        first group are left out.
    """
    mets = self.workspace.mets
    file_tuples = []
    for page_id in mets.physical_pages:
        page_files = []
        for ifg in ifgs:
            LOG.debug("adding input file group %s to page %s", ifg, page_id)
            matches = mets.find_files(pageId=page_id, fileGrp=ifg)
            if not matches:
                # fall back for missing pageId via Page imageFilename:
                for candidate in mets.find_files(fileGrp=ifg):
                    pcgts = page_from_file(
                        self.workspace.download_file(candidate))
                    image_url = pcgts.get_Page().get_imageFilename()
                    image_files = mets.find_files(url=image_url)
                    if image_files and image_files[0].pageId == page_id:
                        matches = [candidate]
                        break
            if matches:
                page_files.append(matches[0])
            else:
                # other fallback options?
                LOG.error('found no page %s in file group %s', page_id, ifg)
                page_files.append(None)
        if page_files[0]:
            file_tuples.append(tuple(page_files))
    return file_tuples
def process(self):
    """
    Performs the word segmentation.

    For every text line of every region in each input PAGE file, the line
    image is handed to Tesseract (single-line page segmentation mode), each
    detected word component is added to the line as a ``Word`` (shifted by
    the line offset), and the annotated PAGE document is added to the output
    file group.
    """
    with PyTessBaseAPI(psm=PSM.SINGLE_LINE, path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            image_url = page.imageFilename
            for region in page.get_TextRegion():
                for line in region.get_TextLine():
                    log.debug("Detecting words in line '%s'", line.id)
                    line_points = line.get_Coords().points
                    image = self.workspace.resolve_image_as_pil(
                        image_url, polygon_from_points(line_points))
                    tessapi.SetImage(image)
                    # Word boxes are relative to the line image.
                    offset = xywh_from_points(line_points)
                    components = tessapi.GetComponentImages(RIL.WORD, True)
                    for word_no, component in enumerate(components):
                        word_xywh = component[1]
                        word_xywh['x'] += offset['x']
                        word_xywh['y'] += offset['y']
                        word = WordType(
                            id='%s_word%04d' % (line.id, word_no),
                            Coords=CoordsType(points_from_xywh(word_xywh)))
                        line.add_Word(word)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """Performs segmentation evaluation with Shapely on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level.

    Compare region polygons with each other. The page of the first input
    file group is compared against the page of every other group.

    Raises:
        Exception: if fewer than two input file groups were given.
    """
    ifgs = self.input_file_grp.split(",")  # input file groups
    if len(ifgs) < 2:
        raise Exception("need multiple input file groups to compare")

    # get input files:
    ifts = self._zip_input_files(ifgs)  # input file tuples
    for ift in ifts:
        pages = []
        last_file = None
        for i, input_file in enumerate(ift):
            if not input_file:
                # file/page was not found in this group
                continue
            if not i:
                LOG.info("processing page %s", input_file.pageId)
            LOG.info("INPUT FILE for '%s': '%s'", ifgs[i], input_file.ID)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            pages.append(pcgts.get_Page())
            # Remember the last file that exists: the loop variable itself
            # may end up as None when the final group lacks this page.
            last_file = input_file
        if not pages:
            # Nothing could be loaded for this page in any group.
            continue
        gt_page = pages[0]
        for pred_page in pages[1:]:
            self._compare_segmentation(gt_page, pred_page, last_file.pageId)
def validate(filename=None, ocrd_page=None, ocrd_file=None, strictness='strict', strategy='index1'):
    """
    Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

    The inputs are tried in the order ``ocrd_page``, ``ocrd_file``,
    ``filename``; the first one that is set wins.

    Arguments:
        filename (string): Path to PAGE
        ocrd_page (OcrdPage): OcrdPage instance
        ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
        strictness (string): 'strict', 'lax', 'fix' or 'off'
        strategy (string): Currently only 'index1'

    Returns:
        report (:class:`ValidationReport`) Report on the validity

    Raises:
        Exception: if none of the three inputs is set.
    """
    if ocrd_page:
        page = ocrd_page
    elif ocrd_file:
        page = page_from_file(ocrd_file)
    elif filename:
        page = parse(filename, silence=True)
    else:
        raise Exception(
            "At least one of ocrd_page, ocrd_file or filename must be set")
    validator = PageValidator(page, strictness, strategy)
    return validator._validate()  # pylint: disable=protected-access
def process(self):
    """Run the page-level segment processing on every input file.

    ``output_file_grp`` may name two comma-separated groups (PAGE, images);
    otherwise the image group falls back to ``FALLBACK_IMAGE_GRP``. For each
    input file a ``processingStep`` metadata item is added, the page image
    without the ``deskewed`` feature is fetched and handed to
    ``self._process_segment``, and the PAGE result is added to the output
    group. Any operation level other than ``"page"`` stops the run.
    """
    groups = self.output_file_grp.split(',')
    if len(groups) == 2:
        self.page_grp, self.image_grp = groups
    else:
        self.page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)

    oplevel = self.parameter['operation_level']

    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)

        pcgts = page_from_file(self.workspace.download_file(input_file))
        metadata = pcgts.get_Metadata()
        parameter_labels = [
            LabelType(type_=name, value=self.parameter[name])
            for name in self.parameter.keys()
        ]
        processing_step = MetadataItemType(
            type_="processingStep",
            name=self.ocrd_tool['steps'][0],
            value=TOOL,
            Labels=[LabelsType(Label=parameter_labels)])
        metadata.add_MetadataItem(processing_step)

        page = pcgts.get_Page()
        angle = page.get_orientation()
        if angle:
            LOG.warning('Overwriting existing deskewing angle: %i', angle)
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_filter='deskewed')

        if oplevel != "page":
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            break
        self._process_segment(page_image, page, page_xywh, page_id, input_file, n)

        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """
    Performs the binarization.

    Every text line of every region in each input PAGE file is cut out of
    the page image, binarized with ``kraken.binarization.nlbin`` and added
    to the output file group as a PNG image.
    """
    for n, input_file in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        image_url = page.imageFilename
        log.info("pcgts %s", pcgts)
        for region in page.get_TextRegion():
            textlines = region.get_TextLine()
            log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
            for line_no, line in enumerate(textlines):
                log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
                image = self.workspace.resolve_image_as_pil(
                    image_url, polygon_from_points(line.get_Coords().points))
                bin_image = kraken.binarization.nlbin(image)
                bin_image_bytes = io.BytesIO()
                bin_image.save(bin_image_bytes, format='PNG')
                # Include region and line in the ID: an ID built from the
                # page index alone would be shared by all lines of a page.
                ID = concat_padded(self.output_file_grp, n, region.id, line_no)
                self.workspace.add_file(
                    self.output_file_grp,
                    pageId=input_file.pageId,
                    ID=ID,
                    basename="%s.bin.png" % ID,
                    mimetype='image/png',
                    content=bin_image_bytes.getvalue())
def _resolve_image_file(self, input_file: OcrdFile) -> str:
    """Return the image file name that belongs to ``input_file``.

    For a PAGE file this is the ``imageFilename`` of the (downloaded and
    parsed) document; for any other file it is its own ``local_filename``.
    """
    if input_file.mimetype != MIMETYPE_PAGE:
        return input_file.local_filename
    pcgts = page_from_file(self.workspace.download_file(input_file))
    return pcgts.get_Page().imageFilename
def test_page_from_file(self):
    """page_from_file must wrap an image file in a PAGE document with its ID and width."""
    image_file = create_ocrd_file_with_defaults(
        mimetype='image/tiff',
        local_filename=SAMPLE_IMG,
        ID='file1',
    )
    self.assertEqual(image_file.mimetype, 'image/tiff')
    pcgts = page_from_file(image_file)
    self.assertEqual(pcgts.pcGtsId, image_file.ID)
    self.assertEqual(pcgts.get_Page().imageWidth, 1457)
def process(self):
    """
    Performs the cropping.

    For each input PAGE file the page image is handed to Tesseract, the
    bounding box around all detected blocks is computed and stored as the
    page ``Border``, and the annotated PAGE document is added to the output
    file group.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
            log.debug("Cropping with tesseract")
            tessapi.SetImage(image)

            # Extent of all boxes seen so far. It starts "inverted" (minimum
            # at the far corner, maximum at the origin) so that the first
            # point replaces every value.
            min_x = image.width
            min_y = image.height
            max_x = 0
            max_y = 0

            # iterate over all boxes and compare their extent
            # to the min and max values
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                points, index = points_from_xywh(component[1]), component[2]
                # the region reference in the reading order element
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                for pair in points.split(' '):
                    coords = pair.split(',')
                    x, y = int(coords[0]), int(coords[1])
                    # All four bounds are updated independently; chaining
                    # them with elif would skip the maxima whenever a
                    # minimum changed.
                    min_x = min(min_x, x)
                    min_y = min(min_y, y)
                    max_x = max(max_x, x)
                    max_y = max(max_y, y)
                log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i",
                          min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)

            # set the identified page border
            border_points = "%i,%i %i,%i %i,%i %i,%i" % (
                min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)
            brd = BorderType(Coords=CoordsType(border_points))
            pcgts.get_Page().set_Border(brd)

            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """
    Segment with ocropy

    Each page image is read as a binary image, inverted and segmented into
    lines; the lines are added as ``TextLine`` elements to a single
    page-sized region with the id ``dummy``.
    """
    for n, input_file in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        downloaded_file = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", downloaded_file)
        pcgts = page_from_file(downloaded_file)
        page = pcgts.get_Page()
        page_width = page.get_imageWidth()
        page_height = page.get_imageHeight()
        # TODO binarized variant from get_AlternativeImage()
        image_url = page.imageFilename
        log.info("pcgts %s", pcgts)

        binary = ocrolib.read_image_binary(
            self.workspace.download_url(image_url))
        binary = 1 - binary

        # A scale parameter of 0 means: estimate it from the image.
        if self.parameter['scale'] != 0:
            scale = self.parameter['scale']
        else:
            scale = psegutils.estimate_scale(binary)
        log.debug(binary)

        pseg = self.compute_segmentation(binary, scale)
        log.debug("pseg=%s", pseg)

        # TODO reading order / enumber
        regions = ocrolib.RegionExtractor()
        regions.setPageLines(pseg)

        # One region spanning the whole page holds all detected lines.
        page_outline = "0,0 %s,0 %s,%s 0,%s" % (
            page_width, page_width, page_height, page_height)
        dummyRegion = TextRegionType(
            id="dummy", Coords=CoordsType(points=page_outline))
        page.add_TextRegion(dummyRegion)

        for lineno in range(1, regions.length()):
            log.debug("id=%s bbox=%s", regions.id(lineno), regions.bbox(lineno))
            line_points = points_from_y0x0y1x1(regions.bbox(lineno))
            dummyRegion.add_TextLine(
                TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(points=line_points)))

        ID = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            ID=ID,
            file_grp=self.output_file_grp,
            mimetype=MIMETYPE_PAGE,
            local_filename="%s/%s.xml" % (self.output_file_grp, ID),
            content=to_xml(pcgts))
def process(self):
    """Classify the typegroups of every page and annotate the result in PAGE.

    The classifier named by the ``network`` parameter is run on each page
    image with the ``stride`` parameter. Scores of the classes listed in
    ``ignore_type`` count as noise. If noise wins, the primary script of the
    page is unset; otherwise the normalised script scores are written as a
    comma-separated ``name:percent`` list into the page's TextStyle
    ``fontFamily``. The PAGE document is then added to the output group.
    """
    network_file = self.parameter['network']
    stride = self.parameter['stride']
    classifier = TypegroupsClassifier.load(network_file)

    ignore_type = ('Adornment', 'Book covers and other irrelevant data',
                   'Empty Pages', 'Woodcuts - Engravings')

    self.log.debug('Processing: %s', self.input_files)
    for input_file in self.input_files:
        pcgts = page_from_file(self.workspace.download_file(input_file))
        image_url = pcgts.get_Page().imageFilename
        pil_image = self.workspace.resolve_image_as_pil(image_url)
        result = classifier.run(pil_image, stride)

        # Sum of the non-negative script scores, used for normalisation.
        score_sum = 0
        for typegroup in classifier.classMap.cl2id:
            if typegroup not in ignore_type:
                score_sum += max(0, result[typegroup])

        script_highscore = 0
        noise_highscore = 0
        # (normalised score, typegroup) pairs. A list keeps typegroups that
        # share the same score, which a score-keyed dict would overwrite.
        scored_typegroups = []
        for typegroup in classifier.classMap.cl2id:
            score = result[typegroup]
            if typegroup in ignore_type:
                noise_highscore = max(noise_highscore, score)
            else:
                script_highscore = max(script_highscore, score)
                # Without any positive script score there is nothing to
                # normalise against; avoid dividing by zero.
                if score_sum > 0:
                    normalised_score = max(0, score / score_sum)
                else:
                    normalised_score = 0
                scored_typegroups.append((normalised_score, typegroup))

        if noise_highscore > script_highscore:
            pcgts.get_Page().set_primaryScript(None)
            self.log.debug(
                'Detected only noise (such as empty page or book cover). noise_highscore=%s > script_highscore=%s',
                noise_highscore, script_highscore)
        else:
            ranked = sorted(scored_typegroups, key=lambda pair: pair[0], reverse=True)
            entries = []
            for normalised_score, typegroup in ranked:
                intk = round(100 * normalised_score)
                if intk <= 0:
                    continue
                entries.append('%s:%d' % (typegroup, intk))
            output = ', '.join(entries)
            self.log.debug('Detected %s', output)
            page = pcgts.get_Page()
            textStyle = page.get_TextStyle()
            if not textStyle:
                textStyle = TextStyleType()
                page.set_TextStyle(textStyle)
            textStyle.set_fontFamily(output)

        ID = concat_padded(self.output_file_grp, input_file.ID)
        self.workspace.add_file(
            ID=ID,
            file_grp=self.output_file_grp,
            mimetype=MIMETYPE_PAGE,
            local_filename="%s/%s" % (self.output_file_grp, ID),
            content=to_xml(pcgts))
def process(self):
    """
    Performs the binarization.

    Depending on the ``level-of-operation`` parameter, either the whole page
    (``page``), every text region (``block``) or every text line (any other
    value) of each input PAGE file is binarized with
    ``kraken.binarization.nlbin`` and added to the output file group as a
    PNG image.
    """
    log = getLogger('processor.KrakenBinarize')
    level = self.parameter['level-of-operation']
    log.debug('Level of operation: "%s"', level)
    log.debug('Input file group %s', self.input_file_grp)
    log.debug('Input files %s', [str(f) for f in self.input_files])

    def binarize_and_add(image, file_id, page_id):
        # Binarize one image and store the PNG under ``file_id``.
        bin_image = kraken.binarization.nlbin(image)
        bin_image_bytes = io.BytesIO()
        bin_image.save(bin_image_bytes, format='PNG')
        self.workspace.add_file(
            self.output_file_grp,
            pageId=page_id,
            ID=file_id,
            mimetype='image/png',
            local_filename="%s/%s" % (self.output_file_grp, file_id),
            content=bin_image_bytes.getvalue())

    for n, input_file in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        image_url = pcgts.get_Page().imageFilename
        log.info("pcgts %s", pcgts)

        if level == 'page':
            log.info("About to binarize page '%s'", pcgts.pcGtsId)
            image = self.workspace.resolve_image_as_pil(image_url)
            binarize_and_add(
                image,
                concat_padded(self.output_file_grp, n),
                input_file.pageId)
            continue

        for region in pcgts.get_Page().get_TextRegion():
            if level == 'block':
                log.info("About to binarize region '%s'", region.id)
                image = self.workspace.resolve_image_as_pil(
                    image_url, polygon_from_points(region.get_Coords().points))
                # The region image has to be binarized and stored as well;
                # resolving it alone produces no output for this level.
                binarize_and_add(
                    image,
                    concat_padded(self.output_file_grp, n, region.id),
                    input_file.pageId)
                continue

            textlines = region.get_TextLine()
            log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
            for line_no, line in enumerate(textlines):
                log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
                image = self.workspace.resolve_image_as_pil(
                    image_url, polygon_from_points(line.get_Coords().points))
                binarize_and_add(
                    image,
                    concat_padded(self.output_file_grp, n, region.id, line_no),
                    input_file.pageId)
def test_page_from_file(self):
    """page_from_file must wrap an image OcrdFile in a PAGE document with its ID and width."""
    image_file = OcrdFile(
        None,
        mimetype='image/tiff',
        local_filename=SAMPLE_IMG,
        ID='file1',
    )
    self.assertEqual(image_file.mimetype, 'image/tiff')
    pcgts = page_from_file(image_file)
    self.assertEqual(pcgts.pcGtsId, image_file.ID)
    self.assertEqual(pcgts.get_Page().imageWidth, 1457)
def process(self):
    """
    Prepares the line images for (text) recognition.

    Every text line of every region in each input PAGE file is cropped from
    the page image by its bounding box, passed through ``binarize`` and
    ``resize_keep_ratio`` and saved as PNG under the ``linesdir`` parameter.
    """
    linesdir = self.parameter['linesdir']
    for input_file in self.input_files:
        pcgts = page_from_file(self.workspace.download_file(input_file))
        pil_image = self.workspace.resolve_image_as_pil(
            pcgts.get_Page().imageFilename)

        self.log.info("Preparing page '%s'", pcgts.get_pcGtsId())
        page = pcgts.get_Page()

        # region, line, word, or glyph level:
        regions = page.get_TextRegion()
        if not regions:
            self.log.warning("Page contains no text regions")

        for region in regions:
            self.log.info("Preparing region '%s'", region.id)
            textlines = region.get_TextLine()
            if not textlines:
                self.log.warning("Region '%s' contains no text lines", region.id)
                continue
            for line in textlines:
                self.log.info("Cutting line '%s'", line.id)

                # crop the line from the page by its bounding box
                box = bounding_box(line.get_Coords().points)
                croped_image = pil_image.crop(box=box)
                bin_image = binarize(croped_image)
                final_img = resize_keep_ratio(bin_image)

                # URL tail from the last '/' (inclusive) without the last
                # four characters; the leading '/' joins it to linesdir.
                index = input_file.url.rfind('/')
                fgrp = input_file.url[index:-4]
                suffix = '%s-%s-%s.png' % (fgrp, region.id, line.id)
                imgpath = linesdir + suffix

                if not os.path.exists(linesdir):
                    os.makedirs(linesdir)
                final_img.save(imgpath)
def process(self):
    """
    Performs the (text) recognition.

    The names of the prediction files found directly in ``linesdir`` are
    collected and passed, together with the text regions of each page, to
    ``self.process_regions``; the resulting PAGE document is added to the
    output file group.

    Raises:
        Exception: if ``textequiv_level`` is not 'line', 'word' or 'glyph'.
    """
    self.maxlevel = self.parameter['textequiv_level']
    linesdir = self.parameter['linesdir']

    if self.maxlevel not in ('line', 'word', 'glyph'):
        raise Exception(
            "currently only implemented at the line/glyph level")

    # Only the top level of linesdir is inspected.
    root, _, files = next(os.walk(linesdir))
    self.root = root
    # Drop the last nine characters of every name containing '.pred'.
    predfiles = [name[:-9] for name in files if '.pred' in name]

    for n, input_file in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))

        self.log.info("Processing text in page '%s'", pcgts.get_pcGtsId())
        page = pcgts.get_Page()

        # URL tail after the last '/' without the last four characters.
        start = input_file.url.rfind('/') + 1
        fgrp = input_file.url[start:-4]

        # region, line, word, or glyph level:
        regions = page.get_TextRegion()
        if not regions:
            self.log.warning("Page contains no text regions")

        self.process_regions(regions, predfiles, fgrp)

        ID = concat_padded(self.output_file_grp, n)
        self.log.info('creating file id: %s, name: %s, file_grp: %s',
                      ID, input_file.basename, self.output_file_grp)

        # Use the input file's basename for the new file
        # this way the files retain the same basenames.
        out = self.workspace.add_file(
            ID=ID,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            basename=self.output_file_grp + '-' + input_file.basename,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
        )
        self.log.info('created file %s', out)
def process(self):
    """Segment pages into regions using a Mask R-CNN model.

    For each input PAGE file the page image (without the binarized,
    deskewed, cropped, clipped and non_text features) is fetched, an
    optional ``clipped`` mask image is looked up, the pixel density is
    derived from the image metadata, and everything is handed to
    ``self._process_segment``. The PAGE result is added to the output group.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    LOG = getLogger('processor.AnybaseocrBlockSegmenter')
    if not tf.test.is_gpu_available():
        LOG.warning(
            "Tensorflow cannot detect CUDA installation. Running without GPU will be slow."
        )

    for input_file in self.input_files:
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        page_id = input_file.pageId or input_file.ID

        # todo rs: why not cropped?
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='binarized,deskewed,cropped,clipped,non_text')

        # try to load pixel masks
        try:
            # todo rs: this combination only works for tiseg with use_deeplr=true
            mask_image, _, _ = self.workspace.image_from_page(
                page, page_id,
                feature_selector='clipped',
                feature_filter='binarized,deskewed,cropped,non_text')
        except Exception as err:
            # Best effort: continue without a mask. Catching Exception rather
            # than everything lets KeyboardInterrupt and SystemExit through.
            LOG.debug("No mask image available for page '%s': %s", page_id, err)
            mask_image = None

        # A resolution of 1 is treated as "unknown".
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None

        self._process_segment(page_image, page, page_xywh, page_id,
                              input_file, mask_image, dpi)

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Run the page-level segment processing on every input file.

    ``output_file_grp`` may name two comma-separated groups (PAGE, images);
    otherwise the image group falls back to ``FALLBACK_IMAGE_GRP``. For each
    input file a ``processingStep`` metadata item is added and, at operation
    level ``"page"``, the page image file name is handed to
    ``self._process_segment``. The PAGE result is always added to the output
    group.
    """
    groups = self.output_file_grp.split(',')
    if len(groups) == 2:
        self.page_grp, self.image_grp = groups
    else:
        self.page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)

    oplevel = self.parameter['operation_level']

    for n, input_file in enumerate(self.input_files):
        # ID for the derived image, placed in the image group.
        image_file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)

        pcgts = page_from_file(self.workspace.download_file(input_file))
        metadata = pcgts.get_Metadata()
        parameter_labels = [
            LabelType(type_=name, value=self.parameter[name])
            for name in self.parameter.keys()
        ]
        processing_step = MetadataItemType(
            type_="processingStep",
            name=self.ocrd_tool['steps'][0],
            value=TOOL,
            Labels=[LabelsType(Label=parameter_labels)])
        metadata.add_MetadataItem(processing_step)

        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)

        if oplevel == "page":
            self._process_segment(page, page_image.filename, page_id,
                                  image_file_id + ".ds")

        # Derive the PAGE file ID from the input ID where possible.
        file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8'))
def process(self): for (n, input_file) in enumerate(self.input_files): pcgts = page_from_file(self.workspace.download_file(input_file)) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID page = pcgts.get_Page() LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID) page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id) # image_coords = pcgts.get_Page().get_Border().get_Coords().points.split() # why does it return Image type when there is data (border info from crop) print("----------", type(page_image), page_xywh) # I: binarized-input-image; imftext: output-text-portion.png; imfimage: output-image-portion.png '''
def test_rename_file_group(tmp_path):
    """Renaming a file group must move and rename its files and update all references."""
    # arrange
    copytree(
        assets.path_to(
            'kant_aufklaerung_1784-page-region-line-word_glyph/data'),
        str(tmp_path))
    workspace = Workspace(Resolver(), directory=str(tmp_path))

    # before act
    # TODO clear semantics
    # requires rather odd additional path-setting because root path from
    # workspace is not propagated - works only if called inside workspace
    # which can be achieved with pushd_popd functionalities
    ocrd_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
    relative_name = ocrd_file.local_filename
    ocrd_file.local_filename = join(tmp_path, relative_name)
    pcgts_before = page_from_file(ocrd_file)

    # before assert
    assert pcgts_before.get_Page().imageFilename == 'OCR-D-IMG/OCR-D-IMG_0001.tif'

    # act
    workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')
    next_ocrd_file = next(
        workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
    next_ocrd_file.local_filename = join(tmp_path, relative_name)
    pcgts_after = page_from_file(next_ocrd_file)

    # assert
    assert pcgts_after.get_Page().imageFilename == 'FOOBAR/FOOBAR_0001.tif'
    assert (tmp_path / 'FOOBAR/FOOBAR_0001.tif').exists()
    # Check inside the workspace: a path relative to the current working
    # directory would never exist and make this assertion vacuous.
    assert not (tmp_path / 'OCR-D-IMG/OCR-D-IMG_0001.tif').exists()
    assert workspace.mets.get_physical_pages(
        for_fileIds=['OCR-D-IMG_0001']) == [None]
    assert workspace.mets.get_physical_pages(
        for_fileIds=['FOOBAR_0001']) == ['phys_0001']
def validate(filename=None, ocrd_page=None, ocrd_file=None, page_textequiv_consistency='strict', page_textequiv_strategy='first', check_baseline=True, check_coords=True):
    """
    Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

    The inputs are tried in the order ``ocrd_page``, ``ocrd_file``,
    ``filename``; the first one that is set wins.

    Arguments:
        filename (string): Path to PAGE
        ocrd_page (OcrdPage): OcrdPage instance
        ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
        page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
        page_textequiv_strategy (string): Currently only 'first'
        check_baseline (bool): whether Baseline must be fully within TextLine/Coords
        check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
                             contained within Border/*Region/TextLine/Word, resp.

    Returns:
        report (:class:`ValidationReport`) Report on the validity

    Raises:
        Exception: if none of the three inputs is set, or if the strategy or
            the consistency level is not one of the supported values.
    """
    log = getLogger('ocrd.page_validator.validate')
    if ocrd_page:
        page = ocrd_page
        file_id = ocrd_page.get_pcGtsId()
    elif ocrd_file:
        page = page_from_file(ocrd_file)
        file_id = ocrd_file.ID
    elif filename:
        page = parse(filename, silence=True)
        file_id = filename
    else:
        raise Exception(
            "At least one of ocrd_page, ocrd_file or filename must be set")
    # The trailing comma makes this a one-element tuple; a bare ('first')
    # is a string and would accept every substring of it.
    if page_textequiv_strategy not in ('first',):
        raise Exception("page_textequiv_strategy %s not implemented" %
                        page_textequiv_strategy)
    if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
        raise Exception(
            "page_textequiv_consistency level %s not implemented" %
            page_textequiv_consistency)
    report = ValidationReport()
    log.info("Validating input file '%s'", file_id)
    validate_consistency(page, page_textequiv_consistency,
                         page_textequiv_strategy, check_baseline,
                         check_coords, report, file_id)
    return report
def _validate_dimension(self):
    """
    Validate image height and PAGE imageHeight match

    The same check is made for the width. PAGE files with a non-local URL
    are skipped with a notice unless downloading is enabled.
    """
    self.log.info('_validate_dimension')
    for f in self.mets.find_files(mimetype=MIMETYPE_PAGE):
        if not (is_local_filename(f.url) or self.download):
            self.report.add_notice("_validate_dimension: Not executed because --download wasn't set and PAGE might reference remote (Alternative)Images <%s>" % f.url)
            continue
        page = page_from_file(f).get_Page()
        _, _, exif = self.workspace.image_from_page(page, f.pageId)
        checks = (
            ('Height', 'height', page.imageHeight, exif.height),
            ('Width', 'width', page.imageWidth, exif.width),
        )
        for attribute, dimension, declared, actual in checks:
            if declared != actual:
                self.report.add_error("PAGE '%s': @image%s != image's actual %s (%s != %s)" % (f.ID, attribute, dimension, declared, actual))
def _validate_page(self):
    """
    Run PageValidator on the PAGE-XML documents referenced in the METS.

    In addition, a warning is added when the ``pcGtsId`` of a document
    differs from the ID of its METS file entry.
    """
    self.log.debug('_validate_page')
    consistency = self.page_coordinate_consistency
    for ocrd_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
        self.workspace.download_file(ocrd_file)
        page_report = PageValidator.validate(
            ocrd_file=ocrd_file,
            page_textequiv_consistency=self.page_strictness,
            check_coords=consistency in ['poly', 'both'],
            check_baseline=consistency in ['baseline', 'both'])
        pcgts = page_from_file(ocrd_file)
        if pcgts.pcGtsId != ocrd_file.ID:
            page_report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pcgts.pcGtsId or '', ocrd_file.ID or ''))
        self.report.merge_report(page_report)
def process(self):
    """
    Performs the recognition.

    Every text line image of every region is passed to the predictor; the
    per-fold results are combined by the voter and the voted sentence and
    average character probability are stored as the line's ``TextEquiv``.
    Higher levels are then updated from the lines and the PAGE document is
    added to the output file group.
    """
    self._init_calamari()

    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))

        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id)

        for region in pcgts.get_Page().get_TextRegion():
            region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh)

            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
            for line_no, line in enumerate(textlines):
                log.debug("Recognizing line '%s' in region '%s'", line_no, region.id)

                line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh)
                line_image_np = np.array(line_image, dtype=np.uint8)

                # One batch with a single line; take its result.
                batch_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))
                raw_results = batch_results[0]
                for fold_no, fold_result in enumerate(raw_results):
                    fold_result.prediction.id = "fold_{}".format(fold_no)

                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                text_equiv = TextEquivType(
                    Unicode=prediction.sentence,
                    conf=prediction.avg_char_probability)
                line.set_TextEquiv([text_equiv])

        _page_update_higher_textequiv_levels('line', pcgts)

        file_id = self._make_file_id(input_file, n)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts))
def process(self):
    """
    Copy every input file into the output file group.

    Each input file is downloaded once, a PAGE representation is obtained
    through ``page_from_file`` and given the new file ID and processor
    metadata. Then:

    * if the input is PAGE-XML, the in-memory PAGE document is serialized
      and added to the output file group;
    * otherwise the input bytes are copied verbatim, and for ``image/*``
      inputs an additional PAGE-XML file (ID suffixed with ``_PAGE``) is
      added whose ``imageFilename`` points at the copied image.

    Both the input and the output file group must name exactly one group.
    """
    LOG = getLogger('ocrd.dummy')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    for input_file in self.input_files:
        input_file = self.workspace.download_file(input_file)
        file_id = make_file_id(input_file, self.output_file_grp)
        ext = MIME_TO_EXT.get(input_file.mimetype, '')
        local_filename = join(self.output_file_grp, file_id + ext)

        # input_file is already the result of download_file above, so it is
        # passed on directly instead of being downloaded a second time.
        pcgts = page_from_file(input_file)
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)

        if input_file.mimetype == MIMETYPE_PAGE:
            # Source file is PAGE-XML: Write out in-memory PcGtsType
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=input_file.mimetype,
                local_filename=local_filename,
                content=to_xml(pcgts).encode('utf-8'))
            continue

        # Source file is not PAGE-XML: Copy byte-by-byte
        with open(input_file.local_filename, 'rb') as f:
            content = f.read()
        # The source handle is closed before the copy is registered.
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=input_file.mimetype,
            local_filename=local_filename,
            content=content)

        if input_file.mimetype.startswith('image/'):
            # write out the PAGE-XML representation for this image
            page_file_id = file_id + '_PAGE'
            pcgts.set_pcGtsId(page_file_id)
            pcgts.get_Page().set_imageFilename(local_filename)
            page_filename = join(self.output_file_grp, file_id + '.xml')
            LOG.info("Add PAGE-XML %s generated for %s at %s",
                     page_file_id, file_id, page_filename)
            self.workspace.add_file(
                ID=page_file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=page_filename,
                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """
    Run ``_process_segment`` on every input page and store the result.

    The ``operation_level`` parameter selects the granularity: with
    ``'page'`` the whole page image is processed (a warning notes that the
    region level is the expected one); otherwise every text region of the
    page is processed separately. Pages without text regions are skipped
    with a warning and produce no output file. The page image is requested
    with the ``binarized,deskewed`` feature selector. The resulting PAGE
    document is added to the output file group.

    Both the input and the output file group must name exactly one group.
    """
    LOG = getLogger('OcrdAnybaseocrTextline')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']

    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        downloaded = self.workspace.download_file(input_file)
        pcgts = page_from_file(downloaded)
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", page_id)

        page_image, page_xywh, _ = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized,deskewed')

        if oplevel != 'page':
            regions = page.get_TextRegion()
            if not regions:
                # Nothing to process; no output file is written for this page.
                LOG.warning("Page '%s' contains no text regions", page_id)
                continue
            for k, region in enumerate(regions):
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                self._process_segment(
                    region_image, page, region, region_xywh, region.id, input_file, k)
        else:
            LOG.warning("Operation level should be region.")
            self._process_segment(
                page_image, page, None, page_xywh, page_id, input_file, n)

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        output_path = os.path.join(self.output_file_grp, file_id + '.xml')
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=output_path,
            content=to_xml(pcgts).encode('utf-8'))