def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple; the first element
                                is the storage tuple of the TEI segmentation,
                                the second the storage tuple of the image.
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        NidabaInvalidParameterException: The model identifier is defined in
            neither the kraken nor the ocropus section of the configuration.
    """
    input_path = storage.get_abs_path(*doc[1])
    output_path = (doc[1][0],
                   os.path.splitext(storage.insert_suffix(doc[1][1], method,
                                                          model))[0] + '.xml')
    logger.debug('Searching for model {}'.format(model))
    # resolve the model identifier against both recognizer configurations;
    # kraken can load ocropus pyrnn models as well
    if model in nidaba_cfg['kraken_models']:
        model = storage.get_abs_path(*(nidaba_cfg['kraken_models'][model]))
    elif model in nidaba_cfg['ocropus_models']:
        model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    else:
        raise NidabaInvalidParameterException('Model not defined in '
                                              'configuration')
    img = Image.open(input_path)
    # the segmentation TEI is the first element of the document tuple
    logger.debug('Reading TEI segmentation from {}'.format(doc[0]))
    tei = OCRRecord()
    with storage.StorageFile(*doc[0]) as seg:
        tei.load_tei(seg)
    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer: any finer-grained boxes in the input
    # segmentation are stale and rebuilt below from kraken's output
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines
    logger.debug('Loading model {}'.format(model))
    rnn = models.load_any(model)
    logger.debug('Start recognizing characters')
    for line_id, rec in zip(lines, rpred.rpred(rnn, img,
                                               [x['bbox'] for x in
                                                lines.itervalues()])):
        # scope the current line and add all graphemes recognized by kraken
        # to it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        # split the prediction into alternating non-whitespace/whitespace
        # runs: even indices are word segments, odd indices the separators
        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(segment)])
                logger.debug('Creating new segment at '
                             '{} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:line_offset + len(segment)]))
                # rec yields (grapheme, bbox, confidence) triples; rescale
                # confidence from [0, 1] to integer percent
                tei.add_graphemes([{'grapheme': x[0],
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)}
                                   for x in rec[line_offset:line_offset +
                                                len(segment)]])
                line_offset += len(segment)
            # trailing segment has no whitespace partner (izip_longest pads
            # with None)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:line_offset +
                                   len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{'grapheme': x[0],
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)}
                                   for x in rec[line_offset:line_offset +
                                                len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple (storage tuple of
                                the TEI segmentation)
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        NidabaInvalidParameterException: The model could not be loaded or
            the input image is not bitonal.
    """
    output_path = (
        doc[0],
        os.path.splitext(storage.insert_suffix(doc[1], method, model))[0] +
        '.xml')
    logger.debug('Loading model {}'.format(model))
    try:
        rnn = models.load_any(mod_db[model])
    except Exception as e:
        raise NidabaInvalidParameterException(str(e))
    logger.debug('Reading TEI segmentation from {}'.format(doc))
    tei = OCRRecord()
    with storage.StorageFile(*doc) as seg:
        tei.load_tei(seg)
    # the image location is recorded in the segmentation TEI itself
    img = Image.open(
        storage.get_abs_path(*storage.get_storage_path_url(tei.img)))
    if is_bitonal(img):
        img = img.convert('1')
    else:
        raise NidabaInvalidParameterException('Input image is not bitonal')
    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer: any finer-grained boxes in the input
    # segmentation are stale and rebuilt below from kraken's output
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines
    # NOTE: the model was already loaded into ``rnn`` above; the previous
    # version redundantly loaded it a second time here.
    logger.debug('Start recognizing characters')
    for line_id, rec in izip(
            lines,
            rpred.rpred(
                rnn, img, {
                    'text_direction': 'horizontal-tb',
                    'boxes': [list(x['bbox']) for x in lines.itervalues()]
                })):
        # scope the current line and add all graphemes recognized by kraken
        # to it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        # split the prediction into alternating non-whitespace/whitespace
        # runs: even indices are word segments, odd indices the separators
        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(segment)])
                logger.debug(
                    'Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:line_offset + len(segment)]))
                # rec yields (grapheme, bbox, confidence) triples; rescale
                # confidence from [0, 1] to integer percent
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:line_offset + len(segment)]])
                line_offset += len(segment)
            # trailing segment has no whitespace partner (izip_longest pads
            # with None)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:line_offset +
                                   len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:line_offset + len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path
def ocr(image_path, segmentation_path, output_path, model_path):
    """
    Scan a single image with ocropus.

    Reads a single image file from ```imagepath``` and writes the recognized
    text as a TEI document into output_path.

    Args:
        image_path (unicode): Path of the input file
        segmentation_path (unicode): Path of the segmentation XML file.
        output_path (unicode): Path of the output file
        model_path (unicode): Path of the recognition model. Must be a
                              pyrnn.gz pickle dump interoperable with
                              ocropus-rpred.

    Returns:
        (unicode): A string of the output file that is actually written. As
                   Ocropus rewrites output file paths without notice it may
                   be different from the ```outputfilepath``` argument.

    Raises:
        NidabaOcropusException: Ocropus somehow failed. The error output is
                                contained in the message but as it is de
                                facto unusable as a library it's impossible
                                to deduct the nature of the problem.
    """
    try:
        logger.debug('Loading pyrnn from {}'.format(model_path))
        network = ocrolib.load_object(model_path, verbose=0)
        lnorm = getattr(network, "lnorm")
    except Exception as e:
        # str(e), not e.msg: most exception types carry no ``msg``
        # attribute, so e.msg would raise AttributeError and mask the
        # original error.
        raise NidabaOcropusException('Something somewhere broke: ' + str(e))
    im = Image.open(image_path)
    logger.debug('Loading TEI segmentation {}'.format(segmentation_path))
    tei = OCRRecord()
    with open(segmentation_path, 'r') as seg_fp:
        tei.load_tei(seg_fp)
    logger.debug('Clearing out word/grapheme boxes')
    # ocropus is a line recognizer: drop any sub-line boxes from the input
    # segmentation
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('ocropus', 'character recognition')
    for line_id, box in tei.lines.iteritems():
        logger.debug('Recognizing line {}'.format(box['bbox']))
        line = ocrolib.pil2array(im.crop(box['bbox']))
        # invert and rescale to [0, 1] for the line normalizer
        temp = np.amax(line) - line
        temp = temp * 1.0 / np.amax(temp)
        lnorm.measure(temp)
        line = lnorm.normalize(line, cval=np.amax(line))
        if line.ndim == 3:
            # collapse color channels to grayscale. Previously the result
            # of np.mean was discarded, leaving ``line`` 3-dimensional.
            line = np.mean(line, 2)
        line = ocrolib.lstm.prepare_line(line, 16)
        pred = network.predictString(line)
        pred = ocrolib.normalize_text(pred)
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        logger.debug('Adding graphemes: {}'.format(pred))
        tei.add_graphemes({'grapheme': x} for x in pred)
    with open(output_path, 'wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.name))
        tei.write_tei(fp)
    return output_path