Пример #1
0
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string append to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    input_path = storage.get_abs_path(*doc[1])
    output_path = (doc[1][0], os.path.splitext(storage.insert_suffix(doc[1][1],
                                                                     method,
                                                                     model))[0]
                   + '.xml')
    logger.debug('Searching for model {}'.format(model))
    if model in nidaba_cfg['kraken_models']:
        model = storage.get_abs_path(*(nidaba_cfg['kraken_models'][model]))
    elif model in nidaba_cfg['ocropus_models']:
        model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    else:
        raise NidabaInvalidParameterException('Model not defined in '
                                              'configuration')
    img = Image.open(input_path)
    logger.debug('Reading TEI segmentation from {}'.format(doc[1]))
    tei = OCRRecord()
    with storage.StorageFile(*doc[0]) as seg:
        tei.load_tei(seg)

    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines

    logger.debug('Loading model {}'.format(model))
    rnn = models.load_any(model)
    i = 0
    logger.debug('Start recognizing characters')
    for line_id, rec in zip(lines, rpred.rpred(rnn, img, [x['bbox'] for x in lines.itervalues()])):
        # scope the current line and add all graphemes recognized by kraken to
        # it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        i += 1

        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(segment)])
                logger.debug('Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(rec.prediction[line_offset:line_offset+len(segment)]))
                tei.add_graphemes([{'grapheme': x[0], 
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)} for x in rec[line_offset:line_offset+len(segment)]])
                line_offset += len(segment)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(rec.prediction[line_offset:line_offset+len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{'grapheme': x[0], 
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)} for x in rec[line_offset:line_offset+len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path
Пример #2
0
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string append to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    output_path = (
        doc[0],
        os.path.splitext(storage.insert_suffix(doc[1], method, model))[0] +
        '.xml')
    logger.debug('Loading model {}'.format(model))
    try:
        rnn = models.load_any(mod_db[model])
    except Exception as e:
        raise NidabaInvalidParameterException(str(e))
    logger.debug('Reading TEI segmentation from {}'.format(doc))
    tei = OCRRecord()
    with storage.StorageFile(*doc) as seg:
        tei.load_tei(seg)

    img = Image.open(
        storage.get_abs_path(*storage.get_storage_path_url(tei.img)))
    if is_bitonal(img):
        img = img.convert('1')
    else:
        raise NidabaInvalidParameterException('Input image is not bitonal')

    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines

    i = 0
    rnn = models.load_any(mod_db[model])
    logger.debug('Start recognizing characters')
    for line_id, rec in izip(
            lines,
            rpred.rpred(
                rnn, img, {
                    'text_direction': 'horizontal-tb',
                    'boxes': [list(x['bbox']) for x in lines.itervalues()]
                })):
        # scope the current line and add all graphemes recognized by kraken to
        # it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        i += 1

        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(segment)])
                logger.debug(
                    'Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:line_offset + len(segment)]))
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:line_offset + len(segment)]])
                line_offset += len(segment)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:line_offset + len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:line_offset + len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path
Пример #3
0
def ocr(image_path, segmentation_path, output_path, model_path):
    """
    Scan a single image with ocropus.

    Reads a single image file from ```imagepath``` and writes the recognized
    text as a TEI document into output_path.

    Args:
        image_path (unicode): Path of the input file
        segmentation_path (unicode): Path of the segmentation XML file.
        output_path (unicode): Path of the output file
        model_path (unicode): Path of the recognition model. Must be a pyrnn.gz
                             pickle dump interoperable with ocropus-rpred.

    Returns:
        (unicode): A string of the output file that is actually written. As
                   Ocropus rewrites output file paths without notice it may be
                   different from the ```outputfilepath``` argument.

    Raises:
        NidabaOcropusException: Ocropus somehow failed. The error output is
                                contained in the message but as it is de facto
                                unusable as a library it's impossible to deduct
                                the nature of the problem.
    """

    try:
        logger.debug('Loading pyrnn from {}'.format(model_path))
        network = ocrolib.load_object(model_path, verbose=0)
        lnorm = getattr(network, "lnorm")
    except Exception as e:
        raise NidabaOcropusException('Something somewhere broke: ' + e.msg)
    im = Image.open(image_path)

    logger.debug('Loading TEI segmentation {}'.format(segmentation_path))
    tei = OCRRecord()
    with open(segmentation_path, 'r') as seg_fp:
        tei.load_tei(seg_fp)

    logger.debug('Clearing out word/grapheme boxes')
    # ocropus is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('ocropus', 'character recognition')
    for line_id, box in tei.lines.iteritems():
        logger.debug('Recognizing line {}'.format(box['bbox']))
        line = ocrolib.pil2array(im.crop(box['bbox']))
        temp = np.amax(line) - line
        temp = temp * 1.0 / np.amax(temp)
        lnorm.measure(temp)
        line = lnorm.normalize(line, cval=np.amax(line))
        if line.ndim == 3:
            np.mean(line, 2)
        line = ocrolib.lstm.prepare_line(line, 16)
        pred = network.predictString(line)
        pred = ocrolib.normalize_text(pred)
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        logger.debug('Adding graphemes: {}'.format(pred))
        tei.add_graphemes({'grapheme': x} for x in pred)
    with open(output_path, 'wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.name))
        tei.write_tei(fp)
    return output_path