Example #1
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier defined in the nidaba configuration as a
                            valid dictionary.
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(*nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    cnt = 0
    err_cnt = 0
    for seg_id, segment in tei.segments.iteritems():
        tok = alg.sanitize(''.join(x['grapheme'] for x in segment['content'].itervalues()))
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        if not alg.mmap_bin_search(tok, dictionary, entryparser_fn=alg.key_for_single_word):
            err_cnt += 1
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(err_cnt / float(cnt)))
        return output_path
    else:
        return {'edit_ratio': err_cnt / float(cnt), 'ground_truth': '', 'doc': doc}
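The measure written out (or diverted) above is simply the share of sanitized tokens not found in the dictionary; a tiny worked example of that arithmetic, with made-up counts and independent of the storage layer:

# e.g. 3 of 40 tokens missing from the dictionary
err_cnt, cnt = 3, 40
edit_ratio = err_cnt / float(cnt)  # 0.075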
Example #2
def setup(*args, **kwargs):
    try:
        global binarization
        global pageseg
        global rpred
        global models
        global mod_db
        from kraken import binarization
        from kraken import pageseg
        from kraken import rpred
        from kraken.lib import models
        # pronn/clstm models get prioritized over pyrnn ones
        mod_db = {k: storage.get_abs_path(*v) for k, v in nidaba_cfg['ocropus_models'].iteritems()}
        if kwargs.get('modeldata'):
            md = kwargs.get('modeldata')
            if isinstance(md, list):
                md = storage.get_abs_path(*md)
            for model in glob.glob(md + '/*/*/DESCRIPTION'):
                with open(model) as fp:
                    meta = json.load(fp)
                    mod_db[model.split('/')[-2]] = os.path.join(os.path.dirname(model), meta['name'])
        ocr_kraken.arg_values['model'] = mod_db.keys()

    except ImportError as e:
        raise NidabaPluginException(e.message)
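The model discovery loop above expects a DESCRIPTION file two directory levels below the modeldata path; a hypothetical layout for illustration (the directory and file names below are assumptions, only the 'name' key is taken from the code):

# <modeldata>/<collection>/<model_id>/DESCRIPTION is JSON with a 'name' key
# pointing at the model file in the same directory, e.g.
#
#   modeldata/default/fraktur/DESCRIPTION -> {"name": "fraktur.pronn"}
#
# which would register mod_db['fraktur'] = '<modeldata>/default/fraktur/fraktur.pronn'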
Example #3
def setup(*args, **kwargs):
    try:
        global binarization
        global pageseg
        global rpred
        global models
        global mod_db
        from kraken import binarization
        from kraken import pageseg
        from kraken import rpred
        from kraken.lib import models
        # pronn/clstm models get prioritized over pyrnn ones
        mod_db = {
            k: storage.get_abs_path(*v)
            for k, v in nidaba_cfg['ocropus_models'].iteritems()
        }
        if kwargs.get('modeldata'):
            md = kwargs.get('modeldata')
            if isinstance(md, list):
                md = storage.get_abs_path(*md)
            for model in glob.glob(md + '/*/*/DESCRIPTION'):
                with open(model) as fp:
                    meta = json.load(fp)
                    mod_db[model.split('/')[-2]] = os.path.join(
                        os.path.dirname(model), meta['name'])
        ocr_kraken.arg_values['model'] = mod_db.keys()

    except ImportError as e:
        raise NidabaPluginException(e.message)
Example #4
File: kraken.py  Project: amitdo/nidaba
def nlbin(doc, method=u'nlbin', threshold=0.5, zoom=0.5, escale=1.0,
          border=0.1, perc=80, range=20, low=5, high=90):
    """
    Binarizes an input document utilizing ocropus'/kraken's nlbin algorithm.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.
        threshold (float):
        zoom (float):
        escale (float):
        border (float):
        perc (int):
        range (int):
        low (int):
        high (int):

    Returns:
        (unicode, unicode): Storage tuple of the output file

    Raises:
        NidabaInvalidParameterException: Input parameters are outside the valid
                                         range.

    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method, unicode(threshold),
                                        unicode(zoom), unicode(escale),
                                        unicode(border), unicode(perc),
                                        unicode(range), unicode(low),
                                        unicode(high))
    kraken_nlbin(input_path, output_path, threshold, zoom, escale, border,
                 perc, range, low, high)
    return storage.get_storage_path(output_path)
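A hedged usage sketch for the task above; the batch identifier and file name are placeholders, not values from the source:

# Hypothetical call: binarize a stored grayscale page and receive the storage
# tuple of the output file (suffix insertion handled by storage.insert_suffix).
out_doc = nlbin((u'batch-id', u'page_gray.png'), threshold=0.6)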
Example #5
File: cli.py  Project: kursataker/nidaba
def status(args):
    """
    Implements the status subcommand.

    Args:
        args (argparse.Namespace): Parsed input object from argparse
    """

    batch = Batch(args.jobid)
    state = batch.get_state()
    print(state)
    if state == 'SUCCESS':
        ret = batch.get_results()
        if ret is None:
            print('Something somewhere went wrong.')
            print('Please contact your friendly nidaba support technician.')
        else:
            for doc in ret:
                print('\t' + storage.get_abs_path(*doc).encode('utf-8'))
    elif state == 'FAILURE':
        ret = batch.get_errors()
        if ret is None:
            print('Something somewhere went wrong.')
        else:
            for fun in ret:
                print(fun[0]['method'].encode('utf-8'), 
                      'failed while operating on',
                      fun[0]['doc'][1].encode('utf-8'), 
                      'which is based on',
                      fun[1]['root'][1].encode('utf-8'))
Example #6
def otsu(doc, method=u'otsu', thresh=100, mincount=50, bgval=255,
         smoothx=2, smoothy=2):
    """
    Binarizes an input document utilizing a naive implementation of Otsu's
    thresholding.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file

    Raises:
        NidabaInvalidParameterException: Input parameters are outside the valid
                                         range.

    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method, unicode(thresh),
                                        unicode(mincount), unicode(bgval),
                                        unicode(smoothx), unicode(smoothy))
    if smoothx < 0 or smoothy < 0 or bgval < 0 or thresh < 0 or mincount < 0:
        raise NidabaInvalidParameterException('Parameters (' + unicode(thresh)
                                              + ',' + unicode(mincount) + ',' +
                                              unicode(bgval) + ',' +
                                              unicode(smoothx) + ',' +
                                              unicode(smoothy) + ',' +
                                              ') outside of valid range')
    return storage.get_storage_path(leper.otsu_binarize(input_path,
                                                        output_path, thresh,
                                                        mincount, bgval,
                                                        smoothx, smoothy))
Example #7
def sauvola(doc, method=u'sauvola', whsize=10, factor=0.35):
    """
    Binarizes an input document utilizing Sauvola thresholding as described in
    [0]. Expects 8bpp grayscale images as input.

    [0] Sauvola, Jaakko, and Matti Pietikäinen. "Adaptive document image
    binarization." Pattern recognition 33.2 (2000): 225-236.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files
        whsize (int): The window width and height over which local statistics
                      are calculated are twice the value of whsize. The minimal
                      value is 2.
        factor (float): The threshold reduction factor due to variance. 0 <=
                        factor < 1.

    Returns:
        (unicode, unicode): Storage tuple of the output file

    Raises:
        NidabaInvalidParameterException: Input parameters are outside the valid
                                         range.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method, unicode(whsize),
                                        unicode(factor))
    lept_sauvola(input_path, output_path, whsize, factor)
    return storage.get_storage_path(output_path)
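A short usage sketch derived from the docstring constraints (whsize >= 2, 0 <= factor < 1); the document tuple is a placeholder:

# Hypothetical call on an 8bpp grayscale page already present in the storage layer.
out_doc = sauvola((u'batch-id', u'page_gray.png'), whsize=20, factor=0.3)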
Example #9
def ocr_tesseract(doc, method=u'ocr_tesseract', languages=None,
                  extended=False):
    """
    Runs tesseract on an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        languages (list): A list of tesseract classifier identifiers
        extended (bool): Switch to enable extended hOCR generation containing
                         character cuts and confidences. Has no effect when
                         direct or legacy implementation is used.

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    image_path = storage.get_abs_path(*doc[1])

    # rewrite the segmentation file to lines in UZN format
    logger.debug('Rewriting TEI ({}) -> UZN ({})'.format(doc[0][1],
                                                         splitext(doc[1][1])[0]
                                                         + '.uzn'))
    seg = TEIFacsimile()
    with storage.StorageFile(*doc[0]) as fp:
        seg.read(fp)
    with storage.StorageFile(doc[1][0], splitext(doc[1][1])[0] + '.uzn', mode='wb') as fp:
        uzn = UZNWriter(fp)
        for line in seg.lines:
            uzn.writerow(*line[:4])

    if isinstance(languages, basestring):
        languages = [languages]
    output_path = storage.insert_suffix(image_path, method, *languages)

    logger.debug('Invoking tesseract with {} call method'.format(implementation))
    if implementation == 'legacy':
        result_path = output_path + '.html'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'direct':
        result_path = output_path + '.hocr'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'capi':
        result_path = output_path + '.xml'
        ocr_capi(image_path, result_path, seg, languages, extended)
    else:
        raise NidabaTesseractException('Invalid implementation selected',
                                       implementation)

    if not result_path[-4:] == '.xml':
        logger.debug('Converting hOCR ({}) -> TEI ({})'.format(result_path,
                                                               output_path +
                                                               '.xml'))
        tei = TEIFacsimile()
        with open(result_path) as fp:
            tei.load_hocr(fp)
        os.unlink(result_path)
        with open(output_path + '.xml', 'wb') as fp:
            tei.write(fp)
        result_path = output_path + '.xml'
    return storage.get_storage_path(result_path)
Example #10
def any_to_png(doc, method=u'any_to_png'):
    """
    Converts an image (color or otherwise) in any format recognized by pillow
    to PNG.

    The pillow image library relies on external libraries for loading and
    saving Image data. To recognize the most common image formats used for
    digital archival you'll need:

    - libtiff
    - zlib
    - libjpeg
    - openjpeg (version 2.0 +)
    - libwebp

    To have access to all formats run (on Debian/Ubuntu):

    .. code-block:: console

        # apt-get -y install libtiff5-dev libjpeg62-turbo-dev zlib1g-dev \
            libwebp-dev libopenjp2-dev

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    input_path = storage.get_abs_path(*doc)
    output_path = os.path.splitext(storage.insert_suffix(input_path, method))[0] + '.png'
    return storage.get_storage_path(image.any_to_png(input_path, output_path))
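A minimal usage sketch (the input tuple is a placeholder); whatever the input format, the task returns the storage tuple of a '.png' sibling of the input file:

# Hypothetical call converting a stored TIFF to PNG.
out_doc = any_to_png((u'batch-id', u'scan_0001.tif'))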
Example #11
def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """ 
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        black_colseps (bool): Assume black column separators instead of white
                              ones.

    Returns:
        (unicode, unicode): Storage tuple of the TEI file containing the
        segmentation.
    """

    input_path = storage.get_abs_path(*doc)
    output_path, ext = os.path.splitext(
        storage.insert_suffix(input_path, method))
    logger.debug('Reading image using PIL')
    img = Image.open(input_path)
    with open(output_path + '.xml', 'w') as fp:
        logger.debug('Initializing TEI with {} ({} {})'.format(
            doc[1], *img.size))
        tei = OCRRecord()
        tei.img = storage.get_url(*doc)
        tei.dimensions = img.size
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('kraken', 'page segmentation')
        for seg in pageseg.segment(img, black_colseps=black_colseps)['boxes']:
            logger.debug('Found line at {} {} {} {}'.format(*seg))
            tei.add_line(seg)
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write_tei(fp)
    return storage.get_storage_path(output_path + '.xml')
Example #12
def any_to_png(doc, method=u'any_to_png'):
    """
    Converts an image (color or otherwise) in any format recognized by pillow
    to PNG.

    The pillow image library relies on external libraries for loading and
    saving Image data. To recognize the most common image formats used for
    digital archival you'll need:

    - libtiff
    - zlib
    - libjpeg
    - openjpeg (version 2.0 +)
    - libwebp

    To have access to all formats run (on Debian/Ubuntu):

    .. code-block:: console

        # apt-get -y install libtiff5-dev libjpeg62-turbo-dev zlib1g-dev \
            libwebp-dev libopenjp2-dev

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    input_path = storage.get_abs_path(*doc)
    output_path = os.path.splitext(storage.insert_suffix(input_path,
                                                         method))[0] + '.png'
    return storage.get_storage_path(image.any_to_png(input_path, output_path))
Example #13
def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """ 
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        black_colseps (bool): Assume black column separators instead of white
                              ones.

    Returns:
        (unicode, unicode): Storage tuple of the TEI file containing the
        segmentation.
    """

    input_path = storage.get_abs_path(*doc)
    output_path, ext = os.path.splitext(storage.insert_suffix(input_path,
                                        method))
    logger.debug('Reading image using PIL')
    img = Image.open(input_path)
    with open(output_path + '.xml', 'w') as fp:
        logger.debug('Initializing TEI with {} ({} {})'.format(doc[1], *img.size))
        tei = OCRRecord()
        tei.img = storage.get_url(*doc)
        tei.dimensions = img.size
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('kraken', 'page segmentation')
        for seg in pageseg.segment(img, black_colseps=black_colseps)['boxes']:
            logger.debug('Found line at {} {} {} {}'.format(*seg))
            tei.add_line(seg)
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write_tei(fp)
    return storage.get_storage_path(output_path + '.xml')
Example #14
def ocr_tesseract(doc,
                  method=u'ocr_tesseract',
                  languages=None,
                  extended=False):
    """
    Runs tesseract on an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        languages (list): A list of tesseract classifier identifiers
        extended (bool): Switch to enable extended hOCR generation containing
                         character cuts and confidences. Has no effect when
                         direct or legacy implementation is used.

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    seg = OCRRecord()
    with storage.StorageFile(*doc) as fp:
        seg.load_tei(fp)
    with storage.StorageFile(doc[0], splitext(doc[1])[0] + '.uzn',
                             mode='wb') as fp:
        uzn = UZNWriter(fp)
        for line in seg.lines.itervalues():
            uzn.writerow(*line['bbox'])

    image_path = storage.get_abs_path(*storage.get_storage_path_url(seg.img))
    if isinstance(languages, basestring):
        languages = [languages]
    output_path = storage.insert_suffix(image_path, method, *languages)

    logger.debug(
        'Invoking tesseract with {} call method'.format(implementation))
    if implementation == 'legacy':
        result_path = output_path + '.html'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'direct':
        result_path = output_path + '.hocr'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'capi':
        result_path = output_path + '.xml'
        ocr_capi(image_path, result_path, seg, languages, extended)
    else:
        raise NidabaTesseractException('Invalid implementation selected',
                                       implementation)

    if not result_path[-4:] == '.xml':
        logger.debug('Converting hOCR ({}) -> TEI ({})'.format(
            result_path, output_path + '.xml'))
        tei = OCRRecord()
        with open(result_path) as fp:
            tei.load_hocr(fp)
        os.unlink(result_path)
        with open(output_path + '.xml', 'wb') as fp:
            tei.write_tei(fp)
        result_path = output_path + '.xml'
    return storage.get_storage_path(result_path)
Example #15
File: cli.py  Project: kursataker/nidaba
def batch(args):
    """
    Implements the batch subcommand of the nidaba binary.

    Args:
        args (argparse.Namespace): Parsed input object from argparse
    """

    id = unicode(uuid.uuid4())
    batch = Batch(id)
    print('Preparing filestore....', end='')
    if storage.prepare_filestore(id) is None:
        print('failed.')
        exit()
    for doc in args.files:
        shutil.copy2(doc, storage.get_abs_path(id, os.path.basename(doc)))
        batch.add_document((id, os.path.basename(doc)))
    print('done.')
    print('Building batch...', end='')

    batch.add_step()
    if not args.grayscale:
        batch.add_tick()
        batch.add_task('img.rgb_to_gray')
    if args.binarize:
        batch.add_tick()
        for bin in args.binarize:
            (alg, _, params) = bin.partition(u':')
            for c in params.split(u';'):
                kwargs = dict(kwarg.split('=') for kwarg in c.split(",") if len(kwarg.split('=')) == 2)
                print(kwargs)
                kwargs = {key: int_float_or_str(val)
                          for key, val in kwargs.items()}
                batch.add_task('binarize.' + alg, **kwargs)
    if args.ocr:
        batch.add_tick()
        for ocr in args.ocr:
            (engine, _, params) = ocr.partition(u':')
            if engine == u'tesseract':
                batch.add_task('ocr.tesseract', languages=params.split(u','))
            elif engine == u'ocropus':
                for model in params.split(u','):
                    if model not in nidaba_cfg['ocropus_models']:
                        print('WARNING: ocropus model ' +
                              model.encode('utf-8') + ' not known.')
                    else:
                        batch.add_task('ocr.ocropus', model=model)
            else:
                print('WARNING: OCR engine ' + engine.encode('utf-8') +
                      ' not known.')
    if args.willitblend:
        batch.add_step()
        batch.add_tick()
        batch.add_task('util.blend_hocr')
    batch.run()
    print('done.')
    print(id)
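The parameter-string parsing in the binarization branch above can be illustrated in isolation; the algorithm name and parameter string below are made up, and the expression is the one used in the loop:

# e.g. a --binarize argument of the form 'otsu:thresh=100,mincount=50'
c = u'thresh=100,mincount=50'
kwargs = dict(kwarg.split('=') for kwarg in c.split(",") if len(kwarg.split('=')) == 2)
# -> {u'thresh': u'100', u'mincount': u'50'}, values then coerced by int_float_or_str()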
Example #16
File: ocropus.py  Project: mirskiy/nidaba
def ocr_ocropus(doc, method=u'ocr_ocropus', model=None):
    """
    Runs ocropus on an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file

    """
    image_path = storage.get_abs_path(*doc[1])
    segmentation_path = storage.get_abs_path(*doc[0])
    output_path = os.path.splitext(
        storage.insert_suffix(image_path, method, model))[0] + '.xml'
    model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    return storage.get_storage_path(
        ocr(image_path, segmentation_path, output_path, model))
Example #17
File: ocr.py  Project: kursataker/nidaba
def ocr_ocropus(doc, method=u'ocr_ocropus', model=None):
    """
    Runs ocropus on an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file

    """
    input_path = storage.get_abs_path(*doc)
    output_path = os.path.splitext(storage.insert_suffix(input_path, method,
                                                         model))[0] + '.html'
    model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    return storage.get_storage_path(ocropus.ocr(input_path, output_path,
                                                model))
Example #18
def spell_check(doc,
                method=u'spell_check',
                language=u'',
                filter_punctuation=False):
    """
    Adds spelling suggestions to a TEI XML document.

    Alternative spellings for each segment will be included in a choice tag
    containing a series of corr tags, with the original segment appearing
    beneath a sic element. Correct words, i.e. words appearing verbatim in the
    dictionary, are left untouched.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier defined in the nidaba configuration as a
                            valid dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   ``seg``
    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method, language,
                                        unicode(filter_punctuation))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    del_dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['deletion_dictionary'])
    with storage.StorageFile(*doc) as fp:
        logger.debug('Reading TEI ({})'.format(fp.abs_path))
        tei = TEIFacsimile()
        tei.read(fp)
        logger.debug('Performing spell check')
        ret = lex.tei_spellcheck(tei, dictionary, del_dictionary,
                                 filter_punctuation)
    with storage.StorageFile(*storage.get_storage_path(output_path),
                             mode='wb') as fp:
        logger.debug('Writing TEI ({})'.format(fp.abs_path))
        ret.write(fp)
    return storage.get_storage_path(output_path)
Example #19
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier defined in the nidaba configuration as a
                            valid dictionary.
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    cnt = 0
    err_cnt = 0
    for seg_id, segment in tei.segments.iteritems():
        tok = alg.sanitize(''.join(x['grapheme']
                                   for x in segment['content'].itervalues()))
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        if not alg.mmap_bin_search(
                tok, dictionary, entryparser_fn=alg.key_for_single_word):
            err_cnt += 1
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(err_cnt / float(cnt)))
        return output_path
    else:
        return {
            'edit_ratio': err_cnt / float(cnt),
            'ground_truth': '',
            'doc': doc
        }
Example #20
File: cli.py  Project: ryanfb/nidaba
def do_move(batch, src):
    if isinstance(batch, NetworkSimpleBatch):
        dst = os.path.basename(src)
        def callback(monitor):
            spin(u'Uploading {}'.format(dst))
        batch.add_document(src, callback, auxiliary=True)
        click.secho(u'\b\u2713', fg='green', nl=False)
        click.echo('\033[?25h\n', nl=False)
    else:
        from nidaba import storage
        suffix = uuid.uuid4()
        dst = os.path.basename(src) + '_' + unicode(suffix)
        shutil.copy2(src, storage.get_abs_path(batch.id, dst))
    return (batch.id, dst)
Example #21
def deskew(doc, method=u'deskew'):
    """
    Removes skew (rotational distortion) from an 1bpp input image.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method)
    lept_deskew(input_path, output_path)
    return storage.get_storage_path(output_path)
Example #22
File: img.py  Project: kursataker/nidaba
def deskew(doc, method=u'deskew'):
    """
    Removes skew (rotational distortion) from an 1bpp input image.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method)
    return storage.get_storage_path(leper.deskew(input_path, output_path))
Example #24
def rgb_to_gray(doc, method=u'rgb_to_gray'):
    """
    Converts an arbitrary bit depth image to grayscale and writes it back
    appending a suffix.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method)
    return storage.get_storage_path(image.rgb_to_gray(input_path, output_path))
Example #26
def spell_check(doc, method=u'spell_check', language=u'',
                filter_punctuation=False):
    """
    Adds spelling suggestions to a TEI XML document.

    Alternative spellings for each segment will be included in a choice tag
    containing a series of corr tags, with the original segment appearing
    beneath a sic element. Correct words, i.e. words appearing verbatim in the
    dictionary, are left untouched.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier defined in the nidaba configuration as a
                            valid dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   ``seg``
    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method, language,
                                        unicode(filter_punctuation))
    dictionary = storage.get_abs_path(*nidaba_cfg['lang_dicts'][language]['dictionary'])
    del_dictionary = storage.get_abs_path(*nidaba_cfg['lang_dicts'][language]['deletion_dictionary'])
    with storage.StorageFile(*doc) as fp:
        logger.debug('Reading TEI ({})'.format(fp.abs_path))
        tei = OCRRecord()
        tei.load_tei(fp)
        logger.debug('Performing spell check')
        ret = lex.tei_spellcheck(tei, dictionary, del_dictionary,
                                 filter_punctuation)
    with storage.StorageFile(*storage.get_storage_path(output_path), mode='wb') as fp:
        logger.debug('Writing TEI ({})'.format(fp.abs_path))
        ret.write_tei(fp)
    return storage.get_storage_path(output_path)
Example #27
def dewarp(doc, method=u'dewarp'):
    """
    Removes perspective distortion (as commonly exhibited by overhead scans)
    from an 1bpp input image.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method)
    lept_dewarp(input_path, output_path)
    return storage.get_storage_path(output_path)
Example #28
    def do_move(batch, src):
        if isinstance(batch, NetworkSimpleBatch):
            dst = os.path.basename(src)

            def callback(monitor):
                spin(u'Uploading {}'.format(dst))

            batch.add_document(src, callback, auxiliary=True)
            click.secho(u'\b\u2713', fg='green', nl=False)
            click.echo('\033[?25h\n', nl=False)
        else:
            from nidaba import storage
            suffix = uuid.uuid4()
            dst = os.path.basename(src) + '_' + unicode(suffix)
            shutil.copy2(src, storage.get_abs_path(batch.id, dst))
        return (batch.id, dst)
Example #30
def otsu(doc, method=u'otsu'):
    """
    Binarizes an input document utilizing a naive implementation of Otsu's
    thresholding.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file

    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method)
    return storage.get_storage_path(image.otsu(input_path, output_path))
Example #31
File: ocr.py  Project: kursataker/nidaba
def ocr_tesseract(doc, method=u'ocr_tesseract', languages=None):
    """
    Runs tesseract on an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        languages (list of unicode): A list of languages for the tesseract
                                     language model

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method, *languages)
    return storage.get_storage_path(tesseract.ocr(input_path, output_path,
                                                  languages))
Example #32
def nlbin(doc,
          method=u'nlbin',
          threshold=0.5,
          zoom=0.5,
          escale=1.0,
          border=0.1,
          perc=80,
          range=20,
          low=5,
          high=90):
    """
    Binarizes an input document utilizing ocropus'/kraken's nlbin algorithm.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.
        threshold (float):
        zoom (float):
        escale (float):
        border (float):
        perc (int):
        range (int):
        low (int):
        high (int):

    Returns:
        (unicode, unicode): Storage tuple of the output file

    Raises:
        NidabaInvalidParameterException: Input parameters are outside the valid
                                         range.

    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method, unicode(threshold),
                                        unicode(zoom), unicode(escale),
                                        unicode(border), unicode(perc),
                                        unicode(range), unicode(low),
                                        unicode(high))
    img = Image.open(input_path)
    o_img = binarization.nlbin(img, threshold, zoom, escale, border, perc,
                               range, low, high)
    o_img.save(output_path)
    return storage.get_storage_path(output_path)
Example #33
File: util.py  Project: kursataker/nidaba
def blend_hocr(docs, method=u'blend_hocr', language=u''):
    """
    Blends multiple hOCR files using the algorithm from Bruce Robertsons
    rigaudon. It requires a working spell checking for the input document's
    language; otherwise all matched bboxes will be bunched together without any
    scoring.

    Args:
        docs [(id, path), ...]: A list of storage module tuples that will be
                                merged into a single output document.
        language (unicode): Language used for spell-checking based scoring. If
                            not defined no scoring will be used.
        method (unicode): The suffix string appended to the output file.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    # create the output document path from the first input document
    input_path = storage.get_abs_path(*docs[0])
    output_path = storage.insert_suffix(input_path, method)
    return merge_hocr.merge(docs, language,
                            storage.get_storage_path(output_path))
Example #34
def text_rep_confidence(doc, method=u'text_rep_confidence', divert=True):
    """
    Extracts self reported confidence values from input documents.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    edist = numpy.mean([x['confidence'] for x in tei.graphemes.itervalues()])
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(edist))
        return output_path
    else:
        return {'edit_ratio': edist, 'ground_truth': '', 'doc': doc}
Example #36
File: cli.py  Project: ryanfb/nidaba
def batch(files, host, preprocessing, binarize, ocr, segmentation, stats,
          postprocessing, output, grayscale, help_tasks):
    """
    Add a new job to the pipeline.
    """
   
    if host:
        batch = NetworkSimpleBatch(host)
        click.echo(u'Preparing filestore\t\t[', nl=False)
        try:
            batch.create_batch()
        except:
            click.secho(u'\u2717', fg='red', nl=False)
            click.echo(']')
            exit()
        click.secho(u'\u2713', fg='green', nl=False)
        click.echo(']')
        for doc in files:
            def callback(monitor):
                spin(u'Uploading {}'.format(doc))
            batch.add_document(doc, callback)
            click.secho(u'\b\u2713', fg='green', nl=False)
            click.echo('\033[?25h\n', nl=False)
    else:
        from nidaba import storage
        click.echo(u'Preparing filestore\t\t[', nl=False)
        try:
            batch = SimpleBatch()
        except:
            click.secho(u'\u2717', fg='red', nl=False)
            click.echo(']')
            exit()
        for doc in files:
            shutil.copy2(doc, storage.get_abs_path(batch.id, os.path.basename(doc)))
            batch.add_document((batch.id, os.path.basename(doc)))
        click.secho(u'\u2713', fg='green', nl=False)
        click.echo(']')
    click.echo(u'Building batch\t\t\t[', nl=False)
    if not grayscale:
        batch.add_task('img', 'rgb_to_gray')
    if preprocessing:
        for alg in preprocessing:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('img', alg[0], **kwargs)
    if binarize:
        for alg in binarize:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('binarize', alg[0], **kwargs)
    if segmentation:
        for alg in segmentation:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('segmentation', alg[0], **kwargs)
    if ocr:
        for alg in ocr:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('ocr', alg[0], **kwargs)
    if stats:
        for alg in stats:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('stats', alg[0], **kwargs)
    if postprocessing:
        for alg in postprocessing:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('postprocessing', alg[0], **kwargs)
    if output:
        for alg in output:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('output', alg[0], **kwargs)
    batch.run()
    click.secho(u'\u2713', fg='green', nl=False)
    click.echo(']')
    click.echo(batch.id)
Example #37
def status(verbose, host, job_id):
    """
    Displays the status and results of jobs.
    """
    click.secho('Status:', underline=True, nl=False)
    if host:
        batch = NetworkSimpleBatch(host, job_id)
    else:
        try:
            batch = Batch(job_id)
        except NidabaInputException:
            click.echo(' UNKNOWN')
            return

    state = batch.get_extended_state()
    if not state:
        click.echo(' UNKNOWN')
        return

    bs = 'success'
    done = 0
    running = 0
    pending = 0
    failed = 0
    results = []
    errors = []
    expected = len(state)
    for task_id, subtask in state.iteritems():
        if subtask['state'] == 'SUCCESS':
            done += 1
        elif subtask['state'] == 'RUNNING':
            running += 1
            if bs == 'success':
                bs = 'pending'
        elif subtask['state'] == 'PENDING':
            pending += 1
            if bs == 'success':
                bs = 'pending'
        elif subtask['state'] == 'FAILURE':
            errors.append(subtask)
            bs = 'failed'

        # leaf nodes/result extraction
        if len(subtask['children']) == 0 and subtask['result'] is not None:
            # try to find statistics results
            parents = [task_id] + subtask['parents']
            misc = None
            for parent in parents:
                parents.extend(state[parent]['parents'])
                if 'misc' in state[parent]:
                    misc = state[parent]['misc']
                    break
            # archival tasks bunch everything together. do a sort-based matching of input and output tasks
            if isinstance(subtask['result'][0],
                          list) or host and len(subtask['result']) > 1:
                for res, rd in zip(sorted(subtask['result']),
                                   sorted(subtask['root_documents'])):
                    if host:
                        res = [res]
                    results.append((res, [rd], misc))
            else:
                results.append(
                    (subtask['result'], subtask['root_documents'], misc))

    final = '(final)' if expected - done == 0 else ''
    click.echo(' {} {}\n'.format(bs, final))
    click.echo('{}/{} tasks completed. {} running.\n'.format(
        done, expected, running))

    # render results
    click.secho('Output files:\n', underline=True)
    results = sorted(results,
                     key=lambda x: x[0][0][1]
                     if isinstance(x[0], list) else x[0][1])
    if results:
        for doc in results:
            if host:
                output = ', '.join(doc[0])
                input = ', '.join(doc[1])
            else:
                from nidaba import storage
                if isinstance(doc[0][0], list):
                    for d in doc:
                        output = ', '.join(
                            click.format_filename(storage.get_abs_path(*d)))
                else:
                    output = click.format_filename(
                        storage.get_abs_path(*doc[0]))
                input = ', '.join(d[1] for d in doc[1])
            if doc[2] is not None:
                click.echo(u'{} \u2192 {} ({:.1f}% / {})'.format(
                    input, output, 100 * doc[2]['edit_ratio'],
                    doc[2]['ground_truth'][1]))
            else:
                click.echo(u'{} \u2192 {}'.format(input, output))

    # render errors
    if errors:
        click.secho('\nErrors:\n', underline=True)
        for task in errors:
            tb = ''
            args = ''
            if verbose > 0:
                tb = task['errors'][2]
            if verbose > 1:
                task['errors'][0].pop('method')
                args = ', ' + str(task['errors'][0])
            if host:
                rd = ', '.join(
                    os.path.basename(x) for x in task['root_documents'])
            else:
                rd = ', '.join(
                    os.path.basename(x[1]) for x in task['root_documents'])
            click.echo('{}.{} ({}{}): {}{}'.format(task['task'][0],
                                                   task['task'][1], rd, args,
                                                   tb, task['errors'][1]))
Example #38
def merge(docs, lang, output):
    """
    Merges multiple hOCR documents into a single one.

    First bboxes from all documents are roughly matched, then all matching
    bboxes are scored using a spell checker. If no spell checker is available
    all matches will be merged without ranking.

    The matching is naive, i.e. we just grab the first input document and
    assume that all other documents have similar segmentation results. Issues
    like high variance in segmentation, especially word boundaries are not
    accounted for.

    Args:
        docs (iterable): A list of storage tuples of input documents
        lang (unicode): A language identifier for the spell checker
        output (tuple): Storage tuple for the result

    Returns:
        tuple: The output storage tuple. Should be the same as ```output```.
    """
    parser = etree.HTMLParser()
    tree1 = etree.parse(storage.get_abs_path(docs[0][0], docs[0][1]), parser)
    lines_1, words_1 = get_hocr_lines_for_tree(tree1)
    sort_words_bbox(words_1)
    other_words = []
    for doc in docs[1:]:
        try:
            tree2 = etree.parse(storage.get_abs_path(doc[0], doc[1]), parser)
            lines_2, words_2 = get_hocr_lines_for_tree(tree2)
            other_words = other_words + words_2
        except Exception as e:
            print(e)

    sort_words_bbox(other_words)
    positional_lists = []
    positional_list = []
    x = 0

    # Make a list of positional_lists, that is alternatives for a given
    # position, skipping duplicate position-words
    while x < len(other_words):
        try:
            if len(positional_list) == 0:
                positional_list.append(other_words[x])
            else:
                if close_enough(other_words[x - 1].bbox, other_words[x].bbox):
                    # skip if the text is the same, so that we just get unique
                    # texts for this position
                    if not other_words[x - 1].text == other_words[x].text:
                        positional_list.append(other_words[x])
                else:
                    if not x == 0:
                        positional_lists.append(positional_list)
                        positional_list = []
        except IndexError:
            pass
        x = x + 1

    # we now have a list of list of unique words for each position
    # let's select from each the first one that passes spellcheck
    replacement_words = []

    # make a 'replacement_words' list with all of the best, non-zero-scoring
    # suggestions for each place
    for positional_list in positional_lists:
        for word in positional_list:
            word.score = score_word(lang, word.text)
        positional_list.sort(key=attrgetter('score'), reverse=True)
        if positional_list[0].score > 0:
            replacement_words.append(positional_list[0])

    # now replace the originals
    for word in words_1:
        for replacement_word in replacement_words:
            word.score = score_word(lang, word.text)
            if close_enough(word.bbox, replacement_word.bbox) and (
                    word.score < replacement_word.score):
                word.element.text = replacement_word.text

        for positional_list in positional_lists:
            print("##")
            for word in positional_list:
                print(word.bbox, word.text)

    storage.write_text(*output,
                       text=etree.tostring(tree1.getroot(),
                                           encoding='unicode'))
    return output
Example #39
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the TEI file containing the
        segmentation.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path,
                                                 method))[0] + '.xml'

    ver = tesseract.TessVersion()
    if int(ver.split('.')[0]) < 3 or int(ver.split('.')[1]) < 2:
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. We use fork as the multiprocessing module thinks
    # programmers are too stupid to reap their children.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path)
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        return storage.get_storage_path(output_path)

    api = tesseract.TessBaseAPICreate()
    rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
    if (rc):
        tesseract.TessBaseAPIDelete(api)
        raise NidabaTesseractException('Tesseract initialization failed.')

    # only do segmentation and script detection
    logger.debug('Setting page set mode to 2')
    tesseract.TessBaseAPISetPageSegMode(api, 2)

    logger.debug('Reading {} using leptonica'.format(input_path))
    pix = leptonica.pixRead(input_path.encode('utf-8'))
    logger.debug('Setting PIX as input image')
    tesseract.TessBaseAPISetImage2(api, pix)
    logger.debug('Analyzing page layout')
    it = tesseract.TessBaseAPIAnalyseLayout(api)
    logger.debug('Destroying PIX')
    leptonica.pixDestroy(ctypes.byref(pix))
    x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                      ctypes.c_int())

    w, h = Image.open(input_path).size
    logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
        w, h, *doc))
    tei = OCRRecord()
    tei.dimensions = (w, h)
    tei.img = storage.get_url(*doc)
    tei.title = os.path.basename(doc[1])
    tei.add_respstmt('tesseract', 'page segmentation')

    while True:
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
            tesseract.TessPageIteratorBoundingBox(it, RIL_TEXTLINE,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_line((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new line at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
            tesseract.TessPageIteratorBoundingBox(it, RIL_WORD,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_segment((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new word at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))

        tesseract.TessPageIteratorBoundingBox(it, RIL_SYMBOL, ctypes.byref(x0),
                                              ctypes.byref(y0),
                                              ctypes.byref(x1),
                                              ctypes.byref(y1))
        tei.add_graphemes([{
            'grapheme': '',
            'bbox': (x0.value, y0.value, x1.value, y1.value)
        }])
        logger.debug('Segmenter found new symbol at {} {} {} {}'.format(
            x0.value, y0.value, x1.value, y1.value))
        if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
            logger.debug('No more elements on page')
            break
    logger.debug('Deleting page iterator and base API')
    tesseract.TessPageIteratorDelete(it)
    tesseract.TessBaseAPIEnd(api)
    tesseract.TessBaseAPIDelete(api)
    logger.info('Writing segmentation to {}'.format(output_path))
    with open(output_path, 'w') as fp:
        tei.write_tei(fp)
    logger.info('Quitting child process')
    os._exit(os.EX_OK)
    return storage.get_storage_path(output_path)
Example #40
File: stats.py  Project: amitdo/nidaba
def text_diff_ratio(doc, method=u'text_diff_ratio', ground_truth=None,
                    xml_in=True, gt_format=u'tei', clean_in=True, clean_gt=True,
                    divert=True):
    """
    Calculates the similarity between the input document and a given ground
    truth using Python's difflib.SequenceMatcher. The result is a value
    between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of ground
                                truths to choose from. When more than one is
                                given, the file sharing the longest prefix with
                                the input document is chosen.
        xml_in (bool): Switch to treat input as a TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    logger.debug('Accuracy: {}'.format(sm.ratio()))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(sm.ratio()))
        return output_path
    else:
        return {'diff_ratio': sm.ratio(), 'ground_truth': ground_truth, 'doc': doc}
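The diverted 'diff_ratio' value is just difflib's similarity ratio; a minimal, self-contained sketch of the same computation without the nidaba storage layer:

import difflib

def diff_ratio(text, ground_truth):
    # SequenceMatcher.ratio() is 1.0 for identical strings and 0.0 for
    # strings with nothing in common.
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, ground_truth)
    return sm.ratio()

print(diff_ratio(u'OCR resuIt', u'OCR result'))  # 0.9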
Example #41
0
def setup(*args, **kwargs):
    if kwargs.get(u'implementation'):
        global implementation
        implementation = kwargs.get(u'implementation')
    if kwargs.get(u'tessdata'):
        global tessdata
        if isinstance(kwargs.get(u'tessdata'), list):
            tessdata = storage.get_abs_path(*kwargs.get(u'tessdata'))
        else:
            tessdata = kwargs.get(u'tessdata')
    if implementation == 'direct' and not spawn.find_executable('tesseract'):
        raise NidabaPluginException('No tesseract executable found')
    if implementation == 'capi':
        try:
            global tesseract, leptonica
            tesseract = ctypes.cdll.LoadLibrary('libtesseract.so.3')
            leptonica = ctypes.cdll.LoadLibrary('liblept.so')
        except:
            raise NidabaPluginException(
                'Loading libtesseract/leptonica failed.')

        tesseract.TessBaseAPICreate.restype = POINTER(TessBaseAPI)

        tesseract.TessBaseAPIEnd.argtypes = [POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIEnd.restype = None

        tesseract.TessBaseAPIDelete.argtypes = [POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIDelete.restype = None

        tesseract.TessBaseAPIInit3.argtypes = [
            POINTER(TessBaseAPI), ctypes.c_char_p, ctypes.c_char_p
        ]
        tesseract.TessBaseAPIInit3.restype = ctypes.c_int

        tesseract.TessBaseAPISetImage2.restype = None
        tesseract.TessBaseAPISetImage2.argtypes = [
            POINTER(TessBaseAPI), POINTER(Pix)
        ]

        tesseract.TessBaseAPIRecognize.argtypes = [
            POINTER(TessBaseAPI), POINTER(TessBaseAPI)
        ]
        tesseract.TessBaseAPIRecognize.restype = ctypes.c_int

        tesseract.TessResultIteratorGetUTF8Text.restype = ctypes.c_char_p
        tesseract.TessResultIteratorGetUTF8Text.argtypes = [
            POINTER(TessResultIterator), ctypes.c_int
        ]

        tesseract.TessResultIteratorConfidence.argtypes = [
            POINTER(TessResultIterator), ctypes.c_int
        ]
        tesseract.TessResultIteratorConfidence.restype = ctypes.c_float

        tesseract.TessResultIteratorWordRecognitionLanguage.argtypes = [
            POINTER(TessResultIterator)
        ]
        tesseract.TessResultIteratorWordRecognitionLanguage.restype = ctypes.c_char_p

        tesseract.TessVersion.restype = ctypes.c_char_p

        tesseract.TessBaseAPISetPageSegMode.argtypes = [
            POINTER(TessBaseAPI), ctypes.c_int
        ]
        tesseract.TessBaseAPISetPageSegMode.restype = None

        tesseract.TessBaseAPIProcessPages.argtypes = [
            POINTER(TessBaseAPI), ctypes.c_char_p, ctypes.c_char_p,
            ctypes.c_int,
            POINTER(TessResultRenderer)
        ]
        tesseract.TessBaseAPIProcessPages.restype = ctypes.c_int

        tesseract.TessBaseAPIAnalyseLayout.argtypes = [POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIAnalyseLayout.restype = POINTER(TessPageIterator)

        tesseract.TessPageIteratorIsAtBeginningOf.argtypes = [
            POINTER(TessPageIterator), ctypes.c_int
        ]
        tesseract.TessPageIteratorIsAtBeginningOf.restype = ctypes.c_int

        tesseract.TessPageIteratorBoundingBox.argtypes = [
            POINTER(TessPageIterator), ctypes.c_int,
            POINTER(ctypes.c_int),
            POINTER(ctypes.c_int),
            POINTER(ctypes.c_int),
            POINTER(ctypes.c_int)
        ]
        tesseract.TessPageIteratorBoundingBox.restype = ctypes.c_int

        tesseract.TessBaseAPIGetIterator.argtypes = [POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIGetIterator.restype = POINTER(TessResultIterator)

        tesseract.TessResultIteratorGetPageIterator.argtypes = [
            POINTER(TessResultIterator)
        ]
        tesseract.TessResultIteratorGetPageIterator.restype = POINTER(
            TessPageIterator)

        tesseract.TessResultIteratorNext.argtypes = [
            POINTER(TessResultIterator), ctypes.c_int
        ]
        tesseract.TessResultIteratorNext.restype = ctypes.c_int

        tesseract.TessResultIteratorDelete.argtypes = [
            POINTER(TessResultIterator)
        ]
        tesseract.TessResultIteratorDelete.restype = None

        tesseract.TessPageIteratorDelete.argtypes = [POINTER(TessPageIterator)]
        tesseract.TessPageIteratorDelete.restype = None

        tesseract.TessDeleteText.argtypes = [POINTER(ctypes.c_char)]
        tesseract.TessDeleteText.restype = None

        leptonica.pixRead.argtypes = [ctypes.c_char_p]
        leptonica.pixRead.restype = POINTER(Pix)

        leptonica.pixDestroy.argtypes = [POINTER(POINTER(Pix))]
        leptonica.pixDestroy.restype = None

        tesseract.TessBaseAPIGetHOCRText.argtypes = [
            POINTER(TessBaseAPI), ctypes.c_int
        ]
        tesseract.TessBaseAPIGetHOCRText.restype = POINTER(ctypes.c_char)

        tesseract.TessBaseAPIGetAvailableLanguagesAsVector.argtypes = [
            POINTER(TessBaseAPI)
        ]
        tesseract.TessBaseAPIGetAvailableLanguagesAsVector.restype = POINTER(
            ctypes.c_char_p)

        # fill in available tesseract classifiers as they are only determinable
        # after setting the tessdata directory.
        ocr_tesseract.arg_values['languages'] = _get_available_classifiers()
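_get_available_classifiers() is referenced but not shown in this snippet. A plausible implementation simply enumerates the *.traineddata files below the configured tessdata path (a hypothetical sketch; the exact directory layout depends on the installation):

import glob
import os

def _get_available_classifiers():
    # Tesseract language packs are *.traineddata files; the language code is
    # the file name without its extension. ``tessdata`` is the module-level
    # path set in setup() above.
    files = glob.glob(os.path.join(tessdata, '*.traineddata'))
    return [os.path.splitext(os.path.basename(f))[0] for f in files]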
Example #42
0
File: kraken.py Project: ryanfb/nidaba
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    input_path = storage.get_abs_path(*doc[1])
    output_path = (doc[1][0], os.path.splitext(storage.insert_suffix(doc[1][1],
                                                                     method,
                                                                     model))[0]
                   + '.xml')
    logger.debug('Searching for model {}'.format(model))
    if model in nidaba_cfg['kraken_models']:
        model = storage.get_abs_path(*(nidaba_cfg['kraken_models'][model]))
    elif model in nidaba_cfg['ocropus_models']:
        model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    else:
        raise NidabaInvalidParameterException('Model not defined in '
                                              'configuration')
    img = Image.open(input_path)
    logger.debug('Reading TEI segmentation from {}'.format(doc[1]))
    tei = OCRRecord()
    with storage.StorageFile(*doc[0]) as seg:
        tei.load_tei(seg)

    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines

    logger.debug('Loading model {}'.format(model))
    rnn = models.load_any(model)
    i = 0
    logger.debug('Start recognizing characters')
    for line_id, rec in zip(lines, rpred.rpred(rnn, img, [x['bbox'] for x in lines.itervalues()])):
        # scope the current line and add all graphemes recognized by kraken to
        # it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        i += 1

        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(segment)])
                logger.debug('Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(rec.prediction[line_offset:line_offset+len(segment)]))
                tei.add_graphemes([{'grapheme': x[0], 
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)} for x in rec[line_offset:line_offset+len(segment)]])
                line_offset += len(segment)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(rec.prediction[line_offset:line_offset+len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{'grapheme': x[0], 
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)} for x in rec[line_offset:line_offset+len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path
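max_bbox() merges the per-character cut boxes reported by kraken into one segment box. It is presumably just the union of the input boxes; a sketch under that assumption:

def max_bbox(boxes):
    # Combine several (x0, y0, x1, y1) boxes into their common bounding box.
    x0 = min(b[0] for b in boxes)
    y0 = min(b[1] for b in boxes)
    x1 = max(b[2] for b in boxes)
    y1 = max(b[3] for b in boxes)
    return (x0, y0, x1, y1)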
Example #43
0
def text_diff_ratio(doc,
                    method=u'text_diff_ratio',
                    ground_truth=None,
                    xml_in=True,
                    gt_format=u'tei',
                    clean_in=True,
                    clean_gt=True,
                    divert=True):
    """
    Calculates the similarity between the input document and a given ground
    truth using Python's difflib.SequenceMatcher. The result is a value
    between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of ground
                                truths to choose from. When more than one is
                                given, the file sharing the longest prefix with
                                the input document is chosen.
        xml_in (bool): Switch to treat input as a TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format +
                                                  ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    logger.debug('Accuracy: {}'.format(sm.ratio()))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(sm.ratio()))
        return output_path
    else:
        return {
            'diff_ratio': sm.ratio(),
            'ground_truth': ground_truth,
            'doc': doc
        }
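find_matching() performs the longest-shared-prefix selection described in the docstring. A hypothetical sketch, assuming both the document and each ground truth are (directory, filename) storage tuples:

import os

def find_matching(doc, ground_truths):
    # Pick the ground truth whose file name shares the longest common prefix
    # with the input document's file name.
    name = os.path.basename(doc[1])
    return max(ground_truths,
               key=lambda gt: len(os.path.commonprefix(
                   [name, os.path.basename(gt[1])])))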
Example #44
0
def batch(files, host, preprocessing, binarize, ocr, segmentation, stats,
          postprocessing, output, grayscale, help_tasks):
    """
    Add a new job to the pipeline.
    """

    if host:
        batch = NetworkSimpleBatch(host)
        click.echo(u'Preparing filestore\t\t[', nl=False)
        try:
            batch.create_batch()
        except:
            click.secho(u'\u2717', fg='red', nl=False)
            click.echo(']')
            exit()
        click.secho(u'\u2713', fg='green', nl=False)
        click.echo(']')
        for doc in files:

            def callback(monitor):
                spin(u'Uploading {}'.format(doc))

            batch.add_document(doc, callback)
            click.secho(u'\b\u2713', fg='green', nl=False)
            click.echo('\033[?25h\n', nl=False)
    else:
        from nidaba import storage
        click.echo(u'Preparing filestore\t\t[', nl=False)
        try:
            batch = SimpleBatch()
        except:
            click.secho(u'\u2717', fg='red', nl=False)
            click.echo(']')
            exit()
        for doc in files:
            shutil.copy2(doc,
                         storage.get_abs_path(batch.id, os.path.basename(doc)))
            batch.add_document((batch.id, os.path.basename(doc)))
        click.secho(u'\u2713', fg='green', nl=False)
        click.echo(']')
    click.echo(u'Building batch\t\t\t[', nl=False)
    if not grayscale:
        batch.add_task('img', 'rgb_to_gray')
    if preprocessing:
        for alg in preprocessing:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('img', alg[0], **kwargs)
    if binarize:
        for alg in binarize:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('binarize', alg[0], **kwargs)
    if segmentation:
        for alg in segmentation:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('segmentation', alg[0], **kwargs)
    if ocr:
        for alg in ocr:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('ocr', alg[0], **kwargs)
    if stats:
        for alg in stats:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('stats', alg[0], **kwargs)
    if postprocessing:
        for alg in postprocessing:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('postprocessing', alg[0], **kwargs)
    if output:
        for alg in output:
            for kwargs in alg[1]:
                kwargs = move_to_storage(batch, kwargs)
                batch.add_task('output', alg[0], **kwargs)
    batch.run()
    click.secho(u'\u2713', fg='green', nl=False)
    click.echo(']')
    click.echo(batch.id)
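move_to_storage() is used above to make file arguments available to the workers. A rough, hypothetical sketch of the idea: local paths in the task arguments are copied into the batch's storage area and replaced with storage tuples:

import os
import shutil

def move_to_storage(batch, kwargs):
    # Hypothetical sketch: any argument that points at a local file is copied
    # into the batch's storage directory and rewritten as a storage tuple.
    from nidaba import storage
    out = {}
    for key, value in kwargs.items():
        if isinstance(value, basestring) and os.path.isfile(value):
            dest = os.path.basename(value)
            shutil.copy2(value, storage.get_abs_path(batch.id, dest))
            out[key] = (batch.id, dest)
        else:
            out[key] = value
    return out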
Example #45
0
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path, method))[0] + '.xml'

    ver = tesseract.TessVersion()
    major, minor = (int(x) for x in ver.split('.')[:2])
    # anything older than 3.02 lacks the C API functions used below
    if major < 3 or (major == 3 and minor < 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. We use fork as the multiprocessing module thinks
    # programmers are too stupid to reap their children.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path), doc
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException('Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        return storage.get_storage_path(output_path), doc

    api = tesseract.TessBaseAPICreate()
    rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
    if (rc):
        tesseract.TessBaseAPIDelete(api)
        raise NidabaTesseractException('Tesseract initialization failed.')

    # only perform page layout analysis (PSM 2), no OSD or OCR
    logger.debug('Setting page segmentation mode to 2')
    tesseract.TessBaseAPISetPageSegMode(api, 2)

    logger.debug('Reading {} using leptonica'.format(input_path))
    pix = leptonica.pixRead(input_path.encode('utf-8'))
    logger.debug('Setting PIX as input image')
    tesseract.TessBaseAPISetImage2(api, pix)
    logger.debug('Analyzing page layout')
    it = tesseract.TessBaseAPIAnalyseLayout(api)
    logger.debug('Destroying PIX')
    leptonica.pixDestroy(ctypes.byref(pix))
    x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                      ctypes.c_int())

    w, h = Image.open(input_path).size
    logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(w, h, *doc))
    tei = TEIFacsimile()
    tei.document((w, h), os.path.join(*doc))
    tei.title = os.path.basename(doc[1])
    tei.add_respstmt('tesseract', 'page segmentation')

    while True:
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
            tesseract.TessPageIteratorBoundingBox(it,
                                                  RIL_TEXTLINE,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_line((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new line at {} {} {} {}'.format(x0.value,
                                                                          y0.value,
                                                                          x1.value,
                                                                          y1.value))
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
            tesseract.TessPageIteratorBoundingBox(it,
                                                  RIL_WORD,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_segment((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new word at {} {} {} {}'.format(x0.value,
                                                                          y0.value,
                                                                          x1.value,
                                                                          y1.value))

        tesseract.TessPageIteratorBoundingBox(it,
                                              RIL_SYMBOL,
                                              ctypes.byref(x0),
                                              ctypes.byref(y0),
                                              ctypes.byref(x1),
                                              ctypes.byref(y1))
        tei.add_graphemes([(None, (x0.value, y0.value, x1.value, y1.value))])
        logger.debug('Segmenter found new symbol at {} {} {} {}'.format(x0.value,
                                                                        y0.value,
                                                                        x1.value,
                                                                        y1.value))
        if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
            logger.debug('No more elements on page')
            break
    logger.debug('Deleting page iterator and base API')
    tesseract.TessPageIteratorDelete(it)
    tesseract.TessBaseAPIEnd(api)
    tesseract.TessBaseAPIDelete(api)
    logger.info('Writing segmentation to {}'.format(output_path))
    with open(output_path, 'w') as fp:
        tei.write(fp)
    logger.info('Quitting child process')
    os._exit(os.EX_OK)
    return storage.get_storage_path(output_path), doc
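Callers pass a single storage tuple and receive the segmentation plus the image it was computed on; an illustrative call with made-up file names:

# Illustrative only; job id and file names are placeholders.
seg, img = segmentation_tesseract((u'job-1234', u'page_0001_nlbin.png'))
# seg is roughly (u'job-1234', u'page_0001_nlbin_segment_tesseract.xml')
# img is the unchanged input tuple (u'job-1234', u'page_0001_nlbin.png')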
Example #46
0
def setup(*args, **kwargs):
    if kwargs.get(u'implementation'):
        global implementation
        implementation = kwargs.get(u'implementation')
    if kwargs.get(u'tessdata'):
        global tessdata
        if isinstance(kwargs.get(u'tessdata'), list):
            tessdata = storage.get_abs_path(*kwargs.get(u'tessdata'))
        else:
            tessdata = kwargs.get(u'tessdata')
    if implementation == 'direct' and not spawn.find_executable('tesseract'):
        raise NidabaPluginException('No tesseract executable found')
    if implementation == 'capi':
        try:
            global tesseract, leptonica
            tesseract = ctypes.cdll.LoadLibrary('libtesseract.so.3')
            leptonica = ctypes.cdll.LoadLibrary('liblept.so')
        except:
            raise NidabaPluginException('Loading libtesseract/leptonica failed.')

        tesseract.TessBaseAPICreate.restype = POINTER(TessBaseAPI)

        tesseract.TessBaseAPIEnd.argtypes = [POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIEnd.restype = None

        tesseract.TessBaseAPIDelete.argtypes = [POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIDelete.restype = None

        tesseract.TessBaseAPIInit3.argtypes = [POINTER(TessBaseAPI),
                                               ctypes.c_char_p,
                                               ctypes.c_char_p]
        tesseract.TessBaseAPIInit3.restype = ctypes.c_int

        tesseract.TessBaseAPISetImage2.restype = None
        tesseract.TessBaseAPISetImage2.argtypes = [POINTER(TessBaseAPI),
                                                   POINTER(Pix)]

        tesseract.TessBaseAPIRecognize.argtypes = [POINTER(TessBaseAPI), POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIRecognize.restype = ctypes.c_int

        tesseract.TessResultIteratorGetUTF8Text.restype = ctypes.c_char_p
        tesseract.TessResultIteratorGetUTF8Text.argtypes = [POINTER(TessResultIterator),
                                                            ctypes.c_int]

        tesseract.TessResultIteratorConfidence.argtypes = [POINTER(TessResultIterator),
                                                           ctypes.c_int]
        tesseract.TessResultIteratorConfidence.restype = ctypes.c_float

        tesseract.TessResultIteratorWordRecognitionLanguage.argtypes = [POINTER(TessResultIterator)]
        tesseract.TessResultIteratorWordRecognitionLanguage.restype = ctypes.c_char_p

        tesseract.TessVersion.restype = ctypes.c_char_p

        tesseract.TessBaseAPISetPageSegMode.argtypes = [POINTER(TessBaseAPI),
                                                        ctypes.c_int]
        tesseract.TessBaseAPISetPageSegMode.restype = None

        tesseract.TessBaseAPIProcessPages.argtypes = [POINTER(TessBaseAPI),
                                                      ctypes.c_char_p,
                                                      ctypes.c_char_p,
                                                      ctypes.c_int,
                                                      POINTER(TessResultRenderer)]
        tesseract.TessBaseAPIProcessPages.restype = ctypes.c_int

        tesseract.TessBaseAPIAnalyseLayout.argtypes = [POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIAnalyseLayout.restype = POINTER(TessPageIterator)

        tesseract.TessPageIteratorIsAtBeginningOf.argtypes = [POINTER(TessPageIterator),
                                                              ctypes.c_int]
        tesseract.TessPageIteratorIsAtBeginningOf.restype = ctypes.c_int

        tesseract.TessPageIteratorBoundingBox.argtypes = [POINTER(TessPageIterator),
                                                          ctypes.c_int,
                                                          POINTER(ctypes.c_int),
                                                          POINTER(ctypes.c_int),
                                                          POINTER(ctypes.c_int),
                                                          POINTER(ctypes.c_int)]
        tesseract.TessPageIteratorBoundingBox.restype = ctypes.c_int

        tesseract.TessBaseAPIGetIterator.argtypes = [POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIGetIterator.restype = POINTER(TessResultIterator)

        tesseract.TessResultIteratorGetPageIterator.argtypes = [POINTER(TessResultIterator)]
        tesseract.TessResultIteratorGetPageIterator.restype = POINTER(TessPageIterator)

        tesseract.TessResultIteratorNext.argtypes = [POINTER(TessResultIterator), ctypes.c_int]
        tesseract.TessResultIteratorNext.restype = ctypes.c_int

        tesseract.TessResultIteratorDelete.argtypes = [POINTER(TessResultIterator)]
        tesseract.TessResultIteratorDelete.restype = None

        tesseract.TessPageIteratorDelete.argtypes = [POINTER(TessPageIterator)]
        tesseract.TessPageIteratorDelete.restype = None

        leptonica.pixRead.argtypes = [ctypes.c_char_p]
        leptonica.pixRead.restype = POINTER(Pix)

        leptonica.pixDestroy.argtypes = [POINTER(POINTER(Pix))]
        leptonica.pixDestroy.restype = None

        tesseract.TessBaseAPIGetHOCRText.argtypes = [POINTER(TessBaseAPI), ctypes.c_int]
        tesseract.TessBaseAPIGetHOCRText.restype = ctypes.c_char_p

        tesseract.TessBaseAPIGetAvailableLanguagesAsVector.argtypes = [POINTER(TessBaseAPI)]
        tesseract.TessBaseAPIGetAvailableLanguagesAsVector.restype = POINTER(ctypes.c_char_p)

        # fill in available tesseract classifiers as they are only determinable
        # after setting the tessdata directory.
        ocr_tesseract.arg_values['languages'] = _get_available_classifiers()
Example #47
0
File: cli.py Project: ryanfb/nidaba
def status(verbose, host, job_id):
    """
    Displays the status and results of jobs.
    """
    if host:
        batch = NetworkSimpleBatch(host, job_id)
    else:
        batch = SimpleBatch(job_id)

    state = batch.get_extended_state()

    click.secho('Status:', underline=True, nl=False)
    if not state:
        click.echo(' UNKNOWN')
        return

    bs = 'success'
    done = 0
    running = 0
    pending = 0
    failed = 0
    results = []
    errors = []
    expected = len(state)
    failed_children = set()
    for task_id, subtask in state.iteritems():
        if subtask['state'] == 'SUCCESS':
            done += 1
        elif subtask['state'] == 'RUNNING':
            running += 1
            if bs == 'success':
                bs = 'pending'
        elif subtask['state'] == 'PENDING':
            pending += 1
            if bs == 'success':
                bs = 'pending'
        elif subtask['state'] == 'FAILURE':
            failed += 1
            children = []
            if not isinstance(subtask['children'], list):
                subtask['children'] = [subtask['children']]
            for child in subtask['children']:
                if not isinstance(state[child]['children'], list):
                    state[child]['children'] = [state[child]['children']]
                children.extend(state[child]['children'])
                failed_children.add(child)
            errors.append(subtask)
            bs = 'failed'

        if len(subtask['children']) == 0 and not subtask['housekeeping'] and subtask['result'] is not None:
            # try to find statistics results
            parents = [task_id] + subtask['parents']
            misc = None
            for parent in parents:
                parents.extend(state[parent]['parents'])
                if 'misc' in state[parent]:
                    misc = state[parent]['misc']
                    break
            results.append((subtask['result'], subtask['root_document'], misc))

    final = '(final)' if not expected - failed - done - len(failed_children) else ''
    click.echo(' {} {}\n'.format(bs, final))
    click.echo('{}/{} tasks completed. {} running.\n'.format(done, len(state), running))
    click.secho('Output files:\n', underline=True)
    results = sorted(results, key=lambda x: x[0][1])
    if results and host:
        for doc in results:
            if doc[2] is not None:
                click.echo(u'{} \u2192 {} ({:.1f}% / {})'.format(doc[1], 
                                                                 doc[0],
                                                                 100 *
                                                                 doc[2]['edit_ratio'],
                                                                 doc[2]['ground_truth'][1]))
            else:
                click.echo(u'{} \u2192 {}'.format(doc[1], doc[0]))
    elif results:
        from nidaba import storage
        for doc in results:
            output = click.format_filename(storage.get_abs_path(*doc[0]))
            if doc[2] is not None:
                click.echo(u'{} \u2192 {} ({:.1f}% / {})'.format(doc[1][1], 
                                                                 output,
                                                                 100 *
                                                                 doc[2]['edit_ratio'],
                                                                 doc[2]['ground_truth'][1]))
            else:
                click.echo(u'{} \u2192 {}'.format(doc[1][1], output))
    if errors:
        click.secho('\nErrors:\n', underline=True)
        for task in errors:
            tb = ''
            args = ''
            if verbose > 0:
                tb = task['errors'][2]
            if verbose > 1:
                task['errors'][0].pop('method')
                args = ', ' + str(task['errors'][0])
            click.echo('{0} ({1}{2}): {3}{4}'.format(task['task'][0],
                                                     task['root_document'][1],
                                                     args,
                                                     tb,
                                                     task['errors'][1]))
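The loop above only touches a few keys of the extended state returned by get_extended_state(); the per-task records it expects look roughly as follows (shape inferred from the accesses above, not an exhaustive schema):

state = {
    'task-id-1': {
        'state': 'SUCCESS',                    # SUCCESS/RUNNING/PENDING/FAILURE
        'children': [],                        # downstream task ids
        'parents': ['task-id-0'],              # upstream task ids
        'housekeeping': False,                 # internal tasks are skipped
        'result': ('job-id', 'output.xml'),    # storage tuple of the output
        'root_document': ('job-id', 'page.png'),
        'misc': {'edit_ratio': 0.93,           # present on statistics tasks
                 'ground_truth': ('job-id', 'gt.txt')},
        'errors': None,                        # (kwargs, message, traceback)
        'task': ('stats', 'text_diff_ratio'),
    },
}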
Example #48
0
def status(verbose, host, job_id):
    """
    Displays the status and results of jobs.
    """
    if host:
        batch = NetworkSimpleBatch(host, job_id)
    else:
        batch = SimpleBatch(job_id)

    state = batch.get_extended_state()

    click.secho('Status:', underline=True, nl=False)
    if not state:
        click.echo(' UNKNOWN')
        return

    bs = 'success'
    done = 0
    running = 0
    pending = 0
    failed = 0
    results = []
    errors = []
    expected = len(state)
    failed_children = set()
    for task_id, subtask in state.iteritems():
        if subtask['state'] == 'SUCCESS':
            done += 1
        elif subtask['state'] == 'RUNNING':
            running += 1
            if bs == 'success':
                bs = 'pending'
        elif subtask['state'] == 'PENDING':
            pending += 1
            if bs == 'success':
                bs = 'pending'
        elif subtask['state'] == 'FAILURE':
            failed += 1
            children = []
            if not isinstance(subtask['children'], list):
                subtask['children'] = [subtask['children']]
            for child in subtask['children']:
                if not isinstance(state[child]['children'], list):
                    state[child]['children'] = [state[child]['children']]
                children.extend(state[child]['children'])
                failed_children.add(child)
            errors.append(subtask)
            bs = 'failed'

        if len(subtask['children']) == 0 and not subtask[
                'housekeeping'] and subtask['result'] is not None:
            # try to find statistics results
            parents = [task_id] + subtask['parents']
            misc = None
            for parent in parents:
                parents.extend(state[parent]['parents'])
                if 'misc' in state[parent]:
                    misc = state[parent]['misc']
                    break
            results.append((subtask['result'], subtask['root_document'], misc))

    final = '(final)' if not expected - failed - done - len(
        failed_children) else ''
    click.echo(' {} {}\n'.format(bs, final))
    click.echo('{}/{} tasks completed. {} running.\n'.format(
        done, len(state), running))
    click.secho('Output files:\n', underline=True)
    results = sorted(results, key=lambda x: x[0][1])
    if results and host:
        for doc in results:
            if doc[2] is not None:
                click.echo(u'{} \u2192 {} ({:.1f}% / {})'.format(
                    doc[1], doc[0], 100 * doc[2]['edit_ratio'],
                    doc[2]['ground_truth'][1]))
            else:
                click.echo(u'{} \u2192 {}'.format(doc[1], doc[0]))
    elif results:
        from nidaba import storage
        for doc in results:
            output = click.format_filename(storage.get_abs_path(*doc[0]))
            if doc[2] is not None:
                click.echo(u'{} \u2192 {} ({:.1f}% / {})'.format(
                    doc[1][1], output, 100 * doc[2]['edit_ratio'],
                    doc[2]['ground_truth'][1]))
            else:
                click.echo(u'{} \u2192 {}'.format(doc[1][1], output))
    if errors:
        click.secho('\nErrors:\n', underline=True)
        for task in errors:
            tb = ''
            args = ''
            if verbose > 0:
                tb = task['errors'][2]
            if verbose > 1:
                task['errors'][0].pop('method')
                args = ', ' + str(task['errors'][0])
            click.echo('{0} ({1}{2}): {3}{4}'.format(task['task'][0],
                                                     task['root_document'][1],
                                                     args, tb,
                                                     task['errors'][1]))
Example #49
0
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    output_path = (
        doc[0],
        os.path.splitext(storage.insert_suffix(doc[1], method, model))[0] +
        '.xml')
    logger.debug('Loading model {}'.format(model))
    try:
        rnn = models.load_any(mod_db[model])
    except Exception as e:
        raise NidabaInvalidParameterException(str(e))
    logger.debug('Reading TEI segmentation from {}'.format(doc))
    tei = OCRRecord()
    with storage.StorageFile(*doc) as seg:
        tei.load_tei(seg)

    img = Image.open(
        storage.get_abs_path(*storage.get_storage_path_url(tei.img)))
    if is_bitonal(img):
        img = img.convert('1')
    else:
        raise NidabaInvalidParameterException('Input image is not bitonal')

    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines

    i = 0
    logger.debug('Start recognizing characters')
    for line_id, rec in izip(
            lines,
            rpred.rpred(
                rnn, img, {
                    'text_direction': 'horizontal-tb',
                    'boxes': [list(x['bbox']) for x in lines.itervalues()]
                })):
        # scope the current line and add all graphemes recognized by kraken to
        # it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        i += 1

        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(segment)])
                logger.debug(
                    'Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:line_offset + len(segment)]))
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:line_offset + len(segment)]])
                line_offset += len(segment)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:line_offset + len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:line_offset + len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path
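is_bitonal() guards against handing kraken a grayscale or colour image; a check of this kind only has to verify that the image contains at most two distinct pixel values (a sketch, not necessarily kraken's own implementation):

def is_bitonal(im):
    # PIL's getcolors(2) returns None when an image has more than two
    # distinct pixel values.
    colors = im.getcolors(2)
    return colors is not None and len(colors) <= 2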
Example #50
0
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    input_path = storage.get_abs_path(*doc[1])
    output_path = (
        doc[1][0],
        os.path.splitext(storage.insert_suffix(doc[1][1], method, model))[0] +
        '.xml')
    logger.debug('Searching for model {}'.format(model))
    if model in nidaba_cfg['kraken_models']:
        model = storage.get_abs_path(*(nidaba_cfg['kraken_models'][model]))
    elif model in nidaba_cfg['ocropus_models']:
        model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    else:
        raise NidabaInvalidParameterException('Model not defined in '
                                              'configuration')
    img = Image.open(input_path)
    logger.debug('Reading TEI segmentation from {}'.format(doc[1]))
    tei = TEIFacsimile()
    with storage.StorageFile(*doc[0]) as seg:
        tei.read(seg)

    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines

    logger.debug('Loading model {}'.format(model))
    rnn = models.load_any(model)
    i = 0
    logger.debug('Start recognizing characters')
    for rec in rpred.rpred(rnn, img,
                           [(int(x[0]), int(x[1]), int(x[2]), int(x[3]))
                            for x in lines]):
        # scope the current line and add all graphemes recognized by kraken to
        # it.
        logger.debug('Scoping line {}'.format(lines[i][4]))
        tei.scope_line(lines[i][4])
        i += 1

        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(segment)])
                logger.debug(
                    'Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:line_offset + len(segment)]))
                tei.add_graphemes([
                    (x[0], x[1], int(x[2] * 100))
                    for x in rec[line_offset:line_offset + len(segment)]
                ])
                line_offset += len(segment)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:line_offset + len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([
                    (x[0], x[1], int(x[2] * 100))
                    for x in rec[line_offset:line_offset + len(whitespace)]
                ])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write(fp)
    return output_path
Example #51
0
def status(verbose, host, job_id):
    """
    Displays the status and results of jobs.
    """
    click.secho('Status:', underline=True, nl=False)
    if host:
        batch = NetworkSimpleBatch(host, job_id)
    else:
        try:
            batch = Batch(job_id)
        except NidabaInputException:
            click.echo(' UNKNOWN')
            return

    state = batch.get_extended_state()
    if not state:
        click.echo(' UNKNOWN')
        return

    bs = 'success'
    done = 0
    running = 0
    pending = 0
    failed = 0
    results = []
    errors = []
    expected = len(state)
    for task_id, subtask in state.iteritems():
        if subtask['state'] == 'SUCCESS':
            done += 1
        elif subtask['state'] == 'RUNNING':
            running += 1
            if bs == 'success':
                bs = 'pending'
        elif subtask['state'] == 'PENDING':
            pending += 1
            if bs == 'success':
                bs = 'pending'
        elif subtask['state'] == 'FAILURE':
            errors.append(subtask)
            bs = 'failed'

        # leaf nodes/result extraction
        if len(subtask['children']) == 0 and subtask['result'] is not None:
            # try to find statistics results
            parents = [task_id] + subtask['parents']
            misc = None
            for parent in parents:
                parents.extend(state[parent]['parents'])
                if 'misc' in state[parent]:
                    misc = state[parent]['misc']
                    break
            # archival tasks bunch everything together. do a sort-based matching of input and output tasks
            if isinstance(subtask['result'][0], list) or (host and isinstance(subtask['result'], list)):
                for res, rd in zip(sorted(subtask['result']), sorted(subtask['root_documents'])):
                    if host:
                        res = [res]
                    results.append((res, [rd], misc))
            else:
                if host:
                    subtask['result'] = [subtask['result']]
                results.append((subtask['result'], subtask['root_documents'], misc))

    final = '(final)' if expected - done == 0 else ''
    click.echo(' {} {}\n'.format(bs, final))
    click.echo('{}/{} tasks completed. {} running.\n'.format(done, expected, running))

    # render results
    click.secho('Output files:\n', underline=True)
    results = sorted(results, key=lambda x: x[0][0][1] if isinstance(x[0], list) else x[0][1])
    if results:
        for doc in results:
            if host:
                output = ', '.join(doc[0])
                input = ', '.join(doc[1])
            else:
                from nidaba import storage
                if isinstance(doc[0][0], list):
                    for d in doc:
                        output = ', '.join(click.format_filename(storage.get_abs_path(*d)))
                else:
                    output = click.format_filename(storage.get_abs_path(*doc[0]))
                input = ', '.join(d[1] for d in doc[1])
            if doc[2] is not None:
                click.echo(u'{} \u2192 {} ({:.1f}% / {})'.format(input,
                                                                 output,
                                                                 100 *
                                                                 doc[2]['edit_ratio'],
                                                                 doc[2]['ground_truth'][1]))
            else:
                click.echo(u'{} \u2192 {}'.format(input, output))

    # render errors
    if errors:
        click.secho('\nErrors:\n', underline=True)
        for task in errors:
            tb = ''
            args = ''
            if verbose > 0:
                tb = task['errors'][2]
            if verbose > 1:
                task['errors'][0].pop('method')
                args = ', ' + str(task['errors'][0])
            if host:
                rd = ', '.join(os.path.basename(x) for x in task['root_documents'])
            else:
                rd = ', '.join(os.path.basename(x[1]) for x in task['root_documents'])
            click.echo('{}.{} ({}{}): {}{}'.format(task['task'][0],
                                                   task['task'][1],
                                                   rd,
                                                   args,
                                                   tb,
                                                   task['errors'][1]))
Example #52
0
def merge(docs, lang, output):
    """
    Merges multiple hOCR documents into a single one.

    First bboxes from all documents are roughly matched, then all matching
    bboxes are scored using a spell checker. If no spell checker is available
    all matches will be merged without ranking.

    The matching is naive, i.e. we just grab the first input document and
    assume that all other documents have similar segmentation results. Issues
    like high variance in segmentation, especially at word boundaries, are not
    accounted for.

    Args:
        docs (iterable): A list of storage tuples of input documents
        lang (unicode): A language identifier for the spell checker
        output (tuple): Storage tuple for the result

    Returns:
        tuple: The output storage tuple. Should be the same as ``output``.
    """
    tree1 = etree.parse(storage.get_abs_path(docs[0][0], docs[0][1]))
    lines_1, words_1 = get_hocr_lines_for_tree(tree1)
    sort_words_bbox(words_1)
    other_words = []
    for doc in docs[1:]:
        try:
            tree2 = etree.parse(storage.get_abs_path(doc[0], doc[1]))
            lines_2, words_2 = get_hocr_lines_for_tree(tree2)
            other_words = other_words + words_2
        except Exception as e:
            print e

    sort_words_bbox(other_words)
    positional_lists = []
    positional_list = []
    x = 0

    # Make a list of positional_lists, that is alternatives for a given
    # position, skipping duplicate position-words
    while x < len(other_words):
        try:
            if len(positional_list) == 0:
                positional_list.append(other_words[x])
            else:
                if close_enough(other_words[x - 1].bbox, other_words[x].bbox):
                    # skip if the text is the same, so that we just get unique
                    # texts for this position
                    if not other_words[x - 1].text == other_words[x].text:
                        positional_list.append(other_words[x])
                else:
                    if not x == 0:
                        positional_lists.append(positional_list)
                        positional_list = []
        except IndexError:
            pass
        x = x + 1

    # we now have a list of list of unique words for each position
    # let's select from each the first one that passes spellcheck
    replacement_words = []

    # make a 'replacement_words' list with all of the best, non-zero-scoring
    # suggestions for each place
    for positional_list in positional_lists:
        for word in positional_list:
            word.score = score_word(lang, word.text)
        positional_list.sort(key=attrgetter('score'), reverse=True)
        if positional_list[0].score > 0:
            replacement_words.append(positional_list[0])

    # now replace the originals
    for word in words_1:
        for replacement_word in replacement_words:
            word.score = score_word(lang, word.text)
            if close_enough(word.bbox, replacement_word.bbox) and (
                    word.score < replacement_word.score):
                word.element.text = replacement_word.text

        for positional_list in positional_lists:
            print "##"
            for word in positional_list:
                print word.bbox, word.text

    storage.write_text(*output, text=etree.tostring(tree1.getroot(),
                                                    encoding='unicode'))
    return output
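close_enough() and score_word() carry the rough bbox matching and spell-checker ranking described in the docstring; a hypothetical sketch of both (the tolerance and the spell-checker registry ``spellcheckers`` are assumptions):

def close_enough(bbox1, bbox2, tolerance=20):
    # Two boxes are treated as the same word position if every coordinate
    # differs by less than ``tolerance`` pixels.
    return all(abs(a - b) < tolerance for a, b in zip(bbox1, bbox2))

def score_word(lang, text):
    # 1 if the spell checker for ``lang`` accepts the word, 0 otherwise.
    # Without a dictionary every candidate scores 0 and the original text
    # is kept.
    dictionary = spellcheckers.get(lang)
    if dictionary is None:
        return 0
    return 1 if dictionary.check(text) else 0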