示例#1
0
def ocr_tesseract(doc,
                  method=u'ocr_tesseract',
                  languages=None,
                  extended=False):
    """
    Recognizes the text of a segmented page with tesseract.

    The TEI segmentation stored at ``doc`` is loaded and the bounding box of
    each of its lines is written to a ``.uzn`` file next to the input
    document. Tesseract is then run on the image referenced by the
    segmentation, using the call method selected by the ``implementation``
    name (which is not defined inside this function). hOCR output produced
    by the 'legacy' and 'direct' call methods is converted to TEI and the
    intermediate hOCR file is deleted.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        languages (list): A list of tesseract classifier identifiers. A
                          single string is wrapped into a list. The value is
                          unpacked when building the output path, so the
                          default of None cannot be used as is.
        extended (bool): Switch to enable extended hOCR generation containing
                         character cuts and confidences. It is only handed
                         to the 'capi' call method.

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        NidabaTesseractException: if ``implementation`` is none of 'legacy',
                                  'direct', or 'capi'.
    """
    segmentation = OCRRecord()
    with storage.StorageFile(*doc) as tei_in:
        segmentation.load_tei(tei_in)

    # dump the line boxes of the segmentation into a sibling .uzn file
    uzn_name = splitext(doc[1])[0] + '.uzn'
    with storage.StorageFile(doc[0], uzn_name, mode='wb') as uzn_out:
        zone_writer = UZNWriter(uzn_out)
        for text_line in segmentation.lines.itervalues():
            zone_writer.writerow(*text_line['bbox'])

    image_path = storage.get_abs_path(
        *storage.get_storage_path_url(segmentation.img))
    if isinstance(languages, basestring):
        languages = [languages]
    output_path = storage.insert_suffix(image_path, method, *languages)
    tei_path = output_path + '.xml'

    logger.debug(
        'Invoking tesseract with {} call method'.format(implementation))
    if implementation in ('legacy', 'direct'):
        # both call methods go through ocr_direct and only differ in the
        # extension of the hOCR file that is picked up afterwards
        hocr_ext = '.html' if implementation == 'legacy' else '.hocr'
        result_path = output_path + hocr_ext
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'capi':
        result_path = tei_path
        ocr_capi(image_path, result_path, segmentation, languages, extended)
    else:
        raise NidabaTesseractException('Invalid implementation selected',
                                       implementation)

    if result_path[-4:] != '.xml':
        logger.debug('Converting hOCR ({}) -> TEI ({})'.format(
            result_path, tei_path))
        converted = OCRRecord()
        with open(result_path) as hocr_in:
            converted.load_hocr(hocr_in)
        # the hOCR file is only an intermediate artifact
        os.unlink(result_path)
        with open(tei_path, 'wb') as tei_out:
            converted.write_tei(tei_out)
        result_path = tei_path
    return storage.get_storage_path(result_path)
示例#2
0
def archive_pybossa(doc, method=u'archive_pybossa', name='', description=''):
    """
    Uploads recognition results to a pybossa service for postcorrection.

    One pybossa project is created for the whole call; its title and short
    name are derived from ``name`` and the storage identifier of the first
    document. Every line of every TEI document then becomes one task
    carrying the page image, the page dimensions, the UTF-8 encoded line
    text and the line's bounding box.

    Args:
        doc [(unicode, unicode), ...]: The input document tuples
        method (unicode): The suffix string appended to all output files.
                          It is not used by this function.
        name (str): Name of the pybossa project to create.
        description (str): Description handed to the project creation call.

    Returns:
        The list of input storage tuples.
    """
    logger.debug('Creating pybossa project named {}'.format(name))
    batch_id = doc[0][0]
    proj = pbclient.create_project('{} ({})'.format(name, batch_id),
                                   batch_id, description)
    logger.debug('Creating pybossa tasks for docs {}'.format(doc))
    for d in doc:
        record = tei.OCRRecord()
        with storage.StorageFile(*d, mode='rb') as fp:
            record.load_tei(fp)
            for line in record.lines.itervalues():
                # flatten line -> segments -> graphemes into a single string
                text = u''.join(
                    glyph['grapheme']
                    for segment in line['content'].itervalues()
                    for glyph in segment['content'].itervalues())
                box = line['bbox']
                task = {
                    'image': record.img,
                    'dimensions': record.dimensions,
                    'line_text': text.encode('utf-8'),
                    'bbox': [box[0], box[1], box[2], box[3]]
                }
                pbclient.create_task(proj.id, task)
    return doc
示例#3
0
文件: api.py 项目: mirskiy/nidaba
 def get(self, batch, file):
     """
     Retrieves the file at *file* in batch *batch*.

     ** Request **

     .. sourcecode:: http

         GET /pages/:batch/:path

     ** Response **

     .. sourcecode:: http

         HTTP/1.1 200 OK
         Content-Type: application/octet-stream

         ...

     The content type is guessed from the file name; files whose type
     cannot be guessed are served as ``application/octet-stream``.

     :param batch: batch's unique id
     :type batch: str
     :param file: path to the batch's file
     :type file: path
     :status 200: No error
     :status 404: File not found
     """
     log.debug('routing to pages with URN: {}/{}'.format(batch, file))
     try:
         fp = storage.StorageFile(batch, file, 'rb')
     except Exception:
         # Any failure to open the file is reported as 404. ``Exception``
         # instead of a bare ``except`` keeps SystemExit/KeyboardInterrupt
         # propagating.
         log.debug('File {} not found in {}'.format(file, batch),
                   exc_info=True)
         return {'message': 'File not found'}, 404
     # guess_type() yields None for unknown extensions; fall back to the
     # generic binary type promised in the response documentation.
     mimetype = mimetypes.guess_type(file)[0] or 'application/octet-stream'
     return send_file(fp, mimetype=mimetype)
示例#4
0
    def post(self, batch_id):
        """
        Adds a page (really any type of file) to the batch identified by
        *batch_id*.

        Files are stored one after another. If storing one of them is
        rejected, the request is answered with 403 immediately; files
        stored before the failing one are not removed. A stored file is
        only registered as a document of the batch when the ``auxiliary``
        request argument is ``False``.

        ** Request **

            POST /batch/:batch/pages

        ** Response **

            HTTP/1.1 201 OK

            [
                {
                    "name": "0033.tif",
                    "url": "/pages/63ca3ec7-2592-4c7d-9009-913aac42535d/0033.tif"
                }
            ]

        :form scans: file(s) to add to the batch

        :status 201: file created
        :status 403: file couldn't be created
        :status 404: batch not found
        """
        args = self.parser.parse_args()
        log.debug('Routing to pages {} of {} (POST)'.format(
            [x.filename for x in args['scans']], batch_id))
        try:
            batch = nBatch(batch_id)
        except Exception:
            # Any failure to instantiate the batch is reported as 404.
            # ``Exception`` instead of a bare ``except`` keeps
            # SystemExit/KeyboardInterrupt propagating.
            log.debug('Batch {} not found'.format(batch_id), exc_info=True)
            return {'message': 'Batch Not Found: {}'.format(batch_id)}, 404
        data = []
        for file in args['scans']:
            try:
                fp = storage.StorageFile(batch_id, file.filename, 'wb')
            except NidabaStorageViolationException as e:
                log.debug('Failed to write file {}'.format(file.filename),
                          exc_info=True)
                return {'message': str(e)}, 403
            else:
                with fp:
                    file.save(fp)
                    file.close()
                    # auxiliary files are stored but not added to the batch
                    if args['auxiliary'] is False:
                        log.debug('Adding {}/{} to {}'.format(
                            fp.storage_path[0], fp.storage_path[1], batch_id))
                        batch.add_document(fp.storage_path)
            data.append({
                'name':
                file.filename,
                'url':
                url_for('api.page', batch=batch_id, file=file.filename)
            })
        return data, 201
示例#5
0
文件: output.py 项目: lxj0276/nidaba
def tei2txt(doc, method=u'tei2txt'):
    """
    Writes the text of a TEI facsimile to a plain text file.

    Args:
        doc (unicode, unicode): Storage tuple of the input document
        method (unicode): Suffix inserted into the name of the output file

    Returns:
        (unicode, unicode): Storage tuple of the output document, located in
                            the same storage container as the input.
    """
    with storage.StorageFile(*doc) as source:
        record = OCRRecord()
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        record.load_tei(source)

    target_name = storage.insert_suffix(doc[1], method)
    with storage.StorageFile(doc[0], target_name, 'wb') as sink:
        logger.debug('Writing text to {}'.format(sink.abs_path))
        record.write_text(sink)
    return doc[0], target_name
示例#6
0
文件: output.py 项目: lxj0276/nidaba
def tei2hocr(doc, method=u'tei2hocr'):
    """
    Writes a TEI facsimile out as an hOCR file.

    Args:
        doc (unicode, unicode): Storage tuple of the input document
        method (unicode): Suffix inserted into the name of the output file

    Returns:
        (unicode, unicode): Storage tuple of the output document, located in
                            the same storage container as the input.
    """
    with storage.StorageFile(*doc) as source:
        record = OCRRecord()
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        record.load_tei(source)

    target_name = storage.insert_suffix(doc[1], method)
    with storage.StorageFile(doc[0], target_name, 'wb') as sink:
        logger.debug('Writing hOCR to {}'.format(sink.abs_path))
        record.write_hocr(sink)
    return doc[0], target_name
示例#7
0
文件: output.py 项目: lxj0276/nidaba
def tei2abbyyxml(doc, method=u'abbyyxml'):
    """
    Writes a TEI facsimile out in a format similar to Abbyy FineReader's
    XML output.

    Args:
        doc (unicode, unicode): Storage tuple of the input document
        method (unicode): Suffix inserted into the name of the output file

    Returns:
        (unicode, unicode): Storage tuple of the output document, located in
                            the same storage container as the input.
    """
    with storage.StorageFile(*doc) as source:
        record = OCRRecord()
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        record.load_tei(source)

    target_name = storage.insert_suffix(doc[1], method)
    with storage.StorageFile(doc[0], target_name, 'wb') as sink:
        logger.debug('Writing abbyyxml to {}'.format(sink.abs_path))
        record.write_abbyyxml(sink)
    return doc[0], target_name
示例#8
0
def spell_check(doc,
                method=u'spell_check',
                language=u'',
                filter_punctuation=False):
    """
    Adds spelling suggestions to a TEI XML document.

    Alternative spellings for each segment will be included in a choice tag
    containing a series of corr tags, with the original segment appearing
    beneath a sic element. Correct words, i.e. words appearing verbatim in
    the dictionary, are left untouched.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier defined in the nidaba configuration as
                            a valid dictionary. Its ``dictionary`` and
                            ``deletion_dictionary`` entries are looked up
                            under ``lang_dicts``.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   ``seg``

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method, language,
                                        unicode(filter_punctuation))

    # resolve both dictionaries configured for the requested language
    lang_cfg = nidaba_cfg['lang_dicts'][language]
    dictionary = storage.get_abs_path(*lang_cfg['dictionary'])
    del_dictionary = storage.get_abs_path(*lang_cfg['deletion_dictionary'])

    with storage.StorageFile(*doc) as source:
        logger.debug('Reading TEI ({})'.format(source.abs_path))
        facsimile = TEIFacsimile()
        facsimile.read(source)
        logger.debug('Performing spell check')
        corrected = lex.tei_spellcheck(facsimile, dictionary, del_dictionary,
                                       filter_punctuation)

    output_doc = storage.get_storage_path(output_path)
    with storage.StorageFile(*output_doc, mode='wb') as sink:
        logger.debug('Writing TEI ({})'.format(sink.abs_path))
        corrected.write(sink)
    return output_doc
示例#9
0
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    Every segment of the TEI document is sanitized, stripped of non-word
    characters and looked up in the dictionary configured for ``language``.
    The result is the fraction of segments that were NOT found.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier of a dictionary defined under
                            ``lang_dicts`` in the nidaba configuration.
        divert (bool): If true (the default) nothing is written and a dict
                       with the ratio and the input document is returned.
                       Otherwise the ratio is written to the output file.

    Returns:
        The output path produced by ``storage.insert_suffix`` when
        ``divert`` is false, otherwise a dict with the keys ``edit_ratio``,
        ``ground_truth`` and ``doc``.

    Raises:
        ZeroDivisionError: if the document contains no segments.
    """
    # ``doc`` is a single storage tuple (it is also opened with
    # StorageFile(*doc) below), so it has to be unpacked as a whole;
    # unpacking doc[0] would splat the characters of the storage id.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    cnt = 0
    err_cnt = 0
    # NOTE(review): assumes OCRRecord exposes ``segments`` as a mapping of
    # segment id -> segment, like the ``lines``/``graphemes`` mappings used
    # elsewhere in this file - confirm.
    for segment in tei.segments.itervalues():
        tok = alg.sanitize(''.join(x['grapheme']
                                   for x in segment['content'].itervalues()))
        # strip everything that is not a word character before the lookup
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        if not alg.mmap_bin_search(
                tok, dictionary, entryparser_fn=alg.key_for_single_word):
            err_cnt += 1
    ratio = err_cnt / float(cnt)
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    return {
        'edit_ratio': ratio,
        'ground_truth': '',
        'doc': doc
    }
示例#10
0
def text_rep_confidence(doc, method=u'text_rep_confidence', divert=True):
    """
    Extracts self reported confidence values from input documents.

    The result is the arithmetic mean of the ``confidence`` values of all
    graphemes in the TEI document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        divert (bool): If true (the default) nothing is written and a dict
                       with the mean and the input document is returned.
                       Otherwise the mean is written to the output file.

    Returns:
        The output path produced by ``storage.insert_suffix`` when
        ``divert`` is false, otherwise a dict with the keys ``edit_ratio``,
        ``ground_truth`` and ``doc``.
    """
    # ``doc`` is a single storage tuple (it is also opened with
    # StorageFile(*doc) below), so it has to be unpacked as a whole;
    # unpacking doc[0] would splat the characters of the storage id.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    mean_conf = numpy.mean(
        [x['confidence'] for x in tei.graphemes.itervalues()])
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(mean_conf))
        return output_path
    # the mean is reported under the same key the other statistics use
    return {'edit_ratio': mean_conf, 'ground_truth': '', 'doc': doc}
示例#11
0
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    The TEI segmentation stored at ``doc`` is loaded, its existing segment
    and grapheme boxes are cleared and every line is recognized with the
    model registered under ``model`` in ``mod_db``. Each recognized line is
    split on whitespace; both the words and the whitespace runs between
    them are added as segments together with their graphemes.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string append to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        NidabaInvalidParameterException: if the model cannot be loaded or
                                         the input image is not bitonal.
    """
    output_path = (
        doc[0],
        os.path.splitext(storage.insert_suffix(doc[1], method, model))[0] +
        '.xml')
    logger.debug('Loading model {}'.format(model))
    # The model is loaded exactly once, up front, so that an unknown or
    # broken model is reported before any other work is done.
    try:
        rnn = models.load_any(mod_db[model])
    except Exception as e:
        raise NidabaInvalidParameterException(str(e))
    logger.debug('Reading TEI segmentation from {}'.format(doc))
    tei = OCRRecord()
    with storage.StorageFile(*doc) as seg:
        tei.load_tei(seg)

    img = Image.open(
        storage.get_abs_path(*storage.get_storage_path_url(tei.img)))
    if is_bitonal(img):
        img = img.convert('1')
    else:
        raise NidabaInvalidParameterException('Input image is not bitonal')

    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines

    logger.debug('Start recognizing characters')
    # line ids and line boxes are both taken from ``lines`` so that they
    # pair up with the records yielded by rpred
    for line_id, rec in izip(
            lines,
            rpred.rpred(
                rnn, img, {
                    'text_direction': 'horizontal-tb',
                    'boxes': [list(x['bbox']) for x in lines.itervalues()]
                })):
        # scope the current line and add all graphemes recognized by kraken to
        # it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)

        # the capturing group keeps the whitespace runs, so ``splits``
        # alternates between words (even indices) and whitespace (odd)
        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(segment)])
                logger.debug(
                    'Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:line_offset + len(segment)]))
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:line_offset + len(segment)]])
                line_offset += len(segment)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:line_offset + len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:line_offset + len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path
示例#12
0
def text_diff_ratio(doc,
                    method=u'text_diff_ratio',
                    ground_truth=None,
                    xml_in=True,
                    gt_format=u'tei',
                    clean_in=True,
                    clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the algorithm of python's difflib SequenceMatcher. The result is a
    value between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of ground
                                truths to choose from. When more than one is
                                given, the file sharing the longest prefix with
                                the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        The output path produced by ``storage.insert_suffix`` when
        ``divert`` is false, otherwise a dict with the keys ``diff_ratio``,
        ``ground_truth`` and ``doc``.

    Raises:
        NidabaInvalidParameterException: if no ground truth is given or
                                         ``gt_format`` is unknown.
    """
    if not ground_truth:
        # fail with a meaningful error instead of a TypeError/IndexError on
        # the subscript below
        raise NidabaInvalidParameterException('No ground truth given.')
    # ``doc`` is a single storage tuple (it is also opened with
    # StorageFile(*doc) below), so it has to be unpacked as a whole;
    # unpacking doc[0] would splat the characters of the storage id.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    # a first element that is not a string means a list of candidate tuples
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            # NOTE(review): if ``html`` is lxml.html, parse() returns an
            # ElementTree, which has no text_content() - confirm whether
            # .getroot() is needed here.
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format +
                                                  ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    # compute the ratio once and reuse it for logging and output
    ratio = sm.ratio()
    logger.debug('Accuracy: {}'.format(ratio))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    return {
        'diff_ratio': ratio,
        'ground_truth': ground_truth,
        'doc': doc
    }
示例#13
0
文件: output.py 项目: lxj0276/nidaba
def tei_metadata(doc, method=u'metadata', metadata=None, validate=False):
    """
    Enriches a TEI-XML document with various metadata from an user-supplied
    YAML file.

    The following fields may be contained in the metadata file with the bolded
    subset mandatory for a valid TEI-XML file. They are grouped by their place
    in the header. Unknown fields are ignored and input is escaped as to
    disable injection.

    Some elements may also be extended by increasing their arity, the second
    value is then usually used as a global identifier/locator, i.e. an URL or
    authority control ID.

    titleStmt:

        * ``title``: Title of the resource
        * author: Name of the author of the resource (may be extended)
        * editor: Name of the editor, compiler, translator, etc. of the
                  resource (may be extended)
        * funder: Institution responsible for the funding of the text (may be
                  extended)
        * principal: PI responsible for the creation of the text (may be
                     extended)
        * sponsor: Name of the sponsoring institution (may be extended)
        * meeting: Conference/meeting resulting in the text (may be extended)

    editionStmt:

        * edition: Peculiarities to the underlying edition of the text

    publicationStmt:

        * ``licence``: Licence of the content (may be extended)
        * ``publisher``: Person or agency responsible for the publication of
                     the text (may be extended)
        * distributor: Person or agency responsible for the text's
                       distribution (may be extended)
        * authority: Authority responsible for making the work available
        * idno: Identifier of the publication (may be extended with the type of
                identifier)
        * pub_place: Place of publication
        * date: Date of publication

    seriesStmt:

        * series_title: Title of the series to which the publication belongs

    notesStmt:

        * note: Misc. notes about the text

    sourceDesc:

        * ``source_desc``: Description of the source document

    other:

        * lang: Abbreviation of the language used in the header

    There is a sample file from the OpenPhilology project in the example
    directory.

    Args:
        doc (unicode, unicode): Storage tuple of the input document
        method (unicode): The suffix string inserted into the name of the
                          output file, together with the name of the
                          metadata file.
        metadata (unicode, unicode): Storage tuple of the metadata YAML file
        validate (bool): Switch to enable validation of the output.
                         Validation is not implemented, so enabling it
                         always raises.

    Returns:
        (unicode, unicode): Storage tuple of the output document

    Raises:
        NidabaTEIException if the resulting document is not TEI compatible and
        validation is enabled.
    """
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        tei.load_tei(fp)
    logger.debug('Reading metadata file ({}/{})'.format(*metadata))
    with storage.StorageFile(*metadata) as fp:
        # safe_load() returns None for an empty file; treat that as "no
        # fields" instead of failing on the membership test below
        meta = yaml.safe_load(fp) or {}
    # only fields known to the record are copied, everything else is ignored
    for field in tei.fields:
        if field in meta:
            logger.debug('Adding field {} ({})'.format(field, meta[field]))
            setattr(tei, field, meta[field])
    if validate:
        raise NidabaTEIException('Validation not yet implemented.')
    output_path = storage.insert_suffix(doc[1], method, metadata[1])
    with storage.StorageFile(doc[0], output_path, 'wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return (doc[0], output_path)
示例#14
0
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Recognizes the lines of a segmented page with kraken and writes the
    result to a TEI file.

    ``doc`` is indexed as a pair of storage tuples: ``doc[0]`` is read as
    the TEI segmentation and ``doc[1]`` is opened as the image. Existing
    segment and grapheme boxes of the segmentation are cleared. Each
    recognized line is split on whitespace; both the words and the
    whitespace runs between them are added as segments together with their
    graphemes.

    Args:
        doc: Pair of storage tuples (TEI segmentation, image)
        method (unicode): The suffix string append to all output files
        model (unicode): Identifier for the font model to use. It is looked
                         up in the ``kraken_models`` and then the
                         ``ocropus_models`` section of the configuration.

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        NidabaInvalidParameterException: if ``model`` is in neither
                                         configuration section.
    """
    image_doc = doc[1]
    input_path = storage.get_abs_path(*image_doc)
    suffixed = storage.insert_suffix(image_doc[1], method, model)
    output_path = (image_doc[0], os.path.splitext(suffixed)[0] + '.xml')

    logger.debug('Searching for model {}'.format(model))
    # first registry containing the identifier wins
    for registry in ('kraken_models', 'ocropus_models'):
        if model in nidaba_cfg[registry]:
            model = storage.get_abs_path(*(nidaba_cfg[registry][model]))
            break
    else:
        raise NidabaInvalidParameterException('Model not defined in '
                                              'configuration')

    img = Image.open(input_path)
    logger.debug('Reading TEI segmentation from {}'.format(doc[1]))
    tei = TEIFacsimile()
    with storage.StorageFile(*doc[0]) as seg:
        tei.read(seg)

    logger.debug('Clearing out word/grapheme boxes')
    # kraken recognizes whole lines, so old word/grapheme boxes are dropped
    tei.clear_graphemes()
    tei.clear_segments()
    # register kraken as responsible for everything added from here on
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines

    logger.debug('Loading model {}'.format(model))
    rnn = models.load_any(model)
    # the first four entries of each line are handed to kraken as its box
    boxes = [(int(x[0]), int(x[1]), int(x[2]), int(x[3])) for x in lines]
    logger.debug('Start recognizing characters')
    for idx, rec in enumerate(rpred.rpred(rnn, img, boxes)):
        # records are matched to lines by position; the fifth entry of a
        # line is what gets scoped
        logger.debug('Scoping line {}'.format(lines[idx][4]))
        tei.scope_line(lines[idx][4])

        # the capturing group keeps the whitespace runs, so ``parts``
        # alternates between words (even indices) and whitespace (odd)
        parts = regex.split(u'(\s+)', rec.prediction)
        offset = 0
        for word, gap in izip_longest(parts[0::2], parts[1::2]):
            if len(word):
                end = offset + len(word)
                seg_bbox = max_bbox(rec.cuts[offset:end])
                logger.debug(
                    'Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[offset:end]))
                tei.add_graphemes([(x[0], x[1], int(x[2] * 100))
                                   for x in rec[offset:end]])
                offset = end
            if gap:
                end = offset + len(gap)
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[offset:end]))
                seg_bbox = max_bbox(rec.cuts[offset:end])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([(x[0], x[1], int(x[2] * 100))
                                   for x in rec[offset:end]])
                offset = end
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write(fp)
    return output_path