def ocr_tesseract(doc, method=u'ocr_tesseract', languages=None, extended=False):
    """
    Runs tesseract on an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        languages (list): A list of tesseract classifier identifiers
        extended (bool): Switch to enable extended hOCR generation containing
                         character cuts and confidences. Has no effect when
                         direct or legacy implementation is used.

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    image_path = storage.get_abs_path(*doc[1])

    # rewrite the segmentation file to lines in UZN format
    logger.debug('Rewriting TEI ({}) -> UZN ({})'.format(
        doc[0][1], splitext(doc[1][1])[0] + '.uzn'))
    seg = TEIFacsimile()
    with storage.StorageFile(*doc[0]) as fp:
        seg.read(fp)
    with storage.StorageFile(doc[1][0], splitext(doc[1][1])[0] + '.uzn',
                             mode='wb') as fp:
        uzn = UZNWriter(fp)
        for line in seg.lines:
            uzn.writerow(*line[:4])

    # normalize the languages argument: accept a single identifier, a list,
    # or None. The documented default of None previously crashed with a
    # TypeError at the unpacking below.
    if isinstance(languages, basestring):
        languages = [languages]
    elif languages is None:
        languages = []
    output_path = storage.insert_suffix(image_path, method, *languages)

    logger.debug(
        'Invoking tesseract with {} call method'.format(implementation))
    if implementation == 'legacy':
        result_path = output_path + '.html'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'direct':
        result_path = output_path + '.hocr'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'capi':
        result_path = output_path + '.xml'
        ocr_capi(image_path, result_path, seg, languages, extended)
    else:
        raise NidabaTesseractException('Invalid implementation selected',
                                       implementation)

    # legacy/direct implementations emit hOCR which has to be converted to
    # TEI before returning
    if not result_path.endswith('.xml'):
        logger.debug('Converting hOCR ({}) -> TEI ({})'.format(
            result_path, output_path + '.xml'))
        tei = TEIFacsimile()
        with open(result_path) as fp:
            tei.load_hocr(fp)
        os.unlink(result_path)
        with open(output_path + '.xml', 'wb') as fp:
            tei.write(fp)
        result_path = output_path + '.xml'
    return storage.get_storage_path(result_path)
def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        black_colseps (bool): Assume black column separator instead of white
                              ones.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.
    """
    input_path = storage.get_abs_path(*doc)
    output_path, ext = os.path.splitext(
        storage.insert_suffix(input_path, method))
    logger.debug('Copying input image {} to {}'.format(input_path,
                                                       output_path))
    shutil.copy2(input_path, output_path + ext)
    logger.debug('Reading image using PIL')
    img = Image.open(input_path)
    with open(output_path + '.xml', 'w') as fp:
        logger.debug('Initializing TEI with {} ({} {})'.format(
            doc[1], *img.size))
        tei = TEIFacsimile()
        tei.document(img.size, os.path.join(*doc))
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('kraken', 'page segmentation')
        for seg in pageseg.segment(img, black_colseps):
            logger.debug('Found line at {} {} {} {}'.format(*seg))
            tei.add_line(seg)
        # plain file objects have no abs_path attribute (that's a
        # StorageFile API); use the file object's name instead
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write(fp)
    return (storage.get_storage_path(output_path + '.xml'),
            storage.get_storage_path(output_path + ext))
def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        black_colseps (bool): Assume black column separator instead of white
                              ones.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.
    """
    input_path = storage.get_abs_path(*doc)
    output_path, ext = os.path.splitext(
        storage.insert_suffix(input_path, method))
    logger.debug('Copying input image {} to {}'.format(input_path,
                                                       output_path))
    shutil.copy2(input_path, output_path + ext)
    logger.debug('Reading image using PIL')
    img = Image.open(input_path)
    with open(output_path + '.xml', 'w') as fp:
        logger.debug('Initializing TEI with {} ({} {})'.format(
            doc[1], *img.size))
        tei = TEIFacsimile()
        tei.document(img.size, os.path.join(*doc))
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('kraken', 'page segmentation')
        for seg in pageseg.segment(img, black_colseps):
            logger.debug('Found line at {} {} {} {}'.format(*seg))
            tei.add_line(seg)
        # plain file objects have no abs_path attribute (that's a
        # StorageFile API); use the file object's name instead
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write(fp)
    return (storage.get_storage_path(output_path + '.xml'),
            storage.get_storage_path(output_path + ext))
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path,
                                                 method))[0] + '.xml'

    ver = tesseract.TessVersion()
    # require libtesseract >= 3.2. The original component-wise check
    # (major < 3 or minor < 2) wrongly rejected newer versions such as 4.0
    # or 4.1 because their minor component is below 2.
    major, minor = (int(x) for x in ver.split('.')[:2])
    if (major, minor) < (3, 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. We use fork as the multiprocessing module thinks
    # programmers are too stupid to reap their children.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        # parent: wait for the child and report its fate
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path), doc
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        return storage.get_storage_path(output_path), doc

    # child: everything below runs in the forked process and exits via
    # os._exit() so the worker never returns twice
    api = tesseract.TessBaseAPICreate()
    rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
    if (rc):
        tesseract.TessBaseAPIDelete(api)
        raise NidabaTesseractException('Tesseract initialization failed.')

    # only do segmentation and script detection
    logger.debug('Setting page set mode to 2')
    tesseract.TessBaseAPISetPageSegMode(api, 2)

    logger.debug('Reading {} using leptonica'.format(input_path))
    pix = leptonica.pixRead(input_path.encode('utf-8'))
    logger.debug('Setting PIX as input image')
    tesseract.TessBaseAPISetImage2(api, pix)
    logger.debug('Analyzing page layout')
    it = tesseract.TessBaseAPIAnalyseLayout(api)
    logger.debug('Destroying PIX')
    leptonica.pixDestroy(ctypes.byref(pix))
    # out-parameters reused for every bounding box query below
    x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                      ctypes.c_int())

    w, h = Image.open(input_path).size
    logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
        w, h, *doc))
    tei = TEIFacsimile()
    tei.document((w, h), os.path.join(*doc))
    tei.title = os.path.basename(doc[1])
    tei.add_respstmt('tesseract', 'page segmentation')

    # walk the page iterator at symbol granularity, opening a new line/word
    # whenever the iterator reports the start of one
    while True:
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
            tesseract.TessPageIteratorBoundingBox(it, RIL_TEXTLINE,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_line((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new line at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
            tesseract.TessPageIteratorBoundingBox(it, RIL_WORD,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_segment((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new word at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))

        tesseract.TessPageIteratorBoundingBox(it, RIL_SYMBOL,
                                              ctypes.byref(x0),
                                              ctypes.byref(y0),
                                              ctypes.byref(x1),
                                              ctypes.byref(y1))
        tei.add_graphemes([(None, (x0.value, y0.value, x1.value, y1.value))])
        logger.debug('Segmenter found new symbol at {} {} {} {}'.format(
            x0.value, y0.value, x1.value, y1.value))
        if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
            logger.debug('No more elements on page')
            break
    logger.debug('Deleting page iterator and base API')
    tesseract.TessPageIteratorDelete(it)
    tesseract.TessBaseAPIEnd(api)
    tesseract.TessBaseAPIDelete(api)
    logger.info('Writing segmentation to {}'.format(output_path))
    with open(output_path, 'w') as fp:
        tei.write(fp)
    logger.info('Quitting child process')
    os._exit(os.EX_OK)
    return storage.get_storage_path(output_path), doc
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    input_path = storage.get_abs_path(*doc[1])
    # output file: same storage prefix as the input image, with the
    # method/model suffixes inserted and the extension replaced by .xml
    output_path = (
        doc[1][0],
        os.path.splitext(storage.insert_suffix(doc[1][1], method,
                                               model))[0] + '.xml')
    logger.debug('Searching for model {}'.format(model))
    # resolve the model identifier against both model sections of the global
    # configuration; kraken models take precedence over ocropus ones
    if model in nidaba_cfg['kraken_models']:
        model = storage.get_abs_path(*(nidaba_cfg['kraken_models'][model]))
    elif model in nidaba_cfg['ocropus_models']:
        model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    else:
        raise NidabaInvalidParameterException('Model not defined in '
                                              'configuration')
    img = Image.open(input_path)
    logger.debug('Reading TEI segmentation from {}'.format(doc[1]))
    tei = TEIFacsimile()
    with storage.StorageFile(*doc[0]) as seg:
        tei.read(seg)
    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer: drop any finer-grained segmentation from
    # the input TEI before adding our own words/graphemes below
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines
    logger.debug('Loading model {}'.format(model))
    rnn = models.load_any(model)
    i = 0
    logger.debug('Start recognizing characters')
    # rpred yields one recognition record per input line box, in order, so
    # `i` tracks which TEI line the current record belongs to
    for rec in rpred.rpred(rnn, img, [(int(x[0]), int(x[1]), int(x[2]),
                                       int(x[3])) for x in lines]):
        # scope the current line and add all graphemes recognized by kraken to
        # it.
        logger.debug('Scoping line {}'.format(lines[i][4]))
        tei.scope_line(lines[i][4])
        i += 1
        # split the prediction into alternating text/whitespace runs; the
        # captured separators land at the odd indices of `splits`, so
        # splits[0::2] are the words and splits[1::2] the whitespace between
        splits = regex.split(u'(\s+)', rec.prediction)
        # character index into the prediction/cut sequence of this line
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                # bounding box enclosing the character cuts of this word
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(segment)])
                logger.debug(
                    'Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:line_offset + len(segment)]))
                # indexing rec appears to yield (grapheme, cut, confidence)
                # triples with confidence in [0, 1], rescaled here to percent
                # — NOTE(review): confirm against kraken's record type
                tei.add_graphemes([
                    (x[0], x[1], int(x[2] * 100))
                    for x in rec[line_offset:line_offset + len(segment)]
                ])
                line_offset += len(segment)
            # whitespace is None when the line ends on a word (izip_longest
            # fill value)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:line_offset + len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset +
                                             len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([
                    (x[0], x[1], int(x[2] * 100))
                    for x in rec[line_offset:line_offset + len(whitespace)]
                ])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write(fp)
    return output_path
def tei_metadata(doc, method=u'metadata', metadata=None, validate=False):
    """
    Enriches a TEI-XML document with metadata from a user-supplied YAML file.

    The metadata file may contain the fields below, grouped by their place in
    the TEI header; the bolded subset is mandatory for a valid TEI-XML file.
    Unknown fields are ignored and input is escaped to disable injection.
    Some elements may be extended by increasing their arity, the second value
    then usually acting as a global identifier/locator, i.e. an URL or
    authority control ID.

    titleStmt:
        * ``title``: Title of the resource
        * author, editor, funder, principal, sponsor, meeting: persons and
          institutions associated with the resource (each may be extended)
    editionStmt:
        * edition: Peculiarities of the underlying edition of the text
    publicationStmt:
        * ``licence``: Licence of the content (may be extended)
        * ``publisher``: Person or agency responsible for the publication
          (may be extended)
        * distributor, authority: further agents of distribution/availability
        * idno: Identifier of the publication (extendable with its type)
        * pub_place, date: place and date of publication
    seriesStmt:
        * series_title: Title of the series the publication belongs to
    notesStmt:
        * note: Misc. notes about the text
    sourceDesc:
        * ``source_desc``: Description of the source document
    other:
        * lang: Abbreviation of the language used in the header

    A sample file from the OpenPhilology project can be found in the example
    directory.

    Args:
        doc (unicode, unicode): Storage tuple of the input document
        method (unicode): Suffix appended to the output file name
        metadata (unicode, unicode): Storage tuple of the metadata YAML file
        validate (bool): Validate the result (not yet implemented)

    Returns:
        (unicode, unicode): Storage tuple of the output document

    Raises:
        NidabaTEIException if the resulting document is not TEI compatible and
        validation is enabled.
    """
    facsimile = TEIFacsimile()
    with storage.StorageFile(*doc) as infile:
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        facsimile.read(infile)
    logger.debug('Reading metadata file ({}/{})'.format(*metadata))
    with storage.StorageFile(*metadata) as metafile:
        meta = yaml.safe_load(metafile)

    # copy every recognized field from the YAML mapping onto the facsimile
    for attr in facsimile.fields:
        if attr not in meta:
            continue
        logger.debug('Adding field {} ({})'.format(attr, meta[attr]))
        setattr(facsimile, attr, meta[attr])

    if validate:
        raise NidabaTEIException('Validation not yet implemented.')

    output_path = storage.insert_suffix(doc[1], method, metadata[1])
    with storage.StorageFile(doc[0], output_path, 'wb') as outfile:
        logger.debug('Writing TEI to {}'.format(outfile.abs_path))
        facsimile.write(outfile)
    return (doc[0], output_path)
def ocr(image_path, segmentation_path, output_path, model_path):
    """
    Scan a single image with ocropus.

    Reads a single image file from ```imagepath``` and writes the recognized
    text as a TEI document into output_path.

    Args:
        image_path (unicode): Path of the input file
        segmentation_path (unicode): Path of the segmentation XML file.
        output_path (unicode): Path of the output file
        model_path (unicode): Path of the recognition model. Must be a pyrnn.gz
                              pickle dump interoperable with ocropus-rpred.

    Returns:
        (unicode): A string of the output file that is actually written. As
                   Ocropus rewrites output file paths without notice it may be
                   different from the ```outputfilepath``` argument.

    Raises:
        NidabaOcropusException: Ocropus somehow failed. The error output is
                                contained in the message but as it is de facto
                                unusable as a library it's impossible to
                                deduct the nature of the problem.
    """
    try:
        logger.debug('Loading pyrnn from {}'.format(model_path))
        network = ocrolib.load_object(model_path, verbose=0)
        lnorm = getattr(network, "lnorm")
    except Exception as e:
        # not every exception type carries a ``msg`` attribute; str() is
        # always safe
        raise NidabaOcropusException('Something somewhere broke: ' + str(e))
    im = Image.open(image_path)

    logger.debug('Loading TEI segmentation {}'.format(segmentation_path))
    tei = TEIFacsimile()
    with open(segmentation_path, 'r') as seg_fp:
        tei.read(seg_fp)

    logger.debug('Clearing out word/grapheme boxes')
    # ocropus is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('ocropus', 'character recognition')

    for box in tei.lines:
        logger.debug('Recognizing line {}'.format(box[4]))
        ib = tuple(int(x) for x in box[:-2])
        line = ocrolib.pil2array(im.crop(ib))
        # invert and normalize intensities before line normalization
        temp = np.amax(line) - line
        temp = temp * 1.0 / np.amax(temp)
        lnorm.measure(temp)
        line = lnorm.normalize(line, cval=np.amax(line))
        if line.ndim == 3:
            # collapse color channels to grayscale; the original computed
            # np.mean() but discarded the result, leaving the 3-channel array
            line = np.mean(line, 2)
        line = ocrolib.lstm.prepare_line(line, 16)
        pred = network.predictString(line)
        pred = ocrolib.normalize_text(pred)
        logger.debug('Scoping line {}'.format(box[4]))
        tei.scope_line(box[4])
        logger.debug('Adding graphemes: {}'.format(pred))
        tei.add_graphemes(pred)
    with open(output_path, 'wb') as fp:
        # plain file objects have no abs_path attribute; log the known path
        logger.debug('Writing TEI to {}'.format(output_path))
        tei.write(fp)
    return output_path
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    input_path = storage.get_abs_path(*doc[1])
    # output file: same storage prefix as the input image, with the
    # method/model suffixes inserted and the extension replaced by .xml
    output_path = (doc[1][0],
                   os.path.splitext(storage.insert_suffix(doc[1][1], method,
                                                          model))[0] + '.xml')
    logger.debug('Searching for model {}'.format(model))
    # resolve the model identifier against both model sections of the global
    # configuration; kraken models take precedence over ocropus ones
    if model in nidaba_cfg['kraken_models']:
        model = storage.get_abs_path(*(nidaba_cfg['kraken_models'][model]))
    elif model in nidaba_cfg['ocropus_models']:
        model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    else:
        raise NidabaInvalidParameterException('Model not defined in '
                                              'configuration')
    img = Image.open(input_path)
    logger.debug('Reading TEI segmentation from {}'.format(doc[1]))
    tei = TEIFacsimile()
    with storage.StorageFile(*doc[0]) as seg:
        tei.read(seg)
    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer: drop any finer-grained segmentation from
    # the input TEI before adding our own words/graphemes below
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines
    logger.debug('Loading model {}'.format(model))
    rnn = models.load_any(model)
    i = 0
    logger.debug('Start recognizing characters')
    # rpred yields one recognition record per input line box, in order, so
    # `i` tracks which TEI line the current record belongs to
    for rec in rpred.rpred(rnn, img, [(int(x[0]), int(x[1]), int(x[2]),
                                       int(x[3])) for x in lines]):
        # scope the current line and add all graphemes recognized by kraken to
        # it.
        logger.debug('Scoping line {}'.format(lines[i][4]))
        tei.scope_line(lines[i][4])
        i += 1
        # split the prediction into alternating text/whitespace runs; the
        # captured separators land at the odd indices of `splits`, so
        # splits[0::2] are the words and splits[1::2] the whitespace between
        splits = regex.split(u'(\s+)', rec.prediction)
        # character index into the prediction/cut sequence of this line
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                # bounding box enclosing the character cuts of this word
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(segment)])
                logger.debug('Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(rec.prediction[line_offset:line_offset+len(segment)]))
                # indexing rec appears to yield (grapheme, cut, confidence)
                # triples with confidence in [0, 1], rescaled here to percent
                # — NOTE(review): confirm against kraken's record type
                tei.add_graphemes([(x[0], x[1], int(x[2] * 100)) for x in rec[line_offset:line_offset+len(segment)]])
                line_offset += len(segment)
            # whitespace is None when the line ends on a word (izip_longest
            # fill value)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(rec.prediction[line_offset:line_offset+len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([(x[0], x[1], int(x[2] * 100)) for x in rec[line_offset:line_offset+len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write(fp)
    return output_path
def ocr_tesseract(doc, method=u'ocr_tesseract', languages=None, extended=False):
    """
    Runs tesseract on an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        languages (list): A list of tesseract classifier identifiers
        extended (bool): Switch to enable extended hOCR generation containing
                         character cuts and confidences. Has no effect when
                         direct or legacy implementation is used.

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    image_path = storage.get_abs_path(*doc[1])

    # rewrite the segmentation file to lines in UZN format
    logger.debug('Rewriting TEI ({}) -> UZN ({})'.format(
        doc[0][1], splitext(doc[1][1])[0] + '.uzn'))
    seg = TEIFacsimile()
    with storage.StorageFile(*doc[0]) as fp:
        seg.read(fp)
    with storage.StorageFile(doc[1][0], splitext(doc[1][1])[0] + '.uzn',
                             mode='wb') as fp:
        uzn = UZNWriter(fp)
        for line in seg.lines:
            uzn.writerow(*line[:4])

    # normalize the languages argument: accept a single identifier, a list,
    # or None. The documented default of None previously crashed with a
    # TypeError at the unpacking below.
    if isinstance(languages, basestring):
        languages = [languages]
    elif languages is None:
        languages = []
    output_path = storage.insert_suffix(image_path, method, *languages)

    logger.debug(
        'Invoking tesseract with {} call method'.format(implementation))
    if implementation == 'legacy':
        result_path = output_path + '.html'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'direct':
        result_path = output_path + '.hocr'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'capi':
        result_path = output_path + '.xml'
        ocr_capi(image_path, result_path, seg, languages, extended)
    else:
        raise NidabaTesseractException('Invalid implementation selected',
                                       implementation)

    # legacy/direct implementations emit hOCR which has to be converted to
    # TEI before returning
    if not result_path.endswith('.xml'):
        logger.debug('Converting hOCR ({}) -> TEI ({})'.format(
            result_path, output_path + '.xml'))
        tei = TEIFacsimile()
        with open(result_path) as fp:
            tei.load_hocr(fp)
        os.unlink(result_path)
        with open(output_path + '.xml', 'wb') as fp:
            tei.write(fp)
        result_path = output_path + '.xml'
    return storage.get_storage_path(result_path)
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path,
                                                 method))[0] + '.xml'

    ver = tesseract.TessVersion()
    # require libtesseract >= 3.2. The original component-wise check
    # (major < 3 or minor < 2) wrongly rejected newer versions such as 4.0
    # or 4.1 because their minor component is below 2.
    major, minor = (int(x) for x in ver.split('.')[:2])
    if (major, minor) < (3, 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. We use fork as the multiprocessing module thinks
    # programmers are too stupid to reap their children.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        # parent: wait for the child and report its fate
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path), doc
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        return storage.get_storage_path(output_path), doc

    # child: everything below runs in the forked process and exits via
    # os._exit() so the worker never returns twice
    api = tesseract.TessBaseAPICreate()
    rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
    if (rc):
        tesseract.TessBaseAPIDelete(api)
        raise NidabaTesseractException('Tesseract initialization failed.')

    # only do segmentation and script detection
    logger.debug('Setting page set mode to 2')
    tesseract.TessBaseAPISetPageSegMode(api, 2)

    logger.debug('Reading {} using leptonica'.format(input_path))
    pix = leptonica.pixRead(input_path.encode('utf-8'))
    logger.debug('Setting PIX as input image')
    tesseract.TessBaseAPISetImage2(api, pix)
    logger.debug('Analyzing page layout')
    it = tesseract.TessBaseAPIAnalyseLayout(api)
    logger.debug('Destroying PIX')
    leptonica.pixDestroy(ctypes.byref(pix))
    # out-parameters reused for every bounding box query below
    x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                      ctypes.c_int())

    w, h = Image.open(input_path).size
    logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
        w, h, *doc))
    tei = TEIFacsimile()
    tei.document((w, h), os.path.join(*doc))
    tei.title = os.path.basename(doc[1])
    tei.add_respstmt('tesseract', 'page segmentation')

    # walk the page iterator at symbol granularity, opening a new line/word
    # whenever the iterator reports the start of one
    while True:
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
            tesseract.TessPageIteratorBoundingBox(it, RIL_TEXTLINE,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_line((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new line at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
            tesseract.TessPageIteratorBoundingBox(it, RIL_WORD,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_segment((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new word at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))

        tesseract.TessPageIteratorBoundingBox(it, RIL_SYMBOL,
                                              ctypes.byref(x0),
                                              ctypes.byref(y0),
                                              ctypes.byref(x1),
                                              ctypes.byref(y1))
        tei.add_graphemes([(None, (x0.value, y0.value, x1.value, y1.value))])
        logger.debug('Segmenter found new symbol at {} {} {} {}'.format(
            x0.value, y0.value, x1.value, y1.value))
        if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
            logger.debug('No more elements on page')
            break
    logger.debug('Deleting page iterator and base API')
    tesseract.TessPageIteratorDelete(it)
    tesseract.TessBaseAPIEnd(api)
    tesseract.TessBaseAPIDelete(api)
    logger.info('Writing segmentation to {}'.format(output_path))
    with open(output_path, 'w') as fp:
        tei.write(fp)
    logger.info('Quitting child process')
    os._exit(os.EX_OK)
    return storage.get_storage_path(output_path), doc