Example #1
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier of the dictionary in
                            nidaba_cfg['lang_dicts'] used for the lookup.
        divert (bool): Switch selecting output diversion. If enabled the
                       result is returned as part of the task's tracking
                       arguments instead of being written to the output
                       document.

    Returns:
        (unicode, unicode): Storage tuple of the output document, or a
        tracking dictionary if divert is enabled.
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(*nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    cnt = 0
    err_cnt = 0
    for seg_id, segment in tei.segments.iteritems():
        # concatenate the segment's graphemes and strip non-word characters
        tok = alg.sanitize(''.join(x['grapheme'] for x in segment['content'].itervalues()))
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        if not alg.mmap_bin_search(tok, dictionary, entryparser_fn=alg.key_for_single_word):
            err_cnt += 1
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(err_cnt / float(cnt)))
        return output_path
    else:
        return {'edit_ratio': err_cnt / float(cnt), 'ground_truth': '', 'doc': doc}
Example #2
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier of the dictionary in
                            nidaba_cfg['lang_dicts'] used for the lookup.
        divert (bool): Switch selecting output diversion. If enabled the
                       result is returned as part of the task's tracking
                       arguments instead of being written to the output
                       document.

    Returns:
        (unicode, unicode): Storage tuple of the output document, or a
        tracking dictionary if divert is enabled.
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    cnt = 0
    err_cnt = 0
    for seg_id, segment in tei.segments.iteritems():
        # concatenate the segment's graphemes and strip non-word characters
        tok = alg.sanitize(''.join(x['grapheme']
                                   for x in segment['content'].itervalues()))
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        if not alg.mmap_bin_search(
                tok, dictionary, entryparser_fn=alg.key_for_single_word):
            err_cnt += 1
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(err_cnt / float(cnt)))
        return output_path
    else:
        return {
            'edit_ratio': err_cnt / float(cnt),
            'ground_truth': '',
            'doc': doc
        }
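
Both listings above compute the same quantity: the share of tokens that are missing from a language dictionary. A minimal, self-contained sketch of that measurement follows, with bisect over an in-memory sorted word list standing in for nidaba's memory-mapped binary search (alg.mmap_bin_search); the token and dictionary lists are made up for illustration.

import bisect
import re

def lexicality_ratio(tokens, sorted_words):
    """Return the fraction of tokens missing from a sorted word list."""
    if not tokens:
        return 0.0
    errors = 0
    for tok in tokens:
        # strip non-word characters, mirroring the regex.sub step above
        tok = re.sub(r'[^\w]', '', tok)
        idx = bisect.bisect_left(sorted_words, tok)
        if idx == len(sorted_words) or sorted_words[idx] != tok:
            errors += 1
    return errors / float(len(tokens))

print(lexicality_ratio(['the', 'cat', 'xqzv'], ['a', 'cat', 'the']))  # ~0.333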
Example #3
def text_rep_confidence(doc, method=u'text_rep_confidence', divert=True):
    """
    Extracts self reported confidence values from input documents.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        divert (bool): Switch selecting output diversion. If enabled the
                       result is returned as part of the task's tracking
                       arguments instead of being written to the output
                       document.

    Returns:
        (unicode, unicode): Storage tuple of the output document, or a
        tracking dictionary if divert is enabled.
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    # mean of the self-reported per-grapheme confidences
    edist = numpy.mean([x['confidence'] for x in tei.graphemes.itervalues()])
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(edist))
        return output_path
    else:
        return {'edit_ratio': edist, 'ground_truth': '', 'doc': doc}
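
The aggregation itself is a plain numpy mean over per-grapheme confidences. A standalone sketch; the grapheme records here are made up and only mirror the {'grapheme': ..., 'confidence': ...} shape used above.

import numpy

graphemes = {0: {'grapheme': u'f', 'confidence': 98.0},
             1: {'grapheme': u'o', 'confidence': 92.5},
             2: {'grapheme': u'o', 'confidence': 87.0}}
# mean of the per-grapheme confidence values
print(numpy.mean([g['confidence'] for g in graphemes.values()]))  # 92.5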
Example #4
def text_diff_ratio(doc, method=u'text_diff_ratio', ground_truth=None,
                    xml_in=True, gt_format=u'tei', clean_in=True, clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input document and a given ground truth
    using Python's difflib.SequenceMatcher. The result is a value between 0.0
    (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (tuple): Ground truth location tuple or a list of
                              ground truths to choose from. When more than
                              one is given, the file sharing the longest
                              prefix with the input document is chosen.
        xml_in (bool): Switch to treat input as a TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    # ground_truth may be a single storage tuple or a list of candidates; in
    # the latter case pick the one sharing the longest prefix with the input
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    logger.debug('Accuracy: {}'.format(sm.ratio()))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(sm.ratio()))
        return output_path
    else:
        return {'diff_ratio': sm.ratio(), 'ground_truth': ground_truth, 'doc': doc}
Example #5
def text_diff_ratio(doc,
                    method=u'text_diff_ratio',
                    ground_truth=None,
                    xml_in=True,
                    gt_format=u'tei',
                    clean_in=True,
                    clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input document and a given ground truth
    using Python's difflib.SequenceMatcher. The result is a value between 0.0
    (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (tuple): Ground truth location tuple or a list of
                              ground truths to choose from. When more than
                              one is given, the file sharing the longest
                              prefix with the input document is chosen.
        xml_in (bool): Switch to treat input as a TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    # ground_truth may be a single storage tuple or a list of candidates; in
    # the latter case pick the one sharing the longest prefix with the input
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format +
                                                  ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    logger.debug('Accuracy: {}'.format(sm.ratio()))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(sm.ratio()))
        return output_path
    else:
        return {
            'diff_ratio': sm.ratio(),
            'ground_truth': ground_truth,
            'doc': doc
        }
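
For reference, difflib.SequenceMatcher in isolation: ratio() is 2.0*M/T, where M is the number of matched characters and T is the combined length of both sequences. The strings below are illustrative.

import difflib

# 'itten' (5 characters) is the matching material, so
# ratio() == 2.0 * 5 / (6 + 6) == 0.8333...
sm = difflib.SequenceMatcher()
sm.set_seqs(u'kitten', u'sitten')
print(sm.ratio())  # 0.8333...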
Example #6
def merge(docs, lang, output):
    """
    Merges multiple hOCR documents into a single one.

    First bboxes from all documents are roughly matched, then all matching
    bboxes are scored using a spell checker. If no spell checker is available
    all matches will be merged without ranking.

    The matching is naive, i.e. we just grab the first input document and
    assume that all other documents have similar segmentation results. Issues
    like high variance in segmentation, especially at word boundaries, are
    not accounted for.

    Args:
        docs (iterable): A list of storage tuples of input documents
        lang (unicode): A language identifier for the spell checker
        output (tuple): Storage tuple for the result

    Returns:
        tuple: The output storage tuple. Should be the same as ``output``.
    """
    parser = etree.HTMLParser()
    tree1 = etree.parse(storage.get_abs_path(docs[0][0], docs[0][1]), parser)
    lines_1, words_1 = get_hocr_lines_for_tree(tree1)
    sort_words_bbox(words_1)
    other_words = []
    for doc in docs[1:]:
        try:
            tree2 = etree.parse(storage.get_abs_path(doc[0], doc[1]), parser)
            lines_2, words_2 = get_hocr_lines_for_tree(tree2)
            other_words = other_words + words_2
        except Exception as e:
            # skip input documents that cannot be parsed
            print e

    sort_words_bbox(other_words)
    positional_lists = []
    positional_list = []
    x = 0

    # Build positional_lists: one list of alternative words per bbox
    # position, skipping duplicates of the same text
    while x < len(other_words):
        if len(positional_list) == 0:
            positional_list.append(other_words[x])
        elif close_enough(other_words[x - 1].bbox, other_words[x].bbox):
            # only keep unique texts for this position
            if not other_words[x - 1].text == other_words[x].text:
                positional_list.append(other_words[x])
        else:
            # position changed: flush the finished list and start a new
            # one with the current word
            positional_lists.append(positional_list)
            positional_list = [other_words[x]]
        x = x + 1
    if positional_list:
        positional_lists.append(positional_list)

    # we now have a list of lists of unique words for each position;
    # select from each the best-scoring alternative
    replacement_words = []

    # make a 'replacement_words' list with all of the best, non-zero-scoring
    # suggestions for each place
    for positional_list in positional_lists:
        for word in positional_list:
            word.score = score_word(lang, word.text)
        positional_list.sort(key=attrgetter('score'), reverse=True)
        if positional_list[0].score > 0:
            replacement_words.append(positional_list[0])

    # now replace the originals
    for word in words_1:
        word.score = score_word(lang, word.text)
        for replacement_word in replacement_words:
            if close_enough(word.bbox, replacement_word.bbox) and (
                    word.score < replacement_word.score):
                word.element.text = replacement_word.text

    # debug output: dump all positional alternatives once
    for positional_list in positional_lists:
        print "##"
        for word in positional_list:
            print word.bbox, word.text

    storage.write_text(*output, text=etree.tostring(tree1.getroot(),
                                                    encoding='unicode'))
    return output
Example #7
def merge(docs, lang, output):
    """
    Merges multiple hOCR documents into a single one.

    First bboxes from all documents are roughly matched, then all matching
    bboxes are scored using a spell checker. If no spell checker is available
    all matches will be merged without ranking.

    The matching is naive, i.e. we just grab the first input document and
    assume that all other documents have similar segmentation results. Issues
    like high variance in segmentation, especially at word boundaries, are
    not accounted for.

    Args:
        docs (iterable): A list of storage tuples of input documents
        lang (unicode): A language identifier for the spell checker
        output (tuple): Storage tuple for the result

    Returns:
        tuple: The output storage tuple. Should be the same as ``output``.
    """
    parser = etree.HTMLParser()
    tree1 = etree.parse(storage.get_abs_path(docs[0][0], docs[0][1]), parser)
    lines_1, words_1 = get_hocr_lines_for_tree(tree1)
    sort_words_bbox(words_1)
    other_words = []
    for doc in docs[1:]:
        try:
            tree2 = etree.parse(storage.get_abs_path(doc[0], doc[1]), parser)
            lines_2, words_2 = get_hocr_lines_for_tree(tree2)
            other_words = other_words + words_2
        except Exception as e:
            # skip input documents that cannot be parsed
            print(e)

    sort_words_bbox(other_words)
    positional_lists = []
    positional_list = []
    x = 0

    # Build positional_lists: one list of alternative words per bbox
    # position, skipping duplicates of the same text
    while x < len(other_words):
        if len(positional_list) == 0:
            positional_list.append(other_words[x])
        elif close_enough(other_words[x - 1].bbox, other_words[x].bbox):
            # only keep unique texts for this position
            if not other_words[x - 1].text == other_words[x].text:
                positional_list.append(other_words[x])
        else:
            # position changed: flush the finished list and start a new
            # one with the current word
            positional_lists.append(positional_list)
            positional_list = [other_words[x]]
        x = x + 1
    if positional_list:
        positional_lists.append(positional_list)

    # we now have a list of lists of unique words for each position;
    # select from each the best-scoring alternative
    replacement_words = []

    # make a 'replacement_words' list with all of the best, non-zero-scoring
    # suggestions for each place
    for positional_list in positional_lists:
        for word in positional_list:
            word.score = score_word(lang, word.text)
        positional_list.sort(key=attrgetter('score'), reverse=True)
        if positional_list[0].score > 0:
            replacement_words.append(positional_list[0])

    # now replace the originals
    for word in words_1:
        word.score = score_word(lang, word.text)
        for replacement_word in replacement_words:
            if close_enough(word.bbox, replacement_word.bbox) and (
                    word.score < replacement_word.score):
                word.element.text = replacement_word.text

    # debug output: dump all positional alternatives once
    for positional_list in positional_lists:
        print("##")
        for word in positional_list:
            print(word.bbox, word.text)

    storage.write_text(*output,
                       text=etree.tostring(tree1.getroot(),
                                           encoding='unicode'))
    return output
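
The helpers close_enough and score_word come from the surrounding module and are not shown in these listings. A plausible stand-in for the bbox predicate follows; the (x0, y0, x1, y1) box layout and the pixel tolerance are assumptions, and the real predicate may use a different rule.

# Hypothetical stand-in for close_enough: two (x0, y0, x1, y1) boxes match
# when every corner coordinate differs by at most `tolerance` pixels.
def close_enough(bbox1, bbox2, tolerance=5):
    return all(abs(a - b) <= tolerance for a, b in zip(bbox1, bbox2))

print(close_enough((10, 10, 50, 30), (12, 9, 51, 30)))  # True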