def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    Lexicality is measured as the ratio of tokens NOT found in a dictionary
    to the total number of tokens in the document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier selecting the dictionary from
            ``nidaba_cfg['lang_dicts']``.
        divert (bool): Switch selecting output diversion. If enabled the
            measure is returned as a tracking dictionary instead of being
            written to the output document.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(*nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
        cnt = 0
        err_cnt = 0
        # BUG FIX: iterate the segments of the record we just loaded (`tei`)
        # instead of the undefined name `facsimile`.
        for seg_id, segment in tei.segments.iteritems():
            tok = alg.sanitize(''.join(x['grapheme'] for x in
                                       segment['content'].itervalues()))
            # BUG FIX: strip non-word characters from the current token
            # (`tok`), not from the undefined name `key`.
            tok = regex.sub('[^\w]', '', tok)
            cnt += 1
            if not alg.mmap_bin_search(tok, dictionary,
                                       entryparser_fn=alg.key_for_single_word):
                err_cnt += 1
    # guard against ZeroDivisionError for documents without any segments
    ratio = err_cnt / float(cnt) if cnt else 0.0
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    else:
        return {'edit_ratio': ratio, 'ground_truth': '', 'doc': doc}
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    Lexicality is measured as the ratio of tokens NOT found in a dictionary
    to the total number of tokens in the document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier selecting the dictionary from
            ``nidaba_cfg['lang_dicts']``.
        divert (bool): Switch selecting output diversion. If enabled the
            measure is returned as a tracking dictionary instead of being
            written to the output document.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
        cnt = 0
        err_cnt = 0
        # BUG FIX: iterate the segments of the record we just loaded (`tei`)
        # instead of the undefined name `facsimile`.
        for seg_id, segment in tei.segments.iteritems():
            tok = alg.sanitize(''.join(x['grapheme'] for x in
                                       segment['content'].itervalues()))
            # BUG FIX: strip non-word characters from the current token
            # (`tok`), not from the undefined name `key`.
            tok = regex.sub('[^\w]', '', tok)
            cnt += 1
            if not alg.mmap_bin_search(
                    tok, dictionary, entryparser_fn=alg.key_for_single_word):
                err_cnt += 1
    # guard against ZeroDivisionError for documents without any segments
    ratio = err_cnt / float(cnt) if cnt else 0.0
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    else:
        return {
            'edit_ratio': ratio,
            'ground_truth': '',
            'doc': doc
        }
def text_rep_confidence(doc, method=u'text_rep_confidence', divert=True):
    """
    Extracts self reported confidence values from input documents.

    The per-grapheme confidences of the TEI record are averaged into a
    single score for the whole document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        divert (bool): If enabled the score is returned as a tracking
            dictionary instead of being written to the output document.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    with storage.StorageFile(*doc) as fp:
        record = OCRRecord()
        record.load_tei(fp)
        # mean self-reported confidence over all graphemes
        edist = numpy.mean([g['confidence'] for g in
                            record.graphemes.itervalues()])
    if divert:
        return {'edit_ratio': edist, 'ground_truth': '', 'doc': doc}
    storage.write_text(*storage.get_storage_path(output_path),
                       text=unicode(edist))
    return output_path
def text_rep_confidence(doc, method=u'text_rep_confidence', divert=True):
    """
    Extracts self reported confidence values from input documents.

    All grapheme-level confidence values contained in the TEI record are
    reduced to their arithmetic mean.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        divert (bool): If enabled the score is returned as a tracking
            dictionary instead of being written to the output document.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    in_path = storage.get_abs_path(*doc[0])
    out_path = storage.insert_suffix(in_path, method,
                                     os.path.basename(in_path))
    with storage.StorageFile(*doc) as fp:
        rec = OCRRecord()
        rec.load_tei(fp)
        confidences = [g['confidence'] for g in rec.graphemes.itervalues()]
        edist = numpy.mean(confidences)
    if divert:
        return {'edit_ratio': edist, 'ground_truth': '', 'doc': doc}
    storage.write_text(*storage.get_storage_path(out_path),
                       text=unicode(edist))
    return out_path
def text_diff_ratio(doc, method=u'text_diff_ratio', ground_truth=None,
                    xml_in=True, gt_format=u'tei', clean_in=True,
                    clean_gt=True, divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the algorithm of python's difflib SequenceMatcher. The result is a
    value between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of
                                ground truths to choose from. When more than
                                one is given, the file sharing the longest
                                prefix with the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid
                             values are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT
                         DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT
                         DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the
                       output will be added to the tracking arguments and the
                       input document will be returned as the result of the
                       task. Use this to insert a statistical measure into a
                       chain without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document

    Raises:
        NidabaInvalidParameterException: No ground truth was supplied or the
                                         ground truth format is unknown.
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    # BUG FIX: the default ground_truth of None used to fall through to
    # ground_truth[0] and die with an opaque TypeError.
    if ground_truth is None:
        raise NidabaInvalidParameterException('No ground truth given.')
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    # normalization keeps the comparison from being dominated by encoding
    # and whitespace differences
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    logger.debug('Accuracy: {}'.format(sm.ratio()))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(sm.ratio()))
        return output_path
    else:
        return {'diff_ratio': sm.ratio(),
                'ground_truth': ground_truth,
                'doc': doc}
def text_diff_ratio(doc, method=u'text_diff_ratio', ground_truth=None,
                    xml_in=True, gt_format=u'tei', clean_in=True,
                    clean_gt=True, divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the algorithm of python's difflib SequenceMatcher. The result is a
    value between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of
                                ground truths to choose from. When more than
                                one is given, the file sharing the longest
                                prefix with the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid
                             values are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT
                         DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT
                         DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the
                       output will be added to the tracking arguments and the
                       input document will be returned as the result of the
                       task. Use this to insert a statistical measure into a
                       chain without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document

    Raises:
        NidabaInvalidParameterException: No ground truth was supplied or the
                                         ground truth format is unknown.
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    # BUG FIX: the default ground_truth of None used to fall through to
    # ground_truth[0] and die with an opaque TypeError.
    if ground_truth is None:
        raise NidabaInvalidParameterException('No ground truth given.')
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    # normalization keeps the comparison from being dominated by encoding
    # and whitespace differences
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    logger.debug('Accuracy: {}'.format(sm.ratio()))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(sm.ratio()))
        return output_path
    else:
        return {
            'diff_ratio': sm.ratio(),
            'ground_truth': ground_truth,
            'doc': doc
        }
def merge(docs, lang, output):
    """
    Merges multiple hOCR documents into a single one.

    First bboxes from all documents are roughly matched, then all matching
    bboxes are scored using a spell checker. If no spell checker is available
    all matches will be merged without ranking.

    The matching is naive, i.e. we just grab the first input document and
    assume that all other documents have similar segmentation results. Issues
    like high variance in segmentation, especially word boundaries are not
    accounted for.

    Args:
        docs (iterable): A list of storage tuples of input documents
        lang (unicode): A language identifier for the spell checker
        output (tuple): Storage tuple for the result

    Returns:
        tuple: The output storage tuple. Should be the same as ``output``.
    """
    # BUG FIX: hOCR is HTML, not necessarily well-formed XML; parse with the
    # lenient HTML parser (matching the newer merge implementation) so real
    # hOCR output does not blow up the default XML parser.
    parser = etree.HTMLParser()
    tree1 = etree.parse(storage.get_abs_path(docs[0][0], docs[0][1]), parser)
    lines_1, words_1 = get_hocr_lines_for_tree(tree1)
    sort_words_bbox(words_1)
    other_words = []
    for doc in docs[1:]:
        try:
            tree2 = etree.parse(storage.get_abs_path(doc[0], doc[1]), parser)
            lines_2, words_2 = get_hocr_lines_for_tree(tree2)
            other_words = other_words + words_2
        except Exception as e:
            # best-effort merge: log the failure and continue with the
            # documents that could be parsed (was a bare `print e`)
            logger.error('Parsing of {} failed: {}'.format(doc, e))
    sort_words_bbox(other_words)
    positional_lists = []
    positional_list = []
    x = 0
    # Make a list of positional_lists, that is alternatives for a given
    # position, skipping duplicate position-words
    while x < len(other_words):
        try:
            if len(positional_list) == 0:
                positional_list.append(other_words[x])
            else:
                if close_enough(other_words[x - 1].bbox, other_words[x].bbox):
                    # skip if the text is the same, so that we just get unique
                    # texts for this position
                    if not other_words[x - 1].text == other_words[x].text:
                        positional_list.append(other_words[x])
                else:
                    if not x == 0:
                        positional_lists.append(positional_list)
                        positional_list = []
        except IndexError:
            pass
        x = x + 1
    # we now have a list of list of unique words for each position
    # let's select from each the first one that passes spellcheck
    replacement_words = []
    # make a 'replacement_words' list with all of the best, non-zero-scoring
    # suggestions for each place
    for positional_list in positional_lists:
        for word in positional_list:
            word.score = score_word(lang, word.text)
        positional_list.sort(key=attrgetter('score'), reverse=True)
        if positional_list[0].score > 0:
            replacement_words.append(positional_list[0])
    # now replace the originals
    for word in words_1:
        for replacement_word in replacement_words:
            word.score = score_word(lang, word.text)
            if close_enough(word.bbox, replacement_word.bbox) and (
                    word.score < replacement_word.score):
                word.element.text = replacement_word.text
    # diagnostic dump of the alternatives (was raw print statements to
    # stdout; routed through the module logger instead)
    for positional_list in positional_lists:
        logger.debug('##')
        for word in positional_list:
            logger.debug('{} {}'.format(word.bbox, word.text))
    storage.write_text(*output, text=etree.tostring(tree1.getroot(),
                                                    encoding='unicode'))
    return output
def merge(docs, lang, output):
    """
    Merges multiple hOCR documents into a single one.

    First bboxes from all documents are roughly matched, then all matching
    bboxes are scored using a spell checker. If no spell checker is available
    all matches will be merged without ranking.

    The matching is naive, i.e. we just grab the first input document and
    assume that all other documents have similar segmentation results. Issues
    like high variance in segmentation, especially word boundaries are not
    accounted for.

    Args:
        docs (iterable): A list of storage tuples of input documents
        lang (unicode): A language identifier for the spell checker
        output (tuple): Storage tuple for the result

    Returns:
        tuple: The output storage tuple. Should be the same as ``output``.
    """
    # hOCR is HTML, not necessarily well-formed XML, hence the lenient parser
    parser = etree.HTMLParser()
    tree1 = etree.parse(storage.get_abs_path(docs[0][0], docs[0][1]), parser)
    lines_1, words_1 = get_hocr_lines_for_tree(tree1)
    sort_words_bbox(words_1)
    other_words = []
    for doc in docs[1:]:
        try:
            tree2 = etree.parse(storage.get_abs_path(doc[0], doc[1]), parser)
            lines_2, words_2 = get_hocr_lines_for_tree(tree2)
            other_words = other_words + words_2
        except Exception as e:
            # best-effort merge: log the failure and continue with the
            # documents that could be parsed (was `print(e)` to stdout)
            logger.error('Parsing of {} failed: {}'.format(doc, e))
    sort_words_bbox(other_words)
    positional_lists = []
    positional_list = []
    x = 0
    # Make a list of positional_lists, that is alternatives for a given
    # position, skipping duplicate position-words
    while x < len(other_words):
        try:
            if len(positional_list) == 0:
                positional_list.append(other_words[x])
            else:
                if close_enough(other_words[x - 1].bbox, other_words[x].bbox):
                    # skip if the text is the same, so that we just get unique
                    # texts for this position
                    if not other_words[x - 1].text == other_words[x].text:
                        positional_list.append(other_words[x])
                else:
                    if not x == 0:
                        positional_lists.append(positional_list)
                        positional_list = []
        except IndexError:
            pass
        x = x + 1
    # we now have a list of list of unique words for each position
    # let's select from each the first one that passes spellcheck
    replacement_words = []
    # make a 'replacement_words' list with all of the best, non-zero-scoring
    # suggestions for each place
    for positional_list in positional_lists:
        for word in positional_list:
            word.score = score_word(lang, word.text)
        positional_list.sort(key=attrgetter('score'), reverse=True)
        if positional_list[0].score > 0:
            replacement_words.append(positional_list[0])
    # now replace the originals
    for word in words_1:
        for replacement_word in replacement_words:
            word.score = score_word(lang, word.text)
            if close_enough(word.bbox, replacement_word.bbox) and (
                    word.score < replacement_word.score):
                word.element.text = replacement_word.text
    # diagnostic dump of the alternatives; routed through the module logger
    # instead of leftover debug print() calls polluting stdout
    for positional_list in positional_lists:
        logger.debug('##')
        for word in positional_list:
            logger.debug('{} {}'.format(word.bbox, word.text))
    storage.write_text(*output, text=etree.tostring(tree1.getroot(),
                                                    encoding='unicode'))
    return output