def test_ocr_if_less(self):
    """Check the 'images' labels produced under three OCR storage settings.

    The same mixed XHTML resource is parsed with STORE_ALWAYS,
    STORE_IF_NO_OTHER_TEXT and NEVER_STORE, each time with
    ``ocr_vector_text_min_length=100``.
    """
    resource_path = 'parsing/xhtml_ocr_mixed.xhtml'

    # STORE_ALWAYS: exactly two image labels are expected.
    source = load_resource_document(resource_path, encoding='utf-8')
    settings = XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_ALWAYS,
        ocr_vector_text_min_length=100)
    parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)
    self.assertGreater(len(parsed.text), 100)
    self.assertEqual(2, len(parsed.labels['images']))

    # STORE_IF_NO_OTHER_TEXT: text is still present, but no image labels.
    source = load_resource_document(resource_path, encoding='utf-8')
    settings = XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT,
        ocr_vector_text_min_length=100)
    parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)
    self.assertGreater(len(parsed.text), 100)
    self.assertTrue(
        'images' not in parsed.labels or len(parsed.labels['images']) == 0)

    # NEVER_STORE: same source text, again no image labels.
    settings = XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.NEVER_STORE,
        ocr_vector_text_min_length=100)
    parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)
    self.assertTrue(
        'images' not in parsed.labels or len(parsed.labels['images']) == 0)
def test_estimate_dense_text(self):
    """Compare the extra-line-break estimate for two parsed text resources.

    The "default" parsed file must score above 50 on
    ``extra_line_breaks_prob`` and the "stripper" parsed file below 30.
    """
    # NOTE(review): another method named test_estimate_dense_text appears
    # later in this source; if both are in the same class, the later
    # definition replaces this one and this test never runs - confirm.
    default_text = load_resource_document(
        'parsing/pdf_malformat_parsed_default.txt', 'utf-8')
    quality_estimator = ParsedTextQualityEstimator()
    default_estimate = quality_estimator.estimate_text(default_text)
    self.assertGreater(default_estimate.extra_line_breaks_prob, 50)

    stripper_text = load_resource_document(
        'parsing/pdf_malformat_parsed_stripper.txt', 'utf-8')
    stripper_estimate = quality_estimator.estimate_text(stripper_text)
    self.assertLess(stripper_estimate.extra_line_breaks_prob, 30)
def test_estimate_text_abusing_headers(self):
    """Check the extra-line-break estimate for the 'abusing headers' text.

    The resource is passed through ``pre_process_document`` first; the
    resulting ``extra_line_breaks_prob`` must stay below 50.
    """
    raw_text = load_resource_document(
        'parsing/text_abusing_headers.txt', 'utf-8')
    prepared_text = pre_process_document(raw_text)
    estimate = ParsedTextQualityEstimator().estimate_text(prepared_text)
    self.assertLess(estimate.extra_line_breaks_prob, 50)
def test_complex_mixed_pdf(self):
    """Parse a mixed PDF XHTML dump and verify the 'pages' label boundaries.

    The document is parsed with OCR text always stored, markers are
    converted to labels, and then:

    * the resulting text must not contain ``'##'``;
    * more than 100 page labels must be present;
    * the last (up to) 50 characters of pages 0, 1 and 54 must contain
      known phrases from the document.
    """
    sets = XhtmlParsingSettings()
    sets.ocr_sets = OcrTextStoreSettings.STORE_ALWAYS
    full_text = load_resource_document('parsing/parsed_mixed_pdf.xhtml',
                                       encoding='utf-8')
    parser = TikaXhtmlParser(sets)
    markup = parser.parse_text(full_text)
    markup.convert_markers_to_labels()

    text = markup.text
    # assertNotIn / assertIn report the offending operands on failure,
    # unlike assertEqual(-1, find(...)) and assertTrue(x in y).
    self.assertNotIn('##', text)

    pages = markup.labels['pages']
    self.assertGreater(len(pages), 100)

    # Tail of every page: at most 50 characters ending at the page's end
    # offset, with both bounds clamped to the text.
    page_endings = []
    for _start, end in pages:
        stop = min(end, len(text))
        page_endings.append(text[max(stop - 50, 0):stop])

    self.assertIn('See “RATINGS” herein.', page_endings[0])
    self.assertIn('optional redemption date of November 15, 2027.',
                  page_endings[1])
    self.assertIn('by the IRS.', page_endings[54])
def test_parse_vector_pdf(self):
    """Parse 'xhtml_pdf.xhtml' with default parser settings.

    Expects more than 100 characters of text, more than one 'pages'
    label and more than five 'paragraphs' labels.
    """
    source = load_resource_document('parsing/xhtml_pdf.xhtml',
                                    encoding='utf-8')
    parsed = TikaXhtmlParser().parse_text(source)

    self.assertGreater(len(parsed.text), 100)
    self.assertGreater(len(parsed.labels['pages']), 1)
    self.assertGreater(len(parsed.labels['paragraphs']), 5)
def test_migrate_back(self):
    """Run ``SchemeMigration.migrate_json`` with arguments (76, 65).

    The result must be longer than 1000 characters and differ in length
    from the input JSON.
    """
    # presumably 76 and 65 are the source and target scheme versions -
    # confirm against SchemeMigration.migrate_json.
    original_json = load_resource_document(
        'scheme_migrations/doc_type_v_17.json', encoding='utf-8')
    migration = SchemeMigration()
    migrated_json = migration.migrate_json(original_json, 76, 65)

    self.assertGreater(len(migrated_json), 1000)
    self.assertNotEqual(len(original_json), len(migrated_json))
def test_ocr_empty_images(self):
    """Parse 'xhtml_ocr_emptyimages.xhtml' with STORE_IF_MORE_TEXT.

    The length of the parsed text must equal the result's
    ``markers_extra_text_length``.
    """
    source = load_resource_document('parsing/xhtml_ocr_emptyimages.xhtml',
                                    encoding='utf-8')
    settings = XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
        ocr_vector_text_min_length=100)
    parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)

    self.assertEqual(len(parsed.text), parsed.markers_extra_text_length)
def test_find_better_titles(self):
    """Check that ``LoadDocuments.find_section_titles`` changes a section title.

    Sections and sentence coordinates are read from JSON resources; the
    title of the second section must differ after the call.
    """
    full_text = load_resource_document('parsing/heading_document.txt')

    sections = json.loads(
        load_resource_document('parsing/heading_doc_sections.txt'))
    # Snapshot the titles first: the same `sections` list is handed to
    # find_section_titles and read again afterwards.
    titles_before = [section['title'] for section in sections]

    sentence_coords = json.loads(
        load_resource_document('parsing/heading_doc_sentences.txt'))
    sentences = []
    for coords in sentence_coords:
        unit = TextUnit()
        unit.location_start = int(coords[0])
        unit.location_end = int(coords[1])
        sentences.append(unit)

    LoadDocuments.find_section_titles(sections, [], sentences, full_text)

    titles_after = [section['title'] for section in sections]
    self.assertNotEqual(titles_before[1], titles_after[1])
def test_ocr_little_text_scanned(self):
    """Compare STORE_IF_MORE_TEXT results for the 'long' and 'short' resources.

    The 'long' resource must yield two image labels; the 'short' one
    must yield none, and its text must be more than 100 characters
    shorter than the 'long' result.
    """
    # 'long' resource: two image labels are expected.
    source = load_resource_document('parsing/xhtml_ocr_mixed_long.xhtml',
                                    encoding='utf-8')
    settings = XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
        ocr_vector_text_min_length=100)
    parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)
    self.assertGreater(len(parsed.text), 100)
    self.assertEqual(2, len(parsed.labels['images']))
    length_with_ocr = len(parsed.text)

    # 'short' resource: no image labels are expected.
    source = load_resource_document('parsing/xhtml_ocr_mixed_short.xhtml',
                                    encoding='utf-8')
    settings = XhtmlParsingSettings(
        ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
        ocr_vector_text_min_length=100)
    parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)
    self.assertTrue(
        'images' not in parsed.labels or len(parsed.labels['images']) == 0)
    length_without_ocr = len(parsed.text)

    self.assertGreater(length_with_ocr - length_without_ocr, 100)
def test_migrate_forward(self):
    """Run ``SchemeMigration.migrate_json`` with arguments (65, 76).

    Only the ``'data'`` member of the resource JSON is migrated. The
    result must be longer than 1000 characters, differ in length from
    the input, and every ``document.documentfieldcategory`` record in it
    must have a ``'document_type'`` entry in its ``'fields'``.
    """
    # presumably 65 and 76 are the source and target scheme versions -
    # confirm against SchemeMigration.migrate_json.
    jsn = load_resource_document('scheme_migrations/doc_type_v_16.json',
                                 encoding='utf-8')
    # Re-serialize just the 'data' payload of the resource.
    jsn = json.dumps(json.loads(jsn)['data'])

    sm = SchemeMigration()
    migrated = sm.migrate_json(jsn, 65, 76)
    self.assertGreater(len(migrated), 1000)
    self.assertNotEqual(len(jsn), len(migrated))

    data = json.loads(migrated)
    categories = [
        d for d in data if d['model'] == 'document.documentfieldcategory'
    ]
    # Assert per record so a failure shows the offending fields instead
    # of a bare "False is not true".
    for category in categories:
        self.assertIn('document_type', category['fields'])
def test_estimate_dense_text(self):
    """Check that ``correct_line_breaks`` shortens the default-parsed text.

    The output of ``ParsedTextCorrector.correct_line_breaks`` must be
    shorter than its input.
    """
    # NOTE(review): the name mentions "estimate" but the body exercises
    # ParsedTextCorrector; an earlier method in this source has the same
    # name, and if both are in the same class this one replaces it -
    # confirm and consider renaming.
    original_text = load_resource_document(
        'parsing/pdf_malformat_parsed_default.txt', 'utf-8')
    corrected_text = ParsedTextCorrector().correct_line_breaks(original_text)
    self.assertLess(len(corrected_text), len(original_text))