def test_ocr_if_less(self):
        """Parse one resource under three OCR storing policies.

        With STORE_ALWAYS two 'images' labels are expected; with
        STORE_IF_NO_OTHER_TEXT and NEVER_STORE no 'images' labels
        are expected at all.
        """
        resource_path = 'parsing/xhtml_ocr_mixed.xhtml'

        # policy 1: STORE_ALWAYS - both images must be labelled
        source = load_resource_document(resource_path, encoding='utf-8')
        settings = XhtmlParsingSettings(
            ocr_sets=OcrTextStoreSettings.STORE_ALWAYS,
            ocr_vector_text_min_length=100)
        parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)
        self.assertGreater(len(parsed.text), 100)
        self.assertEqual(2, len(parsed.labels['images']))

        # policy 2: STORE_IF_NO_OTHER_TEXT - text is still there, images are not
        source = load_resource_document(resource_path, encoding='utf-8')
        settings = XhtmlParsingSettings(
            ocr_sets=OcrTextStoreSettings.STORE_IF_NO_OTHER_TEXT,
            ocr_vector_text_min_length=100)
        parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)
        self.assertGreater(len(parsed.text), 100)
        images_absent = ('images' not in parsed.labels
                         or len(parsed.labels['images']) == 0)
        self.assertTrue(images_absent)

        # policy 3: NEVER_STORE - the already loaded source is parsed again
        settings = XhtmlParsingSettings(
            ocr_sets=OcrTextStoreSettings.NEVER_STORE,
            ocr_vector_text_min_length=100)
        parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)
        images_absent = ('images' not in parsed.labels
                         or len(parsed.labels['images']) == 0)
        self.assertTrue(images_absent)
Пример #2
0
    def test_estimate_dense_text(self):
        """Compare the extra-line-breaks estimate for two parsed variants.

        The 'default' resource must score above 50 and the 'stripper'
        resource below 30 on ``extra_line_breaks_prob``.
        """
        default_text = load_resource_document(
            'parsing/pdf_malformat_parsed_default.txt', 'utf-8')
        estimator = ParsedTextQualityEstimator()
        default_prob = estimator.estimate_text(
            default_text).extra_line_breaks_prob
        self.assertGreater(default_prob, 50)

        # the same estimator instance is reused for the second resource
        stripper_text = load_resource_document(
            'parsing/pdf_malformat_parsed_stripper.txt', 'utf-8')
        stripper_prob = estimator.estimate_text(
            stripper_text).extra_line_breaks_prob
        self.assertLess(stripper_prob, 30)
 def test_estimate_text_abusing_headers(self):
     """Estimate a pre-processed text and expect a line-breaks score below 50."""
     raw_text = load_resource_document('parsing/text_abusing_headers.txt',
                                       'utf-8')
     # the estimator receives the output of pre_process_document, not the raw text
     prepared_text = pre_process_document(raw_text)
     estimation = ParsedTextQualityEstimator().estimate_text(prepared_text)
     self.assertLess(estimation.extra_line_breaks_prob, 50)
Пример #4
0
    def test_complex_mixed_pdf(self):
        """Parse a large mixed PDF (as XHTML) and check the page boundaries.

        After the markers are converted to labels the text must not
        contain '##', more than 100 pages must be labelled, and the last
        50 characters of selected pages must contain known phrases.
        """
        sets = XhtmlParsingSettings()
        sets.ocr_sets = OcrTextStoreSettings.STORE_ALWAYS
        full_text = load_resource_document('parsing/parsed_mixed_pdf.xhtml',
                                           encoding='utf-8')
        parser = TikaXhtmlParser(sets)
        markup = parser.parse_text(full_text)
        markup.convert_markers_to_labels()

        proc_text = markup.text
        # NOTE(review): '##' is presumably the marker prefix that
        # convert_markers_to_labels() removes - confirm against the parser.
        # assertNotIn reports the offending text on failure.
        self.assertNotIn('##', proc_text)
        pages = markup.labels['pages']
        self.assertGreater(len(pages), 100)

        # collect the trailing (up to) 50 characters of every page;
        # page ends are clamped to the text length
        text_length = len(proc_text)
        pages_texts = []
        for _start, end in pages:
            in_end = min(end, text_length)
            in_start = max(in_end - 50, 0)
            pages_texts.append(proc_text[in_start:in_end])

        # assertIn shows both the phrase and the page ending when it fails
        self.assertIn('See “RATINGS” herein.', pages_texts[0])
        self.assertIn('optional redemption date of November 15, 2027.',
                      pages_texts[1])
        self.assertIn('by the IRS.', pages_texts[54])
Пример #5
0
 def test_parse_vector_pdf(self):
     """Parse an XHTML resource with default settings and check its size.

     Expects more than 100 characters of text, more than one 'pages'
     label and more than five 'paragraphs' labels.
     """
     source = load_resource_document('parsing/xhtml_pdf.xhtml',
                                     encoding='utf-8')
     parsed = TikaXhtmlParser().parse_text(source)
     self.assertGreater(len(parsed.text), 100)

     page_count = len(parsed.labels['pages'])
     self.assertGreater(page_count, 1)
     paragraph_count = len(parsed.labels['paragraphs'])
     self.assertGreater(paragraph_count, 5)
 def test_migrate_back(self):
     """Run migrate_json with (76, 65) and check that the output changed.

     NOTE(review): 76 and 65 are presumably the source and the target
     scheme versions - confirm against SchemeMigration.migrate_json.
     """
     original_json = load_resource_document(
         'scheme_migrations/doc_type_v_17.json', encoding='utf-8')
     migration = SchemeMigration()
     migrated_json = migration.migrate_json(original_json, 76, 65)

     # the result must be sizeable and differ in length from the input
     self.assertGreater(len(migrated_json), 1000)
     self.assertNotEqual(len(original_json), len(migrated_json))
 def test_ocr_empty_images(self):
     """Check that the parsed text length equals markers_extra_text_length.

     The resource is parsed with the STORE_IF_MORE_TEXT policy and a
     minimal vector text length of 100.
     """
     source = load_resource_document('parsing/xhtml_ocr_emptyimages.xhtml',
                                     encoding='utf-8')
     settings = XhtmlParsingSettings(
         ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
         ocr_vector_text_min_length=100)
     parsed = TikaXhtmlParser(pars_settings=settings).parse_text(source)
     text_length = len(parsed.text)
     self.assertEqual(text_length, parsed.markers_extra_text_length)
    def test_find_better_titles(self):
        """Check that find_section_titles changes the second section's title.

        Sections and sentence coordinates are read from JSON resources;
        sentences are rebuilt as TextUnit objects carrying only their
        start / end locations.
        """
        full_text = load_resource_document('parsing/heading_document.txt')
        sections = json.loads(
            load_resource_document('parsing/heading_doc_sections.txt'))
        # remember the titles before the sections are processed
        titles_before = [section['title'] for section in sections]

        sentence_coords = json.loads(
            load_resource_document('parsing/heading_doc_sentences.txt'))

        def make_sentence(coords):
            # build a TextUnit from a [start, end, ...] coordinates row
            unit = TextUnit()
            unit.location_start = int(coords[0])
            unit.location_end = int(coords[1])
            return unit

        sentences = [make_sentence(coords) for coords in sentence_coords]

        LoadDocuments.find_section_titles(sections, [], sentences, full_text)
        titles_after = [section['title'] for section in sections]
        self.assertNotEqual(titles_before[1], titles_after[1])
    def test_ocr_little_text_scanned(self):
        """Compare the STORE_IF_MORE_TEXT policy on a long and a short resource.

        The 'long' resource must keep two 'images' labels, the 'short'
        one must keep none, and the first text must be more than 100
        characters longer than the second.
        """
        # long resource: images are expected to be stored
        long_source = load_resource_document(
            'parsing/xhtml_ocr_mixed_long.xhtml', encoding='utf-8')
        long_settings = XhtmlParsingSettings(
            ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
            ocr_vector_text_min_length=100)
        long_parsed = TikaXhtmlParser(
            pars_settings=long_settings).parse_text(long_source)
        self.assertGreater(len(long_parsed.text), 100)
        self.assertEqual(2, len(long_parsed.labels['images']))
        len_with_ocred = len(long_parsed.text)

        # short resource: no images are expected among the labels
        short_source = load_resource_document(
            'parsing/xhtml_ocr_mixed_short.xhtml', encoding='utf-8')
        short_settings = XhtmlParsingSettings(
            ocr_sets=OcrTextStoreSettings.STORE_IF_MORE_TEXT,
            ocr_vector_text_min_length=100)
        short_parsed = TikaXhtmlParser(
            pars_settings=short_settings).parse_text(short_source)
        images_absent = ('images' not in short_parsed.labels
                         or len(short_parsed.labels['images']) == 0)
        self.assertTrue(images_absent)
        len_wo_ocred = len(short_parsed.text)

        length_gain = len_with_ocred - len_wo_ocred
        self.assertGreater(length_gain, 100)
    def test_migrate_forward(self):
        """Run migrate_json with (65, 76) and inspect the migrated records.

        Every 'document.documentfieldcategory' record of the result must
        have a 'document_type' entry in its fields.

        NOTE(review): 65 and 76 are presumably the source and the target
        scheme versions - confirm against SchemeMigration.migrate_json.
        """
        raw_dump = load_resource_document(
            'scheme_migrations/doc_type_v_16.json', encoding='utf-8')
        # only the 'data' member of the dump is re-serialized and migrated
        source_json = json.dumps(json.loads(raw_dump)['data'])
        migration = SchemeMigration()
        migrated_json = migration.migrate_json(source_json, 65, 76)
        self.assertGreater(len(migrated_json), 1000)
        self.assertNotEqual(len(source_json), len(migrated_json))

        categories = []
        for record in json.loads(migrated_json):
            if record['model'] == 'document.documentfieldcategory':
                categories.append(record)
        has_document_type = [
            'document_type' in category['fields'] for category in categories
        ]
        self.assertTrue(all(has_document_type))
Пример #11
0
 def test_estimate_dense_text(self):
     """Check that correct_line_breaks returns a shorter text than its input."""
     source_text = load_resource_document(
         'parsing/pdf_malformat_parsed_default.txt', 'utf-8')
     corrected_text = ParsedTextCorrector().correct_line_breaks(source_text)
     self.assertLess(len(corrected_text), len(source_text))