예제 #1
0
    def test_file_4_use_ml(self):
        """Check section detection on the sample document.

        ``get_section_spans`` is called without a ``use_ml`` argument here
        (presumably the ML-based path is the default -- confirm against the
        function signature).  The test checks the total number of sections,
        that ``skip_empty_headers=True`` leaves no section with a ``None``
        title, and the exact dictionary produced for the section at index 1.
        """
        document = self.get_text('test_get_section_spans_1.txt')

        # Every section reported for the document.
        all_sections = list(get_section_spans(document))
        self.assertEqual(len(all_sections), 207)

        # Restrict the output to sections that carry a title.
        titled_sections = list(
            get_section_spans(document, skip_empty_headers=True))
        untitled = [
            section for section in titled_sections
            if section['title'] is None
        ]
        self.assertEqual(len(untitled), 0)

        actual_section = titled_sections[1]
        expected_section = {
            'start': 2280,
            'end': 2340,
            'title': 'SECTION 2',
            'title_start': 2280,
            'title_end': 2289,
            'level': 1,
            'abs_level': 3,
            'text':
            'SECTION 2.  Letters of Credit........................... 15\n',
        }
        self.assertDictEqual(actual_section, expected_section)
    def test_file_4_use_ml(self):
        """Check section detection on the sample document.

        ``get_section_spans`` is called without a ``use_ml`` argument here
        (presumably the ML-based path is the default -- confirm against the
        function signature).  Each detected section is printed, then the test
        checks the total count, that ``skip_empty_headers=True`` leaves no
        section with a ``None`` title, and that the section at index 1 equals
        the expected ``DocumentSection``.
        """
        document = self.get_text('test_get_section_spans_1.txt')

        # Every section reported for the document; dump them for inspection.
        all_sections = list(get_section_spans(document))
        print(f'{len(all_sections)} sections are found')
        for section in all_sections:
            print(f'Section #{section.start}, "{section.title}"')
        self.assertEqual(len(all_sections), 207)

        # Restrict the output to sections that carry a title.
        titled_sections = list(
            get_section_spans(document, skip_empty_headers=True))
        untitled = [
            section for section in titled_sections if section.title is None
        ]
        self.assertEqual(len(untitled), 0)

        actual_section = titled_sections[1]
        expected_section = DocumentSection(
            start=2280,
            end=2340,
            title='SECTION 2',
            title_start=2280,
            title_end=2289,
            level=1,
            abs_level=3,
            text='SECTION 2.  Letters of Credit........................... 15\n',
        )
        self.assertEqual(actual_section, expected_section)
예제 #3
0
    def test_file_4_use_regex(self):
        """Check section detection on the sample document with ``use_ml=False``.

        Verifies the total number of sections and the exact dictionary
        produced for the section at index 2.
        """
        document = self.get_text('test_get_section_spans_1.txt')

        # Every section reported when the ML path is switched off.
        found_sections = list(get_section_spans(document, use_ml=False))
        self.assertEqual(len(found_sections), 554)

        actual_section = found_sections[2]
        expected_section = {
            'start': 1378,
            'end': 1438,
            'title': 'SECTION 1',
            'title_start': 1378,
            'title_end': 1387,
            'level': 2,
            'abs_level': 3,
            'text':
            'SECTION 1.  Amount and Terms of Credit..................  1\n',
        }
        self.assertDictEqual(actual_section, expected_section)
 def get_error(self):
     """Return the summed squared relative error of detected section counts.

     For each known file the number of sections found with ``use_ml=True``
     is compared with the expected count; the relative difference is
     squared and accumulated over all files.
     """
     # Expected number of sections per fixture file.
     # NOTE(review): the first name has no extension, unlike the second --
     # confirm that this matches the fixture on disk.
     expected_counts = {
         '1582586_2015-08-31': 23,
         'test_get_section_spans_1.txt': 207
     }
     squared_error_total = 0
     for file_name, expected in expected_counts.items():
         document = self.get_text(file_name)
         found = len(list(get_section_spans(document, use_ml=True)))
         relative_error = (found - expected) / expected
         squared_error_total += relative_error * relative_error
     return squared_error_total
    def test_file_4_use_regex(self):
        """Check section detection on the sample document with ``use_ml=False``.

        Verifies the total number of sections and that the section at index 2
        equals the expected ``DocumentSection``.
        """
        document = self.get_text('test_get_section_spans_1.txt')

        # Every section reported when the ML path is switched off.
        found_sections = list(get_section_spans(document, use_ml=False))
        self.assertEqual(len(found_sections), 554)

        actual_section = found_sections[2]
        expected_section = DocumentSection(
            start=1378,
            end=1438,
            title='SECTION 1',
            title_start=1378,
            title_end=1387,
            level=2,
            abs_level=3,
            text='SECTION 1.  Amount and Terms of Credit..................  1\n',
        )
        self.assertEqual(actual_section, expected_section)
    def test_title_start_end(self):
        """Check that title offsets always point at the title text.

        For every section, ``text[title_start:title_end]`` must equal
        ``title``.  The check runs once on the raw output of
        ``get_section_spans`` and once more after ``find_section_titles`` has
        been applied to the same section list (presumably it updates the
        sections in place -- confirm against its implementation).
        """
        document = self.get_text(
            'lexnlp/nlp/en/tests/test_sections/skewed_document.txt')
        sentence_spans = get_sentence_span_list(document)
        section_iter = get_section_spans(
            document,
            use_ml=False,
            return_text=False,
            skip_empty_headers=True,
        )
        sections = list(section_iter)
        self.assertGreater(len(sections), 3)

        def check_title_offsets():
            # The title must be exactly the slice its offsets describe.
            for section in sections:
                sliced = document[section.title_start:section.title_end]
                self.assertEqual(section.title, sliced)

        # Title coordinates before enhancing titles ...
        check_title_offsets()

        # ... and after enhancing.
        find_section_titles(sections, sentence_spans, document)
        check_title_offsets()
예제 #7
0
 def test_bad_text(self):
     """Expect no sections for the input string ``'text'``."""
     found_sections = list(get_section_spans('text'))
     self.assertEqual(found_sections, [])