Пример #1
0
    def test_should_annotate_other_tags_as_body(self):
        """A single untagged token is expected to come out tagged as BODY."""
        untagged_line = [(None, TOKEN_1)]
        doc = _simple_document_with_tagged_token_lines(lines=[untagged_line])

        annotator = SegmentationAnnotator(DEFAULT_CONFIG)
        annotator.annotate(doc)

        expected_lines = [[(SegmentationTagNames.BODY, TOKEN_1)]]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #2
0
    def test_should_clear_minority_among_untagged_tag(self):
        """With preserve_tags enabled, a lone tagged token on a mostly
        untagged line is expected to lose its tag."""
        mostly_untagged_line = [
            (None, TOKEN_1),
            (None, TOKEN_2),
            (OTHER_TAG, TOKEN_3),
        ]
        doc = _simple_document_with_tagged_token_lines(
            lines=[mostly_untagged_line])

        annotator = SegmentationAnnotator(DEFAULT_CONFIG, preserve_tags=True)
        annotator.annotate(doc)

        expected_lines = [[
            (None, TOKEN_1),
            (None, TOKEN_2),
            (None, TOKEN_3),
        ]]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #3
0
    def test_should_annotate_reference_as_reference(self):
        """A back-matter REFERENCE tag is expected to become the segmentation
        REFERENCE tag."""
        reference_line = [(BackTagNames.REFERENCE, TOKEN_1)]
        doc = _simple_document_with_tagged_token_lines(lines=[reference_line])

        annotator = SegmentationAnnotator(DEFAULT_CONFIG)
        annotator.annotate(doc)

        expected_lines = [[(SegmentationTagNames.REFERENCE, TOKEN_1)]]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #4
0
    def test_should_annotate_title_as_front(self):
        """A front-matter TITLE tag is expected to become the segmentation
        FRONT tag."""
        title_line = [(FrontTagNames.TITLE, TOKEN_1)]
        doc = _simple_document_with_tagged_token_lines(lines=[title_line])

        annotator = SegmentationAnnotator(DEFAULT_CONFIG)
        annotator.annotate(doc)

        expected_lines = [[(SegmentationTagNames.FRONT, TOKEN_1)]]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #5
0
    def test_should_annotate_no_tag_as_body_if_preserve_is_disabled(self):
        """With preserve_tags explicitly disabled, an untagged token is
        expected to be tagged as BODY."""
        untagged_line = [(None, TOKEN_1)]
        doc = _simple_document_with_tagged_token_lines(lines=[untagged_line])

        annotator = SegmentationAnnotator(DEFAULT_CONFIG, preserve_tags=False)
        annotator.annotate(doc)

        expected_lines = [[(SegmentationTagNames.BODY, TOKEN_1)]]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #6
0
    def test_should_not_annotate_untagged_lines_after_last_header(self):
        """With preserve_tags enabled, an untagged line following two title
        lines is expected to stay untagged while the titles become FRONT."""
        input_lines = [
            [(FrontTagNames.TITLE, TOKEN_1)],
            [(FrontTagNames.TITLE, TOKEN_2)],
            [(None, TOKEN_3)],
        ]
        doc = _simple_document_with_tagged_token_lines(lines=input_lines)

        annotator = SegmentationAnnotator(DEFAULT_CONFIG, preserve_tags=True)
        annotator.annotate(doc)

        # One entry per document line, in the same order as the input.
        expected_lines = [
            [(SegmentationTagNames.FRONT, TOKEN_1)],
            [(SegmentationTagNames.FRONT, TOKEN_2)],
            [(None, TOKEN_3)],
        ]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #7
0
    def test_should_ignore_front_if_start_line_index_beyond_threshold(self):
        """With front_max_start_line_index=1, a title tag that first shows up
        on the third line is expected to be cleared (all tags end up None)."""
        input_lines = [
            [(None, TOKEN_1)],
            [(None, TOKEN_2)],
            [(FrontTagNames.TITLE, TOKEN_3)],
        ]
        doc = _simple_document_with_tagged_token_lines(lines=input_lines)

        # Same mapping as the default config, but with a tight front threshold.
        restricted_config = SegmentationConfig(
            DEFAULT_CONFIG.segmentation_mapping,
            front_max_start_line_index=1)
        annotator = SegmentationAnnotator(restricted_config, preserve_tags=True)
        annotator.annotate(doc)

        expected_lines = [
            [(None, TOKEN_1)],
            [(None, TOKEN_2)],
            [(None, TOKEN_3)],
        ]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #8
0
    def test_should_annotate_line_with_using_common_tag(self):
        """A line with two title tokens and one differently tagged token is
        expected to be tagged FRONT across all three tokens."""
        mixed_line = [
            (FrontTagNames.TITLE, TOKEN_1),
            (FrontTagNames.TITLE, TOKEN_2),
            (OTHER_TAG, TOKEN_3),
        ]
        doc = _simple_document_with_tagged_token_lines(lines=[mixed_line])

        annotator = SegmentationAnnotator(DEFAULT_CONFIG)
        annotator.annotate(doc)

        expected_lines = [[
            (SegmentationTagNames.FRONT, token)
            for token in (TOKEN_1, TOKEN_2, TOKEN_3)
        ]]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #9
0
    def test_should_merge_and_fill_remaining_untagged_with_annex(self):
        """An appendix line (B-/I- prefixed tags) followed by an untagged line
        is expected to yield ANNEX on every token of both lines."""
        begin_appendix = add_tag_prefix(
            BackTagNames.APPENDIX, prefix=B_TAG_PREFIX)
        inside_appendix = add_tag_prefix(
            BackTagNames.APPENDIX, prefix=I_TAG_PREFIX)
        input_lines = [
            [(begin_appendix, TOKEN_1), (inside_appendix, TOKEN_2)],
            [(None, TOKEN_3), (None, TOKEN_4)],
        ]
        doc = _simple_document_with_tagged_token_lines(lines=input_lines)

        merging_config = DEFAULT_CONFIG._replace(no_merge_references=False)
        SegmentationAnnotator(merging_config).annotate(doc)

        annex = SegmentationTagNames.ANNEX
        expected_lines = [
            [(annex, TOKEN_1), (annex, TOKEN_2)],
            [(annex, TOKEN_3), (annex, TOKEN_4)],
        ]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #10
0
 def test_should_not_annotate_out_of_order_page_number(self):
     """With preserve_tags enabled, a leading untagged '2' is expected to
     come out as FRONT, while the explicitly tagged page numbers '2' and
     '3' further down are expected to come out as PAGE."""
     input_lines = [
         [(None, '2')],
         [(FrontTagNames.TITLE, TOKEN_1)],
         [(PageTagNames.PAGE, '2')],
         [(BodyTagNames.SECTION_TITLE, TOKEN_2)],
         [(PageTagNames.PAGE, '3')],
     ]
     doc = _simple_document_with_tagged_token_lines(lines=input_lines)

     annotator = SegmentationAnnotator(DEFAULT_CONFIG, preserve_tags=True)
     annotator.annotate(doc)

     expected_lines = [
         [(SegmentationTagNames.FRONT, '2')],
         [(SegmentationTagNames.FRONT, TOKEN_1)],
         [(SegmentationTagNames.PAGE, '2')],
         [(SegmentationTagNames.BODY, TOKEN_2)],
         [(SegmentationTagNames.PAGE, '3')],
     ]
     assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #11
0
    def test_should_not_annotate_preserved_page_numbers_as_headnote(self):
        """Untagged '1' tokens carrying a preserved PAGE tag are expected to
        come out as PAGE when preserve_tags is enabled."""
        input_lines = [
            [(None, '1')],
            [(FrontTagNames.TITLE, TOKEN_1)],
            [(None, '1')],
            [(BodyTagNames.SECTION_TITLE, TOKEN_2)],
        ]
        doc = _simple_document_with_tagged_token_lines(lines=input_lines)

        # Each line holds one token, so indices 0 and 2 are the two '1' tokens.
        tokens = list(doc.iter_all_tokens())
        for page_token_index in (0, 2):
            doc._set_preserved_tag(  # pylint: disable=protected-access
                tokens[page_token_index], PageTagNames.PAGE)

        annotator = SegmentationAnnotator(DEFAULT_CONFIG, preserve_tags=True)
        annotator.annotate(doc)

        expected_lines = [
            [(SegmentationTagNames.PAGE, '1')],
            [(SegmentationTagNames.FRONT, TOKEN_1)],
            [(SegmentationTagNames.PAGE, '1')],
            [(SegmentationTagNames.BODY, TOKEN_2)],
        ]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #12
0
    def test_should_annotate_page_header(self):
        """An untagged line of text that appears twice between front-matter
        lines is expected to be tagged HEADNOTE on every token."""
        header_words = LONG_PAGE_HEADER_TEXT_1.split(' ')
        input_lines = [
            [(None, word) for word in header_words],
            [(FrontTagNames.TITLE, TOKEN_1)],
            [(None, word) for word in header_words],
            [(FrontTagNames.ABSTRACT, TOKEN_2)],
        ]
        doc = _simple_document_with_tagged_token_lines(lines=input_lines)

        annotator = SegmentationAnnotator(DEFAULT_CONFIG)
        annotator.annotate(doc)

        headnote = SegmentationTagNames.HEADNOTE
        expected_lines = [
            [(headnote, word) for word in header_words],
            [(SegmentationTagNames.FRONT, TOKEN_1)],
            [(headnote, word) for word in header_words],
            [(SegmentationTagNames.FRONT, TOKEN_2)],
        ]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #13
0
    def test_should_merge_separate_reference_if_enabled(self):
        """Two reference lines, each with B-/I- prefixed tags, are expected to
        come out with the plain REFERENCE tag on every token when the config
        has no_merge_references=False."""
        begin_reference = add_tag_prefix(
            BackTagNames.REFERENCE, prefix=B_TAG_PREFIX)
        inside_reference = add_tag_prefix(
            BackTagNames.REFERENCE, prefix=I_TAG_PREFIX)
        input_lines = [
            [(begin_reference, TOKEN_1), (inside_reference, TOKEN_2)],
            [(begin_reference, TOKEN_3), (inside_reference, TOKEN_4)],
        ]
        doc = _simple_document_with_tagged_token_lines(lines=input_lines)

        merging_config = DEFAULT_CONFIG._replace(no_merge_references=False)
        SegmentationAnnotator(merging_config).annotate(doc)

        reference = SegmentationTagNames.REFERENCE
        expected_lines = [
            [(reference, TOKEN_1), (reference, TOKEN_2)],
            [(reference, TOKEN_3), (reference, TOKEN_4)],
        ]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #14
0
    def test_should_annotate_not_fail_on_empty_line(self):
        """A document whose first line has no tokens is expected to annotate
        without error and keep both lines unchanged (preserve_tags enabled)."""
        input_lines = [
            [],
            [(None, TOKEN_1)],
        ]
        doc = _simple_document_with_tagged_token_lines(lines=input_lines)

        annotator = SegmentationAnnotator(DEFAULT_CONFIG, preserve_tags=True)
        annotator.annotate(doc)

        expected_lines = [
            [],
            [(None, TOKEN_1)],
        ]
        assert _get_document_tagged_token_lines(doc) == expected_lines
Пример #15
0
 def test_should_not_fail_on_empty_document(self):
     """Annotating a document built from a bare `_tei()` root must not raise.

     There is deliberately no assertion: the test passes if no exception
     escapes from `annotate`.
     """
     empty_doc = GrobidTrainingTeiStructuredDocument(_tei())
     annotator = SegmentationAnnotator(DEFAULT_CONFIG)
     annotator.annotate(empty_doc)