def test_should_preserve_empty_pages_if_requested(self):
     """Keep a page without tokens when ``preserve_empty_pages=True``."""
     page_with_token = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     # The second page only holds a line that has no tokens at all.
     page_without_tokens = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     document = LayoutDocument(pages=[page_with_token, page_without_tokens])
     cleaned_document = remove_empty_blocks(
         document, preserve_empty_pages=True)
     # Both pages are expected in the result, including the token-less one.
     assert len(cleaned_document.pages) == 2
 def test_should_remove_empty_line_block_and_page(self):
     """Drop the page whose only line has no tokens and keep the other one."""
     page_with_token = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     page_without_tokens = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     document = LayoutDocument(pages=[page_with_token, page_without_tokens])
     cleaned_document = remove_empty_blocks(document)
     assert len(cleaned_document.pages) == 1
     # The surviving page must still carry the original token.
     remaining_line = cleaned_document.pages[0].blocks[0].lines[0]
     remaining_texts = [token.text for token in remaining_line.tokens]
     assert remaining_texts == ['token1']
 def test_should_retokenize_document_with_placeholders(self):
     """Split a two-word token; expect widths proportional to word length."""
     text = 'token1 token2'
     text_length = len(text)
     source_token = LayoutToken(
         text,
         whitespace='\n',
         coordinates=LayoutPageCoordinates(
             x=10, y=10, width=100, height=50))
     document = LayoutDocument(pages=[
         LayoutPage(
             blocks=[LayoutBlock.for_tokens([source_token])],
             graphics=[])
     ])
     retokenized_document = retokenize_layout_document(document)
     tokens = retokenized_document.pages[0].blocks[0].lines[0].tokens
     assert [token.text for token in tokens] == ['token1', 'token2']
     assert [token.whitespace for token in tokens] == [' ', '\n']
     # Expected coordinates are the original width scaled by each
     # word's share of the characters in the original text.
     first_coordinates = tokens[0].coordinates
     assert first_coordinates.x == 10.0
     assert first_coordinates.width == 100 * len('token1') / text_length
     second_coordinates = tokens[1].coordinates
     assert second_coordinates.x == (
         10.0 + 100 * len('token1 ') / text_length)
     assert second_coordinates.width == 100 * len('token2') / text_length
Exemplo n.º 4
0
 def test_should_provide_line_text(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check line text, first token text and second token text per line."""
     lines = [
         LayoutLine.for_text('first1 second1 this is a line'),
         LayoutLine.for_text('first2 second2 this is a line')
     ]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=lines)])])
     # Collect the three text features for every line yielded by the provider.
     feature_values = [
         {
             'line_text': features.line_text,
             'token_text': features.token_text,
             'second_token_text': features.second_token_text
         }
         for features in _iter_line_features(
             features_provider, layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_feature_values = [
         {
             'line_text': 'first1 second1 this is a line',
             'token_text': 'first1',
             'second_token_text': 'second1'
         },
         {
             'line_text': 'first2 second2 this is a line',
             'token_text': 'first2',
             'second_token_text': 'second2'
         },
     ]
     assert feature_values == expected_feature_values
Exemplo n.º 5
0
 def test_should_provide_block_relative_line_length(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check the block-relative line length feature for three line lengths."""
     lines = [
         LayoutLine.for_text(line_text)
         for line_text in ('1', '12', '1234567890')
     ]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=lines)])])
     feature_name = 'str_block_relative_line_length_feature'
     feature_values = [
         {feature_name: features.get_str_block_relative_line_length_feature()}
         for features in _iter_line_features(
             features_provider, layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_lengths = [
         '1',  # 1 * 10 / 10
         '2',  # 2 * 10 / 10
         '10',  # 10 * 10 / 10
     ]
     assert feature_values == [
         {feature_name: expected_length}
         for expected_length in expected_lengths
     ]
Exemplo n.º 6
0
 def test_should_provide_page_and_block_status_for_single_token_blocks(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check page and block status for three single-line blocks on one page."""
     blocks = [
         LayoutBlock.for_text(block_text)
         for block_text in ('line1', 'line2', 'line3')
     ]
     layout_document = LayoutDocument(pages=[LayoutPage(blocks=blocks)])
     feature_values = [
         {
             'page_status': features.get_page_status(),
             'block_status': features.get_block_status()
         }
         for features in _iter_line_features(
             features_provider, layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     # Every block is expected to report BLOCKSTART, while the page status
     # is expected to move from start over in to end.
     expected_page_statuses = ['PAGESTART', 'PAGEIN', 'PAGEEND']
     assert feature_values == [
         {'page_status': page_status, 'block_status': 'BLOCKSTART'}
         for page_status in expected_page_statuses
     ]
Exemplo n.º 7
0
    def test_should_extract_references_fields_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Extract a reference's label, title and raw text and link its citation."""
        other_body = LayoutBlock.for_text('the body')
        citation_block = LayoutBlock.for_text('1')
        body_block = LayoutBlock.merge_blocks([other_body, citation_block])
        label_block = LayoutBlock.for_text('1')
        ref_title_block = LayoutBlock.for_text('Reference Title 1')
        ref_text_block = LayoutBlock.merge_blocks([ref_title_block])
        ref_block = LayoutBlock.merge_blocks([label_block, ref_text_block])
        processor_config = FullTextProcessorConfig(
            extract_citation_fields=True)
        fulltext_processor = FullTextProcessor(
            fulltext_models_mock, processor_config)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock
        reference_segmenter_mock = (
            fulltext_models_mock.reference_segmenter_model_mock)
        citation_mock = fulltext_models_mock.citation_model_mock

        # (model mock, layout block, label), registered in this order
        label_assignments = [
            (segmentation_mock, body_block, '<body>'),
            (segmentation_mock, ref_block, '<references>'),
            (fulltext_mock, other_body, '<section>'),
            (fulltext_mock, citation_block, '<citation_marker>'),
            (reference_segmenter_mock, label_block, '<label>'),
            (reference_segmenter_mock, ref_text_block, '<reference>'),
            (citation_mock, ref_title_block, '<title>'),
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        layout_document = LayoutDocument(
            pages=[LayoutPage(blocks=[body_block, ref_block])])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document))
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None

        reference_lists = list(
            semantic_document.back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_lists) == 1
        references = list(
            reference_lists[0].iter_by_type(SemanticReference))
        assert len(references) == 1
        reference = references[0]
        assert reference.get_text_by_type(
            SemanticTitle) == ref_title_block.text
        assert reference.get_text_by_type(SemanticLabel) == label_block.text
        assert reference.get_text_by_type(
            SemanticRawReferenceText) == ref_text_block.text
        assert reference.content_id == 'b0'

        # The citation in the body is expected to target the reference id.
        citations = list(
            semantic_document.iter_by_type_recursively(
                SemanticReferenceCitation))
        assert len(citations) == 1
        assert citations[0].target_content_id == 'b0'
 def test_should_remove_blank_token(self):
     """Expect no tokens on the line after retokenizing a single-space token."""
     blank_block = LayoutBlock.for_tokens([LayoutToken(' ')])
     document = LayoutDocument(
         pages=[LayoutPage(blocks=[blank_block], graphics=[])])
     retokenized_document = retokenize_layout_document(document)
     first_line = retokenized_document.pages[0].blocks[0].lines[0]
     assert first_line.tokens == []
Exemplo n.º 9
0
 def get_filtered_document_by_labels(self, labels: List[str]):  # pylint: disable=too-many-branches
     """Return a new layout document restricted to the given labels.

     The labelled items are looked up via
     ``self.get_layout_document_labels_by_labels``. If any of them
     references a layout token, filtering happens at token level;
     otherwise it falls back to the referenced layout lines. Tokens and
     lines are matched by object identity (``id``) against
     ``self.layout_document``.

     Pages and blocks are only added to the result once they contain at
     least one accepted line, so the result has no empty pages or blocks.

     Args:
         labels: the labels whose tokens or lines should be kept.

     Returns:
         A new ``LayoutDocument``; it has no pages if nothing matched.
     """
     layout_document = LayoutDocument(pages=[])
     layout_document_labels = self.get_layout_document_labels_by_labels(
         labels)
     if not layout_document_labels:
         LOGGER.warning(
             'no layout_lines_to_include found for: %r, available keys=%r',
             labels, self.layout_document_labels_by_label.keys())
         return layout_document
     layout_token_ids_to_include = {
         id(layout_document_label.layout_token)
         for layout_document_label in layout_document_labels
         if layout_document_label.layout_token
     }
     LOGGER.debug('layout_tokens_to_include: %s',
                  layout_token_ids_to_include)
     # Line-level matching is only used when no token-level labels exist.
     layout_line_ids_to_include: Set[int] = set()
     if not layout_token_ids_to_include:
         layout_line_ids_to_include = {
             id(layout_document_label.layout_line)
             for layout_document_label in layout_document_labels
             if layout_document_label.layout_line
         }
     LOGGER.debug('layout_line_ids_to_include: %s',
                  layout_line_ids_to_include)
     for page in self.layout_document.pages:  # pylint: disable=too-many-nested-blocks
         # Created lazily, on the first accepted line of this page.
         result_page: Optional[LayoutPage] = None
         for block in page.blocks:
             # Created lazily, on the first accepted line of this block.
             result_block: Optional[LayoutBlock] = None
             for line in block.lines:
                 accepted_line: Optional[LayoutLine] = None
                 if layout_token_ids_to_include:
                     accepted_tokens: List[LayoutToken] = [
                         token for token in line.tokens
                         if id(token) in layout_token_ids_to_include
                     ]
                     if not accepted_tokens:
                         continue
                     # Reuse the original line when every token was accepted.
                     # (The count was previously compared with the list
                     # itself, which never matched.)
                     if len(line.tokens) == len(accepted_tokens):
                         accepted_line = line
                     else:
                         accepted_line = LayoutLine(tokens=accepted_tokens)
                 else:
                     if id(line) not in layout_line_ids_to_include:
                         continue
                     accepted_line = line
                 if result_page is None:
                     result_page = LayoutPage(blocks=[])
                     layout_document.pages.append(result_page)
                 if result_block is None:
                     result_block = LayoutBlock(lines=[])
                     result_page.blocks.append(result_block)
                 result_block.lines.append(accepted_line)
     return layout_document
 def test_should_not_retokenize_document_with_valid_tokens(self):
     """Expect a single-word token to keep its text after retokenizing."""
     token_block = LayoutBlock.for_tokens([LayoutToken('token1')])
     document = LayoutDocument(
         pages=[LayoutPage(blocks=[token_block], graphics=[])])
     retokenized_document = retokenize_layout_document(document)
     first_line = retokenized_document.pages[0].blocks[0].lines[0]
     token_texts = [token.text for token in first_line.tokens]
     assert token_texts == ['token1']
 def test_should_provide_empty_list_for_empty_pages(self):
     """Expect no page numbers for a page without blocks."""
     page_coordinates = LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
     page_meta = LayoutPageMeta(page_number=1, coordinates=page_coordinates)
     empty_page = LayoutPage(blocks=[], meta=page_meta)
     document = LayoutDocument(pages=[empty_page])
     page_numbers = get_page_numbers_with_mostly_bitmap_graphics(document)
     assert page_numbers == []
 def test_should_preserve_meta(self):
     """Expect ``remove_empty_blocks`` to keep the page meta unchanged."""
     page_meta = LayoutPageMeta(COORDINATES_1)
     # Mix a non-empty block with an empty one so there is something to remove.
     page = LayoutPage(
         blocks=[LayoutBlock.for_text('token1'), EMPTY_BLOCK],
         graphics=[],
         meta=page_meta)
     cleaned_document = remove_empty_blocks(LayoutDocument(pages=[page]))
     first_page = cleaned_document.pages[0]
     assert first_page.meta == page_meta
 def test_should_provide_page_number_with_uncomment_page_dimension(self):
     """Expect only the page whose coordinates differ to be reported."""
     # Pages 1 and 3 share the same base coordinates, page 2 uses others.
     base_coordinates_by_page_number = [
         (1, LAYOUT_PAGE_COORDINATES_1),
         (2, LAYOUT_PAGE_COORDINATES_2),
         (3, LAYOUT_PAGE_COORDINATES_1),
     ]
     pages = [
         LayoutPage(
             blocks=[],
             meta=LayoutPageMeta(
                 page_number=page_number,
                 coordinates=base_coordinates._replace(
                     page_number=page_number)))
         for page_number, base_coordinates in base_coordinates_by_page_number
     ]
     document = LayoutDocument(pages=pages)
     page_numbers = get_page_numbers_with_uncommon_page_dimension(document)
     assert page_numbers == [2]
 def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
         self):
     """Replace the token and graphic covered by a semantic graphic.

     The token and graphic inside the semantic graphic's bounding box are
     expected to be gone from the page, with the semantic graphic's layout
     graphic appended last and its related block holding both tokens.
     """

     def _page_1_coordinates(x, y, width, height):
         # All coordinates in this test are located on page 1.
         return LayoutPageCoordinates.from_bounding_box(
             BoundingBox(x, y, width, height), page_number=1)

     page_coordinates = _page_1_coordinates(0, 0, 200, 200)
     semantic_graphic_coordinates = _page_1_coordinates(10, 90, 100, 50)
     keep_coordinates = _page_1_coordinates(10, 10, 100, 20)
     remove_coordinates = _page_1_coordinates(10, 100, 100, 20)
     empty_coordinates = _page_1_coordinates(10, 100, 0, 0)

     keep_token = LayoutToken('keep', coordinates=keep_coordinates)
     remove_token = LayoutToken('remove', coordinates=remove_coordinates)
     keep_graphic = LayoutGraphic(
         coordinates=keep_coordinates, graphic_type='keep-graphic')
     remove_graphic = LayoutGraphic(
         coordinates=remove_coordinates, graphic_type='remove-graphic')

     token_block = LayoutBlock(
         lines=[LayoutLine(tokens=[keep_token, remove_token])])
     page_meta = LayoutPageMeta(
         page_number=page_coordinates.page_number,
         coordinates=page_coordinates)
     page = LayoutPage(
         blocks=[token_block],
         graphics=[keep_graphic, remove_graphic],
         meta=page_meta)
     layout_document = LayoutDocument(pages=[page])

     layout_graphic = LayoutGraphic(
         coordinates=semantic_graphic_coordinates,
         graphic_type='new-graphic')
     no_coords_layout_graphic = LayoutGraphic(
         coordinates=empty_coordinates, graphic_type='empty-coords-graphic')
     semantic_graphics = [
         SemanticGraphic(layout_graphic=layout_graphic),
         SemanticGraphic(layout_graphic=no_coords_layout_graphic)
     ]
     result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
         layout_document, semantic_graphics=semantic_graphics)

     result_page = result.pages[0]
     LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
     assert result_page.graphics[:-1] == [keep_graphic]
     replacement_graphic = result_page.graphics[-1]
     LOGGER.debug('result.pages[0].graphics[-1]: %r', replacement_graphic)
     assert replacement_graphic.graphic_type == layout_graphic.graphic_type
     assert replacement_graphic.coordinates == layout_graphic.coordinates
     remaining_tokens = list(result_page.blocks[0].iter_all_tokens())
     assert remaining_tokens == [keep_token]
     related_tokens = list(
         replacement_graphic.related_block.iter_all_tokens())
     assert related_tokens == [keep_token, remove_token]
Exemplo n.º 15
0
def _create_page(
    coordinates: LayoutPageCoordinates,
    graphics: Optional[Sequence[LayoutGraphic]] = None
) -> LayoutPage:
    """Create a page without blocks for the given page coordinates.

    The page number of the page meta is taken from ``coordinates``.
    A missing (or empty) ``graphics`` argument results in an empty list.
    """
    page_meta = LayoutPageMeta(
        page_number=coordinates.page_number,
        coordinates=coordinates
    )
    page_graphics = graphics if graphics else []
    return LayoutPage(meta=page_meta, blocks=[], graphics=page_graphics)
Exemplo n.º 16
0
 def test_should_provide_repetitive_pattern_feature(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check the repetitive pattern flags for a line repeated on two pages."""
     first_page = LayoutPage(blocks=[
         LayoutBlock.for_text('this is repetitive'),
         LayoutBlock.for_text('this is not')
     ])
     second_page = LayoutPage(blocks=[
         LayoutBlock.for_text('this is repetitive'),
         LayoutBlock.for_text('it is different')
     ])
     layout_document = LayoutDocument(pages=[first_page, second_page])
     feature_values = [
         {
             'get_str_is_repetitive_pattern':
             features.get_str_is_repetitive_pattern(),
             'get_str_is_first_repetitive_pattern':
             features.get_str_is_first_repetitive_pattern()
         }
         for features in _iter_line_features(
             features_provider, layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     # (is repetitive, is first repetitive) for each line in document order
     expected_flags = [('1', '1'), ('0', '0'), ('1', '0'), ('0', '0')]
     assert feature_values == [
         {
             'get_str_is_repetitive_pattern': is_repetitive,
             'get_str_is_first_repetitive_pattern': is_first_repetitive
         }
         for is_repetitive, is_first_repetitive in expected_flags
     ]
 def test_should_preserve_meta(self):
     """Expect ``retokenize_layout_document`` to keep the page meta unchanged."""
     page_meta = LayoutPageMeta(COORDINATES_1)
     # A token containing a space, matching the input of the split test.
     two_word_block = LayoutBlock.for_tokens([LayoutToken('token1 token2')])
     page = LayoutPage(
         blocks=[two_word_block],
         graphics=[],
         meta=page_meta)
     retokenized_document = retokenize_layout_document(
         LayoutDocument(pages=[page]))
     first_page = retokenized_document.pages[0]
     assert first_page.meta == page_meta
Exemplo n.º 18
0
    def test_should_extract_table_label_caption_from_body(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels,
            segmentation_label: str):
        """Extract a table's label and caption and link the table citation."""
        citation_block = LayoutBlock.for_text('Table 1')
        label_block = LayoutBlock.for_text('Table 1')
        caption_block = LayoutBlock.for_text('Caption 1')
        other_block = LayoutBlock.for_text('Other')
        figure_block = LayoutBlock.merge_blocks(
            [label_block, other_block, caption_block])
        fulltext_block = LayoutBlock.merge_blocks(
            [citation_block, figure_block])
        processor_config = FullTextProcessorConfig(extract_table_fields=True)
        fulltext_processor = FullTextProcessor(
            fulltext_models_mock, processor_config)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock
        table_mock = fulltext_models_mock.table_model_mock

        # (model mock, layout block, label), registered in this order
        label_assignments = [
            (segmentation_mock, fulltext_block, segmentation_label),
            (fulltext_mock, citation_block, '<table_marker>'),
            (fulltext_mock, figure_block, '<table>'),
            (table_mock, label_block, '<label>'),
            (table_mock, caption_block, '<figDesc>'),
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        layout_document = LayoutDocument(
            pages=[LayoutPage(blocks=[fulltext_block])])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document))
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None

        # The table is searched for in both the body and the back section.
        searched_sections = [
            semantic_document.body_section, semantic_document.back_section
        ]
        tables = list(
            iter_by_semantic_type_recursively(
                searched_sections, SemanticTable))
        assert len(tables) == 1
        table = tables[0]
        assert table.get_text_by_type(SemanticLabel) == label_block.text
        assert table.get_text_by_type(SemanticCaption) == caption_block.text
        assert table.content_id == 'tab_0'

        table_citations = list(
            semantic_document.iter_by_type_recursively(SemanticTableCitation))
        assert len(table_citations) == 1
        table_citation = table_citations[0]
        assert table_citation.get_text() == citation_block.text
        assert table_citation.target_content_id == 'tab_0'
Exemplo n.º 19
0
def get_layout_page_with_text_or_graphic_replaced_by_graphic(
        layout_page: LayoutPage, semantic_graphic: SemanticGraphic,
        is_only_semantic_graphic_on_page: bool,
        is_replace_overlapping_text: bool) -> LayoutPage:
    """Return a copy of the page with the semantic graphic's graphic added.

    Existing page graphics for which
    ``is_layout_graphic_within_bounding_box`` is true for the semantic
    graphic's bounding box are dropped, and the semantic graphic's layout
    graphic is appended after the remaining ones.

    Args:
        layout_page: the page to derive the modified page from.
        semantic_graphic: must have a layout graphic with coordinates
            (both are asserted).
        is_only_semantic_graphic_on_page: if true, the appended graphic
            gets a related block built from all tokens of the page.
        is_replace_overlapping_text: if true, the page tokens are passed
            through ``_remove_tokens_within_bounding_box_flatmap_fn``
            and empty blocks are removed afterwards.

    Returns:
        The modified layout page.
    """
    layout_graphic = semantic_graphic.layout_graphic
    assert layout_graphic
    assert layout_graphic.coordinates
    graphic_bounding_box = layout_graphic.coordinates.bounding_box

    if is_only_semantic_graphic_on_page:
        all_page_tokens = list(layout_page.iter_all_tokens())
        layout_graphic = layout_graphic._replace(
            related_block=LayoutBlock.for_tokens(all_page_tokens))

    page_graphics = [
        existing_graphic
        for existing_graphic in layout_page.graphics
        if not is_layout_graphic_within_bounding_box(
            existing_graphic, bounding_box=graphic_bounding_box)
    ]
    page_graphics.append(layout_graphic)
    page_with_graphic = layout_page.replace(graphics=page_graphics)

    if not is_replace_overlapping_text:
        return page_with_graphic

    remove_tokens_fn = functools.partial(
        _remove_tokens_within_bounding_box_flatmap_fn,
        bounding_box=graphic_bounding_box)
    page_without_text = page_with_graphic.flat_map_layout_tokens(
        remove_tokens_fn)
    return page_without_text.remove_empty_blocks()
Exemplo n.º 20
0
    def test_should_extract_editor_names_from_references_fields(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Extract the given name and surname of a reference's editor."""
        given_name_block = LayoutBlock.for_text('Given name')
        surname_block = LayoutBlock.for_text('Surname')
        other_block = LayoutBlock.for_text('Other')
        editors_block = LayoutBlock.merge_blocks(
            [given_name_block, other_block, surname_block])
        ref_text_block = LayoutBlock.merge_blocks([editors_block])
        ref_block = LayoutBlock.merge_blocks([ref_text_block])
        # Only editors are requested; author extraction is switched off.
        processor_config = FullTextProcessorConfig(
            extract_citation_fields=True,
            extract_citation_authors=False,
            extract_citation_editors=True)
        fulltext_processor = FullTextProcessor(
            fulltext_models_mock, processor_config)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        reference_segmenter_mock = (
            fulltext_models_mock.reference_segmenter_model_mock)
        citation_mock = fulltext_models_mock.citation_model_mock
        name_citation_mock = fulltext_models_mock.name_citation_model_mock

        # (model mock, layout block, label), registered in this order
        label_assignments = [
            (segmentation_mock, ref_block, '<references>'),
            (reference_segmenter_mock, ref_text_block, '<reference>'),
            (citation_mock, editors_block, '<editor>'),
            (name_citation_mock, given_name_block, '<forename>'),
            (name_citation_mock, surname_block, '<surname>'),
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        layout_document = LayoutDocument(
            pages=[LayoutPage(blocks=[ref_block])])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document))
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None

        reference_lists = list(
            semantic_document.back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_lists) == 1
        references = list(
            reference_lists[0].iter_by_type(SemanticReference))
        assert len(references) == 1
        editors = list(references[0].iter_by_type(SemanticEditor))
        assert len(editors) == 1
        editor = editors[0]
        assert editor.given_name_text == given_name_block.text
        assert editor.surname_text == surname_block.text
Exemplo n.º 21
0
    def test_should_extract_affiliation_address_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Extract marker, institution and country of a header affiliation."""
        marker_block = LayoutBlock.for_text('1')
        institution_block = LayoutBlock.for_text('Institution1')
        country_block = LayoutBlock.for_text('Country1')
        aff_block = LayoutBlock.merge_blocks([marker_block, institution_block])
        address_block = LayoutBlock.merge_blocks([country_block])
        aff_address_block = LayoutBlock.merge_blocks(
            [aff_block, address_block])
        fulltext_processor = FullTextProcessor(fulltext_models_mock)
        # The header consists of nothing but the affiliation and address.
        header_block = aff_address_block

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        header_mock = fulltext_models_mock.header_model_mock
        affiliation_address_mock = (
            fulltext_models_mock.affiliation_address_model_mock)

        # (model mock, layout block, label), registered in this order
        label_assignments = [
            (segmentation_mock, header_block, '<header>'),
            (header_mock, aff_block, '<affiliation>'),
            (header_mock, address_block, '<address>'),
            (affiliation_address_mock, marker_block, '<marker>'),
            (affiliation_address_mock, institution_block, '<institution>'),
            (affiliation_address_mock, country_block, '<country>'),
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        layout_document = LayoutDocument(
            pages=[LayoutPage(blocks=[header_block])])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document))
        assert semantic_document is not None

        front = semantic_document.front
        assert front.get_text() == aff_address_block.text
        affiliation_view_text = front.view_by_type(
            SemanticAffiliationAddress).get_text()
        assert affiliation_view_text == aff_address_block.text
        affiliations = list(front.iter_by_type(SemanticAffiliationAddress))
        assert len(affiliations) == 1
        affiliation = affiliations[0]
        assert affiliation.get_text_by_type(
            SemanticMarker) == marker_block.text
        assert affiliation.get_text_by_type(
            SemanticInstitution) == institution_block.text
        assert affiliation.get_text_by_type(
            SemanticCountry) == country_block.text
        assert affiliation.content_id == 'aff0'
 def test_should_ignore_small_bitmap(self):
     """Expect no page numbers for a page with only a 1x1 image graphic."""
     tiny_image = LayoutGraphic(
         graphic_type='image',
         coordinates=LAYOUT_PAGE_COORDINATES_1._replace(
             page_number=1, width=1, height=1))
     page_meta = LayoutPageMeta(
         page_number=1,
         coordinates=LAYOUT_PAGE_COORDINATES_1._replace(page_number=1))
     page = LayoutPage(blocks=[], graphics=[tiny_image], meta=page_meta)
     document = LayoutDocument(pages=[page])
     page_numbers = get_page_numbers_with_mostly_bitmap_graphics(document)
     assert page_numbers == []
Exemplo n.º 23
0
    def test_should_extract_invalid_reference_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Expect an invalid reference when the citation model only labels 'O'."""
        other_body = LayoutBlock.for_text('the body')
        citation_block = LayoutBlock.for_text('1')
        body_block = LayoutBlock.merge_blocks([other_body, citation_block])
        invalid_reference_block = LayoutBlock.for_text(
            'This is an invalid reference 1')
        # The reference text and the references area are the very same block.
        ref_text_block = invalid_reference_block
        ref_block = ref_text_block
        processor_config = FullTextProcessorConfig(
            extract_citation_fields=True)
        fulltext_processor = FullTextProcessor(
            fulltext_models_mock, processor_config)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock
        reference_segmenter_mock = (
            fulltext_models_mock.reference_segmenter_model_mock)
        citation_mock = fulltext_models_mock.citation_model_mock

        # (model mock, layout block, label), registered in this order
        label_assignments = [
            (segmentation_mock, body_block, '<body>'),
            (segmentation_mock, ref_block, '<references>'),
            (fulltext_mock, other_body, '<section>'),
            (fulltext_mock, citation_block, '<citation_marker>'),
            (reference_segmenter_mock, ref_text_block, '<reference>'),
            (citation_mock, invalid_reference_block, 'O'),
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        layout_document = LayoutDocument(
            pages=[LayoutPage(blocks=[body_block, ref_block])])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document))
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None

        reference_lists = list(
            semantic_document.back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_lists) == 1
        invalid_references = list(
            reference_lists[0].iter_by_type(SemanticInvalidReference))
        assert len(invalid_references) == 1
        assert invalid_references[0].get_text() == invalid_reference_block.text
Exemplo n.º 24
0
    def test_should_extract_from_document(
            self, fulltext_models_mock: MockFullTextModels):
        """Expect header, body, acknowledgement and annex text in their sections."""
        fulltext_processor = FullTextProcessor(fulltext_models_mock)
        header_block = LayoutBlock.for_text('This is the header')
        body_block = LayoutBlock.for_text('This is the body')
        acknowledgment_block = LayoutBlock.for_text('Some acknowledgement')
        back_block = LayoutBlock.for_text('This is the back')

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        header_mock = fulltext_models_mock.header_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock

        # (model mock, layout block, label), registered in this order
        label_assignments = [
            (segmentation_mock, header_block, '<header>'),
            (segmentation_mock, body_block, '<body>'),
            (segmentation_mock, acknowledgment_block, '<acknowledgement>'),
            (segmentation_mock, back_block, '<annex>'),
            (header_mock, header_block, '<title>'),
            (fulltext_mock, body_block, '<paragraph>'),
            (fulltext_mock, acknowledgment_block, '<paragraph>'),
            (fulltext_mock, back_block, '<paragraph>'),
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        all_blocks = [
            header_block, body_block, acknowledgment_block, back_block
        ]
        layout_document = LayoutDocument(
            pages=[LayoutPage(blocks=all_blocks)])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document))
        assert semantic_document is not None

        front = semantic_document.front
        assert front.get_text() == header_block.text
        assert front.get_text_by_type(SemanticTitle) == header_block.text
        assert semantic_document.body_section.get_text() == body_block.text
        # The back section is checked per section type.
        back_section = semantic_document.back_section
        other_view = back_section.view_by_section_type(
            SemanticSectionTypes.OTHER)
        assert other_view.get_text() == back_block.text
        acknowledgement_view = back_section.view_by_section_type(
            SemanticSectionTypes.ACKNOWLEDGEMENT)
        assert acknowledgement_view.get_text() == acknowledgment_block.text
Exemplo n.º 25
0
    def __init__(self) -> None:
        """Create the sample layout blocks and the single-page layout
        document assembled from them.

        Every intermediate block is stored as an attribute of the instance.
        """
        # Header: title, one author and one affiliation.
        title_block = LayoutBlock.for_text('This is the title')
        author_surname_block = LayoutBlock.for_text('Author Surname 1')
        author_block = LayoutBlock.merge_blocks([author_surname_block])
        institution_block = LayoutBlock.for_text('Institution 1')
        affiliation_block = LayoutBlock.merge_blocks([institution_block])
        header_block = LayoutBlock.merge_blocks(
            [title_block, author_block, affiliation_block])

        # Body: section title, paragraph, one figure and one table.
        figure_head_block = LayoutBlock.for_text('Figure 1')
        figure_block = LayoutBlock.merge_blocks([figure_head_block])
        table_head_block = LayoutBlock.for_text('Table 1')
        table_block = LayoutBlock.merge_blocks([table_head_block])
        body_section_title_block = LayoutBlock.for_text('Section 1')
        body_section_paragraph_block = LayoutBlock.for_text('Paragraph 1')
        body_block = LayoutBlock.merge_blocks([
            body_section_title_block,
            body_section_paragraph_block,
            figure_block,
            table_block,
        ])

        # Reference: label followed by the reference text (title + author).
        ref_author_surname_block = LayoutBlock.for_text(
            'Ref Author Surname 1')
        ref_author_block = LayoutBlock.merge_blocks(
            [ref_author_surname_block])
        ref_label_block = LayoutBlock.for_text('1')
        ref_title_block = LayoutBlock.for_text('Reference 1')
        ref_text_block = LayoutBlock.merge_blocks(
            [ref_title_block, ref_author_block])
        ref_ref_block = LayoutBlock.merge_blocks(
            [ref_label_block, ref_text_block])

        self.title_block = title_block
        self.author_surname_block = author_surname_block
        self.author_block = author_block
        self.institution_block = institution_block
        self.affiliation_block = affiliation_block
        self.header_block = header_block

        self.figure_head_block = figure_head_block
        self.figure_block = figure_block
        self.table_head_block = table_head_block
        self.table_block = table_block
        self.body_section_title_block = body_section_title_block
        self.body_section_paragraph_block = body_section_paragraph_block
        self.body_block = body_block

        self.ref_author_surname_block = ref_author_surname_block
        self.ref_author_block = ref_author_block
        self.ref_label_block = ref_label_block
        self.ref_title_block = ref_title_block
        self.ref_text_block = ref_text_block
        self.ref_ref_block = ref_ref_block

        # One page holding header, body and reference, in that order.
        single_page = LayoutPage(
            blocks=[header_block, body_block, ref_ref_block])
        self.layout_document = LayoutDocument(pages=[single_page])
Exemplo n.º 26
0
 def test_should_filter_by_line_without_token(self):
     """Filtering by label should work when the labels reference only the
     layout line and carry no layout token."""
     first_line = LayoutLine.for_text('this is line 1')
     second_line = LayoutLine.for_text('this is line 2')
     tagged_lines = [(TAG_1, first_line), (TAG_2, second_line)]

     layout_model_labels = []
     for tag, line in tagged_lines:
         # One label per token of the line, each with the line's text and
         # no token attached.
         for _ in line.tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=line.text,
                                  layout_line=line,
                                  layout_token=None))

     all_lines = [tagged_line for _, tagged_line in tagged_lines]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=all_lines)])])
     layout_document_label_result = LayoutDocumentLabelResult(
         layout_document, layout_model_labels)

     for tag, line in tagged_lines:
         filtered_document = (
             layout_document_label_result.get_filtered_document_by_label(
                 tag))
         actual = join_layout_tokens(filtered_document.iter_all_tokens())
         assert actual == join_layout_tokens(line.tokens)
Exemplo n.º 27
0
 def test_should_provide_block_relative_document_token_position(
         self, features_provider: SegmentationLineFeaturesProvider):
     """For ten lines in one block, the relative document position feature
     of line ``i`` should equal the scaled value of ``i`` out of 10."""
     line_count = 10
     lines = [LayoutLine.for_text(f'line{i}') for i in range(line_count)]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=lines)])])

     feature_values = [
         {
             'str_relative_document_position':
             features.get_str_relative_document_position()
         }
         for features in _iter_line_features(features_provider,
                                             layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)

     expected_feature_values = []
     for i in range(line_count):
         scaled_position = feature_linear_scaling_int(
             i, line_count, NBBINS_POSITION)
         expected_feature_values.append(
             {'str_relative_document_position': str(scaled_position)})
     assert feature_values == expected_feature_values
Exemplo n.º 28
0
 def test_should_filter_by_token_label(self):
     """Filtering by a single label should keep exactly the tokens that
     were tagged with that label."""
     tagged_tokens = [
         (TAG_1, get_layout_tokens_for_text('this is line 1')),
         (TAG_2, get_layout_tokens_for_text('this is line 2')),
     ]

     # All tokens share one line, regardless of their tag.
     all_tokens = []
     for _, tag_tokens in tagged_tokens:
         all_tokens.extend(tag_tokens)
     line = LayoutLine(all_tokens)

     layout_model_labels = []
     for tag, tag_tokens in tagged_tokens:
         for token in tag_tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=token.text,
                                  layout_line=line,
                                  layout_token=token))

     page = LayoutPage(blocks=[LayoutBlock(lines=[line])])
     layout_document = LayoutDocument(pages=[page])
     layout_document_label_result = LayoutDocumentLabelResult(
         layout_document, layout_model_labels)

     for tag, tag_tokens in tagged_tokens:
         filtered_document = (
             layout_document_label_result.get_filtered_document_by_label(
                 tag))
         actual = join_layout_tokens(filtered_document.iter_all_tokens())
         assert actual == join_layout_tokens(tag_tokens)
Exemplo n.º 29
0
    def test_should_extract_acknowledgement_only(
            self, fulltext_models_mock: MockFullTextModels):
        """A document consisting of a single acknowledgement block should
        expose that text through the acknowledgement view of the back
        section."""
        fulltext_processor = FullTextProcessor(fulltext_models_mock)
        acknowledgment_block = LayoutBlock.for_text('Some acknowledgement')

        # Fetch each model mock exactly once, in the same order as before.
        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock

        segmentation_mock.update_label_by_layout_block(
            acknowledgment_block, '<acknowledgement>')
        fulltext_mock.update_label_by_layout_block(
            acknowledgment_block, '<paragraph>')

        only_page = LayoutPage(blocks=[acknowledgment_block])
        layout_document = LayoutDocument(pages=[only_page])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document))
        assert semantic_document is not None

        acknowledgement_view = (
            semantic_document.back_section.view_by_section_type(
                SemanticSectionTypes.ACKNOWLEDGEMENT))
        assert acknowledgement_view.get_text() == acknowledgment_block.text
Exemplo n.º 30
0
 def test_should_filter_by_token_multiple_labels(self):
     """Filtering by two of three labels should keep the tokens of those
     two labels and drop the tokens of the third."""
     tag_1_tokens = get_layout_tokens_for_text('tokens tag 1')
     tag_2_tokens = get_layout_tokens_for_text('tokens tag 2')
     tag_3_tokens = get_layout_tokens_for_text('tokens tag 3')
     tagged_tokens = [
         (TAG_1, tag_1_tokens),
         (TAG_2, tag_2_tokens),
         (TAG_3, tag_3_tokens),
     ]

     # All tokens share one line, regardless of their tag.
     all_tokens = []
     for _, tag_tokens in tagged_tokens:
         all_tokens.extend(tag_tokens)
     line = LayoutLine(all_tokens)

     layout_model_labels = []
     for tag, tag_tokens in tagged_tokens:
         for token in tag_tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=token.text,
                                  layout_line=line,
                                  layout_token=token))

     page = LayoutPage(blocks=[LayoutBlock(lines=[line])])
     layout_document = LayoutDocument(pages=[page])
     layout_document_label_result = LayoutDocumentLabelResult(
         layout_document, layout_model_labels)

     filtered_document = (
         layout_document_label_result.get_filtered_document_by_labels(
             [TAG_1, TAG_3]))
     actual = join_layout_tokens(filtered_document.iter_all_tokens())
     # TAG_2 tokens are expected to be filtered out.
     assert actual == join_layout_tokens(tag_1_tokens + tag_3_tokens)