Пример #1
0
 def test_should_provide_page_and_block_status_for_multi_line_blocks(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check page and block status for a single block holding three lines."""
     block_lines = [
         LayoutLine.for_text(line_text)
         for line_text in ('line1', 'line2', 'line3')
     ]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=block_lines)])])
     feature_values = [
         {
             'page_status': line_features.get_page_status(),
             'block_status': line_features.get_block_status(),
         }
         for line_features in _iter_line_features(features_provider,
                                                  layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     # (page status, block status) expected for the first, middle and last line
     expected_statuses = [
         ('PAGESTART', 'BLOCKSTART'),
         ('PAGEIN', 'BLOCKIN'),
         ('PAGEEND', 'BLOCKEND'),
     ]
     assert feature_values == [
         {'page_status': page_status, 'block_status': block_status}
         for page_status, block_status in expected_statuses
     ]
Пример #2
0
 def test_should_provide_block_relative_line_length(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check the block relative line length feature for lines of 1, 2 and 10 characters."""
     line_texts = ['1', '12', '1234567890']
     layout_document = LayoutDocument(pages=[
         LayoutPage(blocks=[
             LayoutBlock(lines=[
                 LayoutLine.for_text(line_text) for line_text in line_texts
             ])
         ])
     ])
     feature_key = 'str_block_relative_line_length_feature'
     feature_values = [
         {
             feature_key:
             line_features.get_str_block_relative_line_length_feature()
         }
         for line_features in _iter_line_features(features_provider,
                                                  layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_values = [
         '1',  # 1 * 10 / 10
         '2',  # 2 * 10 / 10
         '10',  # 10 * 10 / 10
     ]
     assert feature_values == [
         {feature_key: expected_value} for expected_value in expected_values
     ]
Пример #3
0
 def test_should_provide_line_text(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check the line text plus the first and second token text of each line."""
     line_texts = [
         'first1 second1 this is a line',
         'first2 second2 this is a line',
     ]
     block = LayoutBlock(
         lines=[LayoutLine.for_text(line_text) for line_text in line_texts])
     layout_document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
     feature_values = [
         {
             'line_text': line_features.line_text,
             'token_text': line_features.token_text,
             'second_token_text': line_features.second_token_text,
         }
         for line_features in _iter_line_features(features_provider,
                                                  layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_first_line = {
         'line_text': 'first1 second1 this is a line',
         'token_text': 'first1',
         'second_token_text': 'second1',
     }
     expected_second_line = {
         'line_text': 'first2 second2 this is a line',
         'token_text': 'first2',
         'second_token_text': 'second2',
     }
     assert feature_values == [expected_first_line, expected_second_line]
 def test_should_add_line_feeds(self):
     """Check that the text of the AUTHOR_XPATH node joins both lines with a line feed."""
     line_texts = [TEXT_1, TEXT_2]
     block = LayoutBlock(lines=[
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in line_texts
     ])
     layout_document = LayoutDocument.for_blocks([block])
     xml_root = get_training_tei_xml_for_layout_document(layout_document)
     nodes = tei_xpath(xml_root, AUTHOR_XPATH)
     assert len(nodes) == 1
     # trailing whitespace is ignored, only the separator between lines matters
     actual_text = get_text_content(nodes[0]).rstrip()
     assert actual_text == '\n'.join(line_texts)
 def test_should_preserve_empty_pages_if_requested(self):
     """Check that a page without tokens is kept when preserve_empty_pages=True."""
     non_empty_page = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     empty_page = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     layout_document = LayoutDocument(pages=[non_empty_page, empty_page])
     cleaned_layout_document = remove_empty_blocks(
         layout_document, preserve_empty_pages=True)
     page_count = len(cleaned_layout_document.pages)
     assert page_count == 2
 def test_should_remove_empty_line_block_and_page(self):
     """Check that a page whose only line has no tokens is dropped by default."""
     non_empty_page = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     empty_page = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     layout_document = LayoutDocument(pages=[non_empty_page, empty_page])
     cleaned_layout_document = remove_empty_blocks(layout_document)
     remaining_pages = cleaned_layout_document.pages
     assert len(remaining_pages) == 1
     # the surviving page must still carry the original token
     remaining_line = remaining_pages[0].blocks[0].lines[0]
     remaining_token_texts = [token.text for token in remaining_line.tokens]
     assert remaining_token_texts == ['token1']
 def test_should_lb_elements_before_line_feeds(self):
     """Check that each line is followed by a tei:lb element and then the line feed."""
     block_lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     xml_root = get_training_tei_xml_for_layout_document(layout_document)
     nodes = tei_xpath(xml_root, AUTHOR_XPATH)
     assert len(nodes) == 1
     lb_nodes = tei_xpath(nodes[0], 'tei:lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     # the first line precedes the lb element, the line feed follows it
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
 def test_should_add_line_feeds(self):
     """Check that the './text/listBibl' node text joins both lines with a line feed."""
     line_texts = [TEXT_1, TEXT_2]
     training_data_generator = get_tei_training_data_generator()
     layout_document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[
             LayoutLine.for_text(line_text, tail_whitespace='\n')
             for line_text in line_texts
         ])
     ])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     text_nodes = xml_root.xpath('./text/listBibl')
     assert len(text_nodes) == 1
     actual_text = get_text_content(text_nodes[0]).rstrip()
     assert actual_text == '\n'.join(line_texts)
Пример #9
0
 def test_should_add_line_feeds(self):
     """Check that the AFFILIATION_XPATH node text joins both lines with a line feed."""
     line_texts = [TEXT_1, TEXT_2]
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     layout_document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[
             LayoutLine.for_text(line_text, tail_whitespace='\n')
             for line_text in line_texts
         ])
     ])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     actual_text = get_text_content(aff_nodes[0]).rstrip()
     assert actual_text == '\n'.join(line_texts)
Пример #10
0
 def test_should_lb_elements_before_line_feeds(self):
     """Check that the affiliation node holds a tei:lb element ahead of each line feed."""
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     block_lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     lb_nodes = tei_xpath(aff_nodes[0], 'tei:lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     # the first line precedes the lb element, the line feed follows it
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
 def test_should_lb_elements_before_line_feeds(self):
     """Check that the './text/listBibl' node holds an lb element ahead of each line feed."""
     training_data_generator = get_tei_training_data_generator()
     block_lines = [
         LayoutLine.for_text(line_text, tail_whitespace='\n')
         for line_text in (TEXT_1, TEXT_2)
     ]
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=block_lines)])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     text_nodes = xml_root.xpath('./text/listBibl')
     assert len(text_nodes) == 1
     lb_nodes = text_nodes[0].xpath('lb')
     assert len(lb_nodes) == 2
     first_lb_node = lb_nodes[0]
     # the first line precedes the lb element, the line feed follows it
     assert first_lb_node.getparent().text == TEXT_1
     assert first_lb_node.tail == '\n' + TEXT_2
Пример #12
0
def iter_layout_lines_from_layout_tokens(
        layout_tokens: Iterable[LayoutToken]) -> Iterable[LayoutLine]:
    """Group consecutive tokens into ``LayoutLine`` objects by their line id.

    A token joins the pending line when its ``line_descriptor.line_id``
    equals the one of the first pending token; otherwise the pending line is
    yielded and a new one is started with that token. Nothing is yielded for
    an empty input.
    """
    pending_tokens: List[LayoutToken] = []
    for layout_token in layout_tokens:
        if pending_tokens:
            first_pending_token = pending_tokens[0]
            is_same_line = (
                layout_token.line_descriptor.line_id ==
                first_pending_token.line_descriptor.line_id)
            if is_same_line:
                LOGGER.debug('line id matching: %r - %r', layout_token,
                             first_pending_token)
            else:
                # line id changed: emit the finished line and start afresh
                yield LayoutLine(tokens=pending_tokens)
                pending_tokens = []
        pending_tokens.append(layout_token)
    if pending_tokens:
        # flush the last line that has not been emitted yet
        yield LayoutLine(tokens=pending_tokens)
Пример #13
0
 def parse_line(self, line_node: etree.ElementBase,
                page_number: int) -> LayoutLine:
     """Build a ``LayoutLine`` from every ``alto:String`` descendant of *line_node*.

     Each token receives its own ``LayoutLineDescriptor`` whose ``line_id``
     is ``id(line_node)``.
     """
     line_id = id(line_node)
     tokens = []
     for token_node in alto_xpath(line_node, './/alto:String'):
         line_descriptor = LayoutLineDescriptor(line_id=line_id)
         tokens.append(
             self.parse_token(token_node,
                              page_number=page_number,
                              layout_line_descriptor=line_descriptor))
     return LayoutLine(tokens=tokens)
Пример #14
0
 def get_filtered_document_by_labels(self, labels: List[str]):  # pylint: disable=too-many-branches
     """Return a new ``LayoutDocument`` restricted to content tagged with *labels*.

     The labels are looked up via ``get_layout_document_labels_by_labels``.
     If any of the matching labels references a layout token, filtering
     happens at token level: only those tokens are kept. Otherwise filtering
     happens at line level, using the layout lines referenced by the labels.

     Pages and blocks are only added to the result once they receive at
     least one accepted line, so the result contains no empty pages or
     blocks.

     Args:
         labels: the labels whose tokens / lines should be included.

     Returns:
         A new ``LayoutDocument``; it has no pages when no label matched
         (a warning is logged in that case).
     """
     layout_document = LayoutDocument(pages=[])
     layout_document_labels = self.get_layout_document_labels_by_labels(
         labels)
     if not layout_document_labels:
         LOGGER.warning(
             'no layout_lines_to_include found for: %r, available keys=%r',
             labels, self.layout_document_labels_by_label.keys())
         return layout_document
     # Objects are matched by identity (id), not by equality.
     layout_token_ids_to_include = {
         id(layout_document_label.layout_token)
         for layout_document_label in layout_document_labels
         if layout_document_label.layout_token
     }
     LOGGER.debug('layout_tokens_to_include: %s',
                  layout_token_ids_to_include)
     # Line ids are only used as a fallback when no label carries a token.
     layout_line_ids_to_include: Set[int] = set()
     if not layout_token_ids_to_include:
         layout_line_ids_to_include = {
             id(layout_document_label.layout_line)
             for layout_document_label in layout_document_labels
             if layout_document_label.layout_line
         }
     LOGGER.debug('layout_line_ids_to_include: %s',
                  layout_line_ids_to_include)
     result_page: Optional[LayoutPage] = None
     for page in self.layout_document.pages:  # pylint: disable=too-many-nested-blocks
         result_page = None
         result_block: Optional[LayoutBlock] = None
         for block in page.blocks:
             result_block = None
             for line in block.lines:
                 accepted_line: Optional[LayoutLine] = None
                 if layout_token_ids_to_include:
                     accepted_tokens: List[LayoutToken] = [
                         token for token in line.tokens
                         if id(token) in layout_token_ids_to_include
                     ]
                     if not accepted_tokens:
                         continue
                     # Compare the token counts (the previous code compared
                     # an int with the list itself, which is never equal):
                     # reuse the original line when all its tokens are kept.
                     if len(line.tokens) == len(accepted_tokens):
                         accepted_line = line
                     else:
                         accepted_line = LayoutLine(tokens=accepted_tokens)
                 else:
                     if id(line) not in layout_line_ids_to_include:
                         continue
                     accepted_line = line
                 # Lazily create the result page / block on first accepted line.
                 if result_page is None:
                     result_page = LayoutPage(blocks=[])
                     layout_document.pages.append(result_page)
                 if result_block is None:
                     result_block = LayoutBlock(lines=[])
                     result_page.blocks.append(result_block)
                 result_block.lines.append(accepted_line)
     return layout_document
 def test_should_keep_original_whitespace(self):
     """Check that irregular spacing around tokens survives in the AUTHOR_XPATH node text."""
     text = 'Token1, Token2  ,Token3'
     line = LayoutLine.for_text(text, tail_whitespace='\n')
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=[line])])
     xml_root = get_training_tei_xml_for_layout_document(layout_document)
     nodes = tei_xpath(xml_root, AUTHOR_XPATH)
     assert len(nodes) == 1
     actual_text = get_text_content(nodes[0]).rstrip()
     assert actual_text == text
Пример #16
0
 def test_should_filter_by_line_without_token(self):
     """Check filtering by label when the labels reference lines but no tokens."""
     first_line = LayoutLine.for_text('this is line 1')
     second_line = LayoutLine.for_text('this is line 2')
     tagged_lines = [(TAG_1, first_line), (TAG_2, second_line)]
     layout_model_labels = []
     for tag, line in tagged_lines:
         # one label per token of the line, none of them referencing a token
         for _ in line.tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=line.text,
                                  layout_line=line,
                                  layout_token=None))
     layout_document = LayoutDocument(pages=[
         LayoutPage(blocks=[LayoutBlock(lines=[first_line, second_line])])
     ])
     layout_document_label_result = LayoutDocumentLabelResult(
         layout_document, layout_model_labels)
     for tag, line in tagged_lines:
         filtered_document = (
             layout_document_label_result.get_filtered_document_by_label(tag))
         actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
         assert actual_text == join_layout_tokens(line.tokens)
 def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
         self):
     """Check replacement of tokens and graphics by a semantic graphic.

     The page holds a "keep" and a "remove" token / graphic pair. Two
     semantic graphics are passed in: one with a regular bounding box and
     one with a zero-sized bounding box. The assertions expect the "remove"
     items to be replaced by the regular semantic graphic while the "keep"
     items stay in place.
     """
     def coordinates_for(*bounds):
         # all coordinates of this test live on page 1
         return LayoutPageCoordinates.from_bounding_box(
             BoundingBox(*bounds), page_number=1)

     page_coordinates = coordinates_for(0, 0, 200, 200)
     semantic_graphic_coordinates = coordinates_for(10, 90, 100, 50)
     keep_coordinates = coordinates_for(10, 10, 100, 20)
     remove_coordinates = coordinates_for(10, 100, 100, 20)
     empty_coordinates = coordinates_for(10, 100, 0, 0)

     keep_token = LayoutToken('keep', coordinates=keep_coordinates)
     remove_token = LayoutToken('remove', coordinates=remove_coordinates)
     keep_graphic = LayoutGraphic(coordinates=keep_coordinates,
                                  graphic_type='keep-graphic')
     remove_graphic = LayoutGraphic(coordinates=remove_coordinates,
                                    graphic_type='remove-graphic')

     page_block = LayoutBlock(
         lines=[LayoutLine(tokens=[keep_token, remove_token])])
     page_meta = LayoutPageMeta(page_number=page_coordinates.page_number,
                                coordinates=page_coordinates)
     layout_document = LayoutDocument(pages=[
         LayoutPage(blocks=[page_block],
                    graphics=[keep_graphic, remove_graphic],
                    meta=page_meta)
     ])

     layout_graphic = LayoutGraphic(
         coordinates=semantic_graphic_coordinates,
         graphic_type='new-graphic')
     no_coords_layout_graphic = LayoutGraphic(
         coordinates=empty_coordinates, graphic_type='empty-coords-graphic')
     semantic_graphics = [
         SemanticGraphic(layout_graphic=layout_graphic),
         SemanticGraphic(layout_graphic=no_coords_layout_graphic),
     ]
     result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
         layout_document, semantic_graphics=semantic_graphics)

     result_page = result.pages[0]
     LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
     assert result_page.graphics[:-1] == [keep_graphic]
     last_graphic = result_page.graphics[-1]
     LOGGER.debug('result.pages[0].graphics[-1]: %r', last_graphic)
     assert last_graphic.graphic_type == layout_graphic.graphic_type
     assert last_graphic.coordinates == layout_graphic.coordinates
     remaining_tokens = list(result_page.blocks[0].iter_all_tokens())
     assert remaining_tokens == [keep_token]
     related_tokens = list(
         result_page.graphics[-1].related_block.iter_all_tokens())
     assert related_tokens == [keep_token, remove_token]
 def test_should_keep_original_whitespace(self):
     """Check that irregular spacing around tokens survives in the './text/listBibl' node text."""
     training_data_generator = get_tei_training_data_generator()
     text = 'Token1, Token2  ,Token3'
     line = LayoutLine.for_text(text, tail_whitespace='\n')
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=[line])])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     text_nodes = xml_root.xpath('./text/listBibl')
     assert len(text_nodes) == 1
     actual_text = get_text_content(text_nodes[0]).rstrip()
     assert actual_text == text
Пример #19
0
 def test_should_keep_original_whitespace(self):
     """Check that irregular spacing around tokens survives in the AFFILIATION_XPATH node text."""
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     text = 'Token1, Token2  ,Token3'
     line = LayoutLine.for_text(text, tail_whitespace='\n')
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock(lines=[line])])
     model_data_list = get_model_data_list_for_layout_document(
         layout_document, data_generator=get_data_generator())
     xml_root = (
         training_data_generator
         .get_training_tei_xml_for_model_data_iterable(model_data_list))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     actual_text = get_text_content(aff_nodes[0]).rstrip()
     assert actual_text == text
Пример #20
0
 def test_should_filter_by_token_label(self):
     """Check filtering by label when differently tagged tokens share one line."""
     tagged_tokens = [(TAG_1, get_layout_tokens_for_text('this is line 1')),
                      (TAG_2, get_layout_tokens_for_text('this is line 2'))]
     # all tokens, regardless of their tag, end up on the same line
     all_tokens = []
     for _, tag_tokens in tagged_tokens:
         all_tokens.extend(tag_tokens)
     line = LayoutLine(all_tokens)
     layout_model_labels = []
     for tag, tag_tokens in tagged_tokens:
         for token in tag_tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=token.text,
                                  layout_line=line,
                                  layout_token=token))
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])])
     layout_document_label_result = LayoutDocumentLabelResult(
         layout_document, layout_model_labels)
     for tag, tag_tokens in tagged_tokens:
         filtered_document = (
             layout_document_label_result.get_filtered_document_by_label(tag))
         actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
         assert actual_text == join_layout_tokens(tag_tokens)
Пример #21
0
 def test_should_provide_block_relative_document_token_position(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check the relative document position feature for a block of ten lines."""
     line_count = 10
     block_lines = [
         LayoutLine.for_text(f'line{line_index}')
         for line_index in range(line_count)
     ]
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=block_lines)])])
     feature_key = 'str_relative_document_position'
     feature_values = [
         {feature_key: line_features.get_str_relative_document_position()}
         for line_features in _iter_line_features(features_provider,
                                                  layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_feature_values = []
     for line_index in range(line_count):
         expected_position = feature_linear_scaling_int(
             line_index, line_count, NBBINS_POSITION)
         expected_feature_values.append({feature_key: str(expected_position)})
     assert feature_values == expected_feature_values
Пример #22
0
 def test_should_filter_by_token_multiple_labels(self):
     """Check filtering by two of three labels whose tokens share one line."""
     tagged_tokens = [(TAG_1, get_layout_tokens_for_text('tokens tag 1')),
                      (TAG_2, get_layout_tokens_for_text('tokens tag 2')),
                      (TAG_3, get_layout_tokens_for_text('tokens tag 3'))]
     # all tokens, regardless of their tag, end up on the same line
     all_tokens = []
     for _, tag_tokens in tagged_tokens:
         all_tokens.extend(tag_tokens)
     line = LayoutLine(all_tokens)
     layout_model_labels = []
     for tag, tag_tokens in tagged_tokens:
         for token in tag_tokens:
             layout_model_labels.append(
                 LayoutModelLabel(label=tag,
                                  label_token_text=token.text,
                                  layout_line=line,
                                  layout_token=token))
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])])
     layout_document_label_result = LayoutDocumentLabelResult(
         layout_document, layout_model_labels)
     filtered_document = (
         layout_document_label_result.get_filtered_document_by_labels(
             [TAG_1, TAG_3]))
     actual_text = join_layout_tokens(filtered_document.iter_all_tokens())
     # only the tokens of the first and the third tag are expected
     expected_tokens = tagged_tokens[0][1] + tagged_tokens[2][1]
     assert actual_text == join_layout_tokens(expected_tokens)
Пример #23
0
 def test_should_provide_punctuation_profile(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Check the punctuation profile and its length feature for the line 'a .: b'."""
     line = LayoutLine.for_text('a .: b')
     layout_document = LayoutDocument(
         pages=[LayoutPage(blocks=[LayoutBlock(lines=[line])])])
     feature_values = [
         {
             'line_punctuation_profile':
             line_features.get_line_punctuation_profile(),
             'line_punctuation_profile_length_feature':
             line_features.get_line_punctuation_profile_length_feature(),
         }
         for line_features in _iter_line_features(features_provider,
                                                  layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_line_features = {
         'line_punctuation_profile': '.:',
         'line_punctuation_profile_length_feature': '2',
     }
     assert feature_values == [expected_line_features]
Пример #24
0
    ModelDataGenerator, LayoutModelData, feature_linear_scaling_int,
    _LINESCALE, get_str_bool_feature_value)

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# U+00A0 (NO-BREAK SPACE); used by format_feature_text as the replacement
# for spaces and tabs.
NBSP = '\u00A0'


def format_feature_text(text: str) -> str:
    """Return *text* stripped, with each space or tab replaced by ``NBSP``."""
    stripped_text = text.strip()
    return re.sub(" |\t", NBSP, stripped_text)


# NOTE(review): presumably the number of bins used when scaling position
# features (it is passed as the last argument to feature_linear_scaling_int)
# - confirm against the feature provider.
NBBINS_POSITION = 12

# Placeholder instances: a token with empty text and a line without tokens.
EMPTY_LAYOUT_TOKEN = LayoutToken('')
EMPTY_LAYOUT_LINE = LayoutLine([])


def get_block_status(line_index: int, line_count: int) -> str:
    """Return the block status label for the line at *line_index*.

    The first line is ``'BLOCKSTART'`` (even when it is also the last one),
    the last line is ``'BLOCKEND'`` and every other line is ``'BLOCKIN'``.
    """
    if line_index == 0:
        return 'BLOCKSTART'
    if line_index == line_count - 1:
        return 'BLOCKEND'
    return 'BLOCKIN'


def get_page_status(block_index: int, block_count: int,
                    is_first_block_token: bool,
                    is_last_block_token: bool) -> str:
    """Return the page status label for a position within a page.

    ``'PAGESTART'`` is returned for the first block when
    *is_first_block_token* is set, ``'PAGEEND'`` for the last block when
    *is_last_block_token* is set, and ``'PAGEIN'`` otherwise. The start
    check takes precedence over the end check.
    """
    if block_index == 0 and is_first_block_token:
        return 'PAGESTART'
    last_block_index = block_count - 1
    if block_index == last_block_index and is_last_block_token:
        return 'PAGEEND'
    return 'PAGEIN'

def get_layout_line_for_text(text: str, line_id: int) -> LayoutLine:
    """Create a ``LayoutLine`` for *text*, ending in a line feed and tagged with *line_id*."""
    line_descriptor = LayoutLineDescriptor(line_id=line_id)
    return LayoutLine.for_text(text,
                               tail_whitespace='\n',
                               line_descriptor=line_descriptor)