示例#1
0
 def test_should_return_samefont_if_font_size_is_the_same(self):
     """Two tokens with equal font sizes yield ``SAMEFONTSIZE``."""
     previous = LayoutToken(
         '', font=LayoutFont(font_id='dummy', font_size=1))
     current = LayoutToken(
         '', font=LayoutFont(font_id='dummy', font_size=1))
     feature = get_token_font_size_feature(
         previous_token=previous, current_token=current)
     assert feature == 'SAMEFONTSIZE'
示例#2
0
 def test_should_return_lowerfont_if_font_size_is_smaller(self):
     """A current token with a smaller font size yields ``LOWERFONT``."""
     previous = LayoutToken(
         '', font=LayoutFont(font_id='dummy', font_size=2))
     current = LayoutToken(
         '', font=LayoutFont(font_id='dummy', font_size=1))
     feature = get_token_font_size_feature(
         previous_token=previous, current_token=current)
     assert feature == 'LOWERFONT'
示例#3
0
 def test_should_return_higherfont_if_new_font_has_no_size(self):
     """A current token whose font size is ``None`` yields ``HIGHERFONT``."""
     previous = LayoutToken(
         '', font=LayoutFont(font_id='dummy', font_size=1))
     current = LayoutToken(
         '', font=LayoutFont(font_id='dummy', font_size=None))
     feature = get_token_font_size_feature(
         previous_token=previous, current_token=current)
     assert feature == 'HIGHERFONT'
示例#4
0
 def test_should_return_higherfont_if_font_size_is_larger(self):
     """A current token with a larger font size yields ``HIGHERFONT``."""
     previous = LayoutToken(
         '', font=LayoutFont(font_id='dummy', font_size=1))
     current = LayoutToken(
         '', font=LayoutFont(font_id='dummy', font_size=2))
     feature = get_token_font_size_feature(
         previous_token=previous, current_token=current)
     assert feature == 'HIGHERFONT'
示例#5
0
 def test_should_add_superscript_text(self):
     """The superscript token is found under ``hi[@rend="superscript"]``."""
     tokens = [
         LayoutToken(TOKEN_1),
         LayoutToken(TOKEN_2, font=SUPERSCRIPT_FONT_1),
         LayoutToken(TOKEN_3),
     ]
     block = LayoutBlock.for_tokens(tokens)
     tei_children = iter_layout_block_tei_children(block)
     node = TEI_E.node(*tei_children)
     superscript_texts = get_tei_xpath_text_content_list(
         node, './tei:hi[@rend="superscript"]')
     assert superscript_texts == [TOKEN_2]
     # The full text must still contain all three tokens, space separated.
     actual_text = get_text_content(node)
     assert actual_text == ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
 def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
         self):
     """Check replacement of page content by semantic graphics.

     The token and graphic at ``BoundingBox(10, 100, 100, 20)`` are expected
     to be dropped from the page, while those at
     ``BoundingBox(10, 10, 100, 20)`` are expected to remain. The semantic
     graphic is expected as the last page graphic, with a related block that
     still lists both original tokens.
     """
     def coordinates_for(*bounds):
         # Every fixture in this test is placed on page 1.
         return LayoutPageCoordinates.from_bounding_box(
             BoundingBox(*bounds), page_number=1)

     page_coordinates = coordinates_for(0, 0, 200, 200)
     semantic_graphic_coordinates = coordinates_for(10, 90, 100, 50)
     keep_coordinates = coordinates_for(10, 10, 100, 20)
     remove_coordinates = coordinates_for(10, 100, 100, 20)
     empty_coordinates = coordinates_for(10, 100, 0, 0)

     keep_token = LayoutToken('keep', coordinates=keep_coordinates)
     remove_token = LayoutToken('remove', coordinates=remove_coordinates)
     keep_graphic = LayoutGraphic(
         coordinates=keep_coordinates, graphic_type='keep-graphic')
     remove_graphic = LayoutGraphic(
         coordinates=remove_coordinates, graphic_type='remove-graphic')

     token_block = LayoutBlock(
         lines=[LayoutLine(tokens=[keep_token, remove_token])])
     page_meta = LayoutPageMeta(
         page_number=page_coordinates.page_number,
         coordinates=page_coordinates)
     source_page = LayoutPage(
         blocks=[token_block],
         graphics=[keep_graphic, remove_graphic],
         meta=page_meta)
     layout_document = LayoutDocument(pages=[source_page])

     layout_graphic = LayoutGraphic(
         coordinates=semantic_graphic_coordinates,
         graphic_type='new-graphic')
     no_coords_layout_graphic = LayoutGraphic(
         coordinates=empty_coordinates, graphic_type='empty-coords-graphic')
     semantic_graphics = [
         SemanticGraphic(layout_graphic=layout_graphic),
         SemanticGraphic(layout_graphic=no_coords_layout_graphic),
     ]

     result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
         layout_document, semantic_graphics=semantic_graphics)

     result_page = result.pages[0]
     LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
     assert result_page.graphics[:-1] == [keep_graphic]
     last_graphic = result_page.graphics[-1]
     LOGGER.debug('result.pages[0].graphics[-1]: %r', last_graphic)
     assert last_graphic.graphic_type == layout_graphic.graphic_type
     assert last_graphic.coordinates == layout_graphic.coordinates
     remaining_tokens = list(result_page.blocks[0].iter_all_tokens())
     assert remaining_tokens == [keep_token]
     related_tokens = list(last_graphic.related_block.iter_all_tokens())
     assert related_tokens == [keep_token, remove_token]
示例#7
0
 def test_should_detect_indented_blocks(self):
     """A second line starting at a larger x is reported as indented."""
     feature = LineIndentationStatusFeature()
     feature.on_new_block()

     feature.on_new_line()
     first_line_token = LayoutToken(
         'x',
         coordinates=LayoutPageCoordinates(x=10, y=10, width=10, height=10))
     assert feature.get_is_indented_and_update(first_line_token) is False

     feature.on_new_line()
     second_line_token = LayoutToken(
         'x',
         coordinates=LayoutPageCoordinates(x=50, y=10, width=10, height=10))
     assert feature.get_is_indented_and_update(second_line_token) is True
示例#8
0
 def test_should_add_bold_and_italics_text(self):
     """The bold-italic token is found under both ``hi`` rend values."""
     tokens = [
         LayoutToken(TOKEN_1),
         LayoutToken(TOKEN_2, font=BOLD_ITALICS_FONT_1),
         LayoutToken(TOKEN_3),
     ]
     block = LayoutBlock.for_tokens(tokens)
     tei_children = iter_layout_block_tei_children(block)
     node = TEI_E.node(*tei_children)
     LOGGER.debug('xml: %r', etree.tostring(node))
     bold_texts = get_tei_xpath_text_content_list(
         node, './/tei:hi[@rend="bold"]')
     assert bold_texts == [TOKEN_2]
     italic_texts = get_tei_xpath_text_content_list(
         node, './/tei:hi[@rend="italic"]')
     assert italic_texts == [TOKEN_2]
     # The full text must still contain all three tokens, space separated.
     actual_text = get_text_content(node)
     assert actual_text == ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
 def test_should_create_lines_based_on_line_descriptor(self):
     """Tokens are grouped into lines according to their line descriptor."""
     def tokens_for(descriptor, texts):
         return [
             LayoutToken(token_text, line_descriptor=descriptor)
             for token_text in texts
         ]

     first_descriptor = LayoutLineDescriptor(line_id=1)
     second_descriptor = LayoutLineDescriptor(line_id=2)
     first_line_tokens = tokens_for(
         first_descriptor, ['token1.1', 'token1.2'])
     second_line_tokens = tokens_for(
         second_descriptor, ['token2.1', 'token2.2'])

     all_tokens = first_line_tokens + second_line_tokens
     block = LayoutBlock.for_tokens(all_tokens)

     assert len(block.lines) == 2
     assert block.lines[0].tokens == first_line_tokens
     assert block.lines[1].tokens == second_line_tokens
 def test_should_retokenize_document_with_placeholders(self):
     """Retokenizing splits a two-word token and divides its coordinates.

     The expected x offsets and widths are proportional to each piece's
     share of the characters in the original text.
     """
     text = 'token1 token2'
     source_coordinates = LayoutPageCoordinates(
         x=10, y=10, width=100, height=50)
     source_token = LayoutToken(
         text, whitespace='\n', coordinates=source_coordinates)
     source_page = LayoutPage(
         blocks=[LayoutBlock.for_tokens([source_token])], graphics=[])
     layout_document = LayoutDocument(pages=[source_page])

     retokenized = retokenize_layout_document(layout_document)

     line = retokenized.pages[0].blocks[0].lines[0]
     assert [token.text for token in line.tokens] == ['token1', 'token2']
     assert [token.whitespace for token in line.tokens] == [' ', '\n']

     first_coordinates = line.tokens[0].coordinates
     assert first_coordinates.x == 10.0
     assert first_coordinates.width == 100 * len('token1') / len(text)

     second_coordinates = line.tokens[1].coordinates
     # The second token starts after 'token1' plus the separating space.
     assert second_coordinates.x == 10.0 + 100 * len('token1 ') / len(text)
     assert second_coordinates.width == 100 * len('token2') / len(text)
示例#11
0
 def test_should_be_able_to_set_title_with_italic_layout_tokens(self):
     """An italic title token is found under ``hi[@rend="italic"]``."""
     title_tokens = [
         LayoutToken('rend'),
         LayoutToken('italic1', font=ITALICS_FONT_1),
         LayoutToken('test'),
     ]
     title_block = LayoutBlock.for_tokens(title_tokens)
     document = TeiDocument()
     document.set_title_layout_block(title_block)
     LOGGER.debug('xml: %r', etree.tostring(document.root))

     title_xpath = (
         '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]')
     title_nodes = document.root.xpath(title_xpath, namespaces=TEI_NS_MAP)
     assert len(title_nodes) == 1

     italic_texts = get_tei_xpath_text_content_list(
         title_nodes[0], './tei:hi[@rend="italic"]')
     assert italic_texts == ['italic1']
     assert document.get_title() == 'rend italic1 test'
 def test_should_remove_blank_token(self):
     """A single-space token leaves no tokens in the retokenized line."""
     blank_block = LayoutBlock.for_tokens([LayoutToken(' ')])
     source_page = LayoutPage(blocks=[blank_block], graphics=[])
     layout_document = LayoutDocument(pages=[source_page])

     retokenized = retokenize_layout_document(layout_document)

     first_line = retokenized.pages[0].blocks[0].lines[0]
     assert first_line.tokens == []
示例#13
0
 def parse_token(
         self, token_node: etree.ElementBase, page_number: int,
         layout_line_descriptor: LayoutLineDescriptor) -> LayoutToken:
     """Build a ``LayoutToken`` from *token_node*.

     The text is taken from the ``CONTENT`` attribute (an empty string when
     the attribute is missing or empty). The font is looked up in
     ``self.font_by_id_map`` using the ``STYLEREFS`` attribute, falling back
     to ``EMPTY_FONT``. The coordinates come from
     ``self.parse_page_coordinates``.
     """
     attributes = token_node.attrib
     token_text = attributes.get('CONTENT') or ''
     style_id = attributes.get('STYLEREFS')
     token_font = self.font_by_id_map.get(style_id, EMPTY_FONT)
     token_coordinates = self.parse_page_coordinates(
         token_node, page_number=page_number)
     return LayoutToken(
         text=token_text,
         font=token_font,
         coordinates=token_coordinates,
         line_descriptor=layout_line_descriptor)
 def test_should_not_retokenize_document_with_valid_tokens(self):
     """A single-word token is kept as is by retokenization."""
     token_block = LayoutBlock.for_tokens([LayoutToken('token1')])
     source_page = LayoutPage(blocks=[token_block], graphics=[])
     layout_document = LayoutDocument(pages=[source_page])

     retokenized = retokenize_layout_document(layout_document)

     first_line = retokenized.pages[0].blocks[0].lines[0]
     assert [token.text for token in first_line.tokens] == ['token1']
示例#15
0
 def test_should_return_false_if_no_font_size_available(self):
     """Without any font size, every relative-size check is ``False``."""
     layout_tokens = [
         LayoutToken('', font=LayoutFont(font_id, font_size=None))
         for font_id in ('font1', 'font2', 'font3', 'font4')
     ]
     feature = RelativeFontSizeFeature(layout_tokens)
     all_false = [False, False, False, False]

     smallest_flags = [
         feature.is_smallest_font_size(token) for token in layout_tokens
     ]
     assert smallest_flags == all_false

     largest_flags = [
         feature.is_largest_font_size(token) for token in layout_tokens
     ]
     assert largest_flags == all_false

     larger_than_average_flags = [
         feature.is_larger_than_average_font_size(token)
         for token in layout_tokens
     ]
     assert larger_than_average_flags == all_false
示例#16
0
 def test_should_return_is_smallest_largest_and_larger_than_avg(self):
     """Font sizes 1 to 4 give the expected relative-size flags per token."""
     fonts = [('font1', 1), ('font2', 2), ('font3', 3), ('font4', 4)]
     layout_tokens = [
         LayoutToken('', font=LayoutFont(font_id, font_size=size))
         for font_id, size in fonts
     ]
     feature = RelativeFontSizeFeature(layout_tokens)

     smallest_flags = [
         feature.is_smallest_font_size(token) for token in layout_tokens
     ]
     assert smallest_flags == [True, False, False, False]

     largest_flags = [
         feature.is_largest_font_size(token) for token in layout_tokens
     ]
     assert largest_flags == [False, False, False, True]

     larger_than_average_flags = [
         feature.is_larger_than_average_font_size(token)
         for token in layout_tokens
     ]
     assert larger_than_average_flags == [False, False, True, True]
 def test_should_select_tokens_based_on_index(self):
     """Character ranges select the token they fall in, separators none."""
     token_1 = LayoutToken(text='token1', whitespace=' ')
     token_2 = LayoutToken(text='token2', whitespace=' ')
     tokens_text = LayoutTokensText(
         LayoutBlock.for_tokens([token_1, token_2]))
     assert str(tokens_text) == 'token1 token2'

     # Character offsets within 'token1 token2'.
     first_end = len(token_1.text)
     second_start = first_end + 1
     second_end = second_start + len(token_2.text)

     # First and last character of the first token.
     assert tokens_text.get_layout_tokens_between(0, 1) == [token_1]
     assert tokens_text.get_layout_tokens_between(
         first_end - 1, first_end) == [token_1]
     # The separating space belongs to no token.
     assert not tokens_text.get_layout_tokens_between(
         first_end, first_end + 1)
     # First and last character of the second token.
     assert tokens_text.get_layout_tokens_between(
         second_start, second_start + 1) == [token_2]
     assert tokens_text.get_layout_tokens_between(
         second_end - 1, second_end) == [token_2]
     # Just past the end of the second token.
     assert not tokens_text.get_layout_tokens_between(
         second_end, second_end + 1)
 def test_should_preserve_empty_pages_if_requested(self):
     """With ``preserve_empty_pages=True`` both pages are kept."""
     page_with_token = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     page_without_tokens = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     layout_document = LayoutDocument(
         pages=[page_with_token, page_without_tokens])

     cleaned = remove_empty_blocks(
         layout_document, preserve_empty_pages=True)

     assert len(cleaned.pages) == 2
示例#19
0
def iter_semantic_markers_for_layout_block(
    layout_block: LayoutBlock
) -> Iterable[Union[SemanticMarker, SemanticContentWrapper]]:
    """Yield semantic items for each piece of the block's text.

    The text is split around every non-digit character, and each such
    character is kept as a piece of its own. Empty pieces are skipped.
    A comma or whitespace piece is yielded as a ``SemanticNote`` with note
    type ``marker_delimiter``; any other piece is yielded as a
    ``SemanticMarker``.
    """
    pieces = re.split(r'(\D)', layout_block.text)
    for piece in pieces:
        if piece:
            piece_block = LayoutBlock.for_tokens(
                [LayoutToken(piece, whitespace='')])
            is_delimiter = piece == ',' or piece.isspace()
            if is_delimiter:
                yield SemanticNote(layout_block=piece_block,
                                   note_type='marker_delimiter')
            else:
                yield SemanticMarker(layout_block=piece_block)
 def test_should_preserve_meta(self):
     """The page meta is unchanged after retokenization."""
     page_meta = LayoutPageMeta(COORDINATES_1)
     token_block = LayoutBlock.for_tokens([LayoutToken('token1 token2')])
     source_page = LayoutPage(
         blocks=[token_block], graphics=[], meta=page_meta)
     layout_document = LayoutDocument(pages=[source_page])

     retokenized = retokenize_layout_document(layout_document)

     assert retokenized.pages[0].meta == page_meta
 def test_should_remove_empty_line_block_and_page(self):
     """Only the page that has a token remains after cleaning."""
     page_with_token = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     page_without_tokens = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     layout_document = LayoutDocument(
         pages=[page_with_token, page_without_tokens])

     cleaned = remove_empty_blocks(layout_document)

     assert len(cleaned.pages) == 1
     first_line = cleaned.pages[0].blocks[0].lines[0]
     assert [token.text for token in first_line.tokens] == ['token1']
示例#22
0
def _get_semantic_content_for_page_coordinates(
        coordinates: LayoutPageCoordinates) -> SemanticContentWrapper:
    """Return a ``SemanticFigure`` with one ``'dummy'`` token at *coordinates*."""
    dummy_token = LayoutToken(text='dummy', coordinates=coordinates)
    figure_block = LayoutBlock.for_tokens([dummy_token])
    return SemanticFigure(layout_block=figure_block)
示例#23
0
 def test_should_return_higherfont_without_previous_token(self):
     """With no previous token the feature is ``HIGHERFONT``."""
     current = LayoutToken('', font=LayoutFont(font_id='dummy'))
     feature = get_token_font_size_feature(
         previous_token=None, current_token=current)
     assert feature == 'HIGHERFONT'
示例#24
0
    ContextAwareLayoutTokenFeatures, DocumentFeaturesContext,
    ModelDataGenerator, LayoutModelData, feature_linear_scaling_int,
    _LINESCALE, get_str_bool_feature_value)

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Non-breaking space character (U+00A0).
NBSP = '\u00A0'


def format_feature_text(text: str) -> str:
    """Return *text* stripped, with every space or tab replaced by ``NBSP``."""
    stripped_text = text.strip()
    return re.sub(" |\t", NBSP, stripped_text)


# NOTE(review): presumably the number of bins used when discretising
# position features - confirm against the code that reads this constant.
NBBINS_POSITION = 12

# A token with empty text and a line without tokens.
# NOTE(review): presumably shared placeholder values - confirm against usage.
EMPTY_LAYOUT_TOKEN = LayoutToken('')
EMPTY_LAYOUT_LINE = LayoutLine([])


def get_block_status(line_index: int, line_count: int) -> str:
    """Return the block status label for the line at *line_index*.

    Returns ``'BLOCKSTART'`` for index 0, ``'BLOCKEND'`` for the index
    ``line_count - 1`` (when that is not 0), and ``'BLOCKIN'`` otherwise.
    """
    # The start check comes first, so a single-line block is 'BLOCKSTART'.
    if line_index == 0:
        return 'BLOCKSTART'
    if line_index == line_count - 1:
        return 'BLOCKEND'
    return 'BLOCKIN'


def get_page_status(block_index: int, block_count: int,
                    is_first_block_token: bool,
                    is_last_block_token: bool) -> str:
    """Return the page status label for a token.

    Returns ``'PAGESTART'`` when the block index is 0 and
    *is_first_block_token* is true. Otherwise returns ``'PAGEEND'`` when the
    block index is ``block_count - 1`` and *is_last_block_token* is true.
    Returns ``'PAGEIN'`` in every other case.
    """
    # The start check takes precedence over the end check.
    if block_index == 0 and is_first_block_token:
        return 'PAGESTART'
    if block_index == block_count - 1 and is_last_block_token:
        return 'PAGEEND'
    return 'PAGEIN'