def test_should_calculate_bounding_box_of_page_without_xy(self):
    """A page that only declares width and height is boxed from (0, 0)."""
    page_attributes = {'width': '100', 'height': '101'}
    page = E.PAGE(page_attributes)
    structured_document = LxmlStructuredDocument(E.DOCUMENT(page))

    actual_bounding_box = structured_document.get_bounding_box(page)

    # No x/y attributes were supplied, so the expected origin is 0, 0.
    assert actual_bounding_box == BoundingBox(0, 0, 100, 101)
def load_lxml_structured_document(filename, page_range=None):
    """Parse the XML at ``filename`` into an ``LxmlStructuredDocument``.

    Args:
        filename: location passed to ``FileSystems.open``.
        page_range: optional ``(first, last)`` pair. The first value is
            shifted down by one (clamped at zero) and the second is used as
            the exclusive slice end, i.e. page numbers are 1-based and the
            range is inclusive. A falsy value keeps every page.

    Returns:
        The parsed document, or a new document whose DOCUMENT root holds
        only the selected pages when ``page_range`` is given.
    """
    with FileSystems.open(filename) as xml_file:
        full_document = LxmlStructuredDocument(etree.parse(xml_file).getroot())
        if not page_range:
            return full_document

        all_pages = full_document.get_pages()
        start_index = max(0, page_range[0] - 1)
        end_index = page_range[1]
        selected_pages = all_pages[start_index:end_index]
        return LxmlStructuredDocument(E.DOCUMENT(*selected_pages))
def test_should_calculate_default_bounding_box(self):
    """A token's x/y/width/height attributes become its bounding box."""
    token_attributes = {
        'x': '10',
        'y': '11',
        'width': '100',
        'height': '101',
    }
    token = E.TOKEN(token_attributes)
    line = E.TEXT(token)
    structured_document = LxmlStructuredDocument(E.DOCUMENT(E.PAGE(line)))

    actual_bounding_box = structured_document.get_bounding_box(token)

    assert actual_bounding_box == BoundingBox(10, 11, 100, 101)
def test_should_find_lines_of_page_with_blocks(self):
    """Lines nested inside a block are returned for the requested page only."""
    expected_lines = [E.TEXT(), E.TEXT()]
    target_page = E.PAGE(E.BLOCK(*expected_lines))
    # add another page just for effect
    other_page = E.PAGE(E.BLOCK(E.TEXT()))
    structured_document = LxmlStructuredDocument(
        E.DOCUMENT(target_page, other_page)
    )

    found_lines = list(structured_document.get_lines_of_page(target_page))

    assert found_lines == expected_lines
def test_should_be_able_to_set_bounding_box(self):
    """A bounding box written with set_bounding_box is read back unchanged."""
    new_bounding_box = BoundingBox(10, 11, 100, 101)
    # The token starts with different coordinates than the box being set.
    initial_attributes = {
        'x': '20',
        'y': '21',
        'width': '200',
        'height': '201',
    }
    token = E.TOKEN(initial_attributes)
    structured_document = LxmlStructuredDocument(
        E.DOCUMENT(E.PAGE(E.TEXT(token)))
    )

    structured_document.set_bounding_box(token, new_bounding_box)

    assert structured_document.get_bounding_box(token) == new_bounding_box
def test_should_find_pages(self):
    """get_pages yields the PAGE elements of the document, in order."""
    expected_pages = [E.PAGE(), E.PAGE()]
    root = E.DOCUMENT(*expected_pages)
    structured_document = LxmlStructuredDocument(root)

    found_pages = list(structured_document.get_pages())

    assert found_pages == expected_pages
def test_should_return_all_tag_by_scope(self):
    """get_tag_by_scope maps each scope (None for unscoped) to its tag."""
    element = E.TEXT()
    structured_document = LxmlStructuredDocument(
        E.DOCUMENT(E.PAGE(E.BLOCK(element)))
    )

    # One tag without a scope and one tag within SCOPE_1.
    structured_document.set_tag(element, TAG_1)
    structured_document.set_tag(element, TAG_2, scope=SCOPE_1)

    assert structured_document.get_tag(element) == TAG_1
    assert structured_document.get_tag(element, scope=SCOPE_1) == TAG_2
    assert structured_document.get_tag_by_scope(element) == {
        None: TAG_1,
        SCOPE_1: TAG_2,
    }
def test_should_not_fail_setting_empty_tag_to_none(self):
    """Setting None on an element that has no tag is accepted and reads as None."""
    element = E.TEXT()
    structured_document = LxmlStructuredDocument(
        E.DOCUMENT(E.PAGE(E.BLOCK(element)))
    )

    # Exercise both the unscoped and the scoped variant.
    structured_document.set_tag(element, None)
    structured_document.set_tag(element, None, scope=SCOPE_1)

    assert structured_document.get_tag(element) is None
    assert structured_document.get_tag(element, scope=SCOPE_1) is None
def test_should_find_tokens_of_line(self):
    """get_tokens_of_line returns the tokens of the requested line only.

    The page also holds a second line with its own token, which must not
    show up in the result for the first line.
    """
    tokens = [E.TOKEN(), E.TOKEN()]
    line = E.TEXT(*tokens)
    # Build the sibling token explicitly (E.TOKEN() rather than the bare
    # E.TOKEN factory) so the fixture is constructed the same way as every
    # other element in these tests.
    other_line = E.TEXT(E.TOKEN())
    doc = LxmlStructuredDocument(
        E.DOCUMENT(
            E.PAGE(
                line,
                other_line
            )
        )
    )
    assert list(doc.get_tokens_of_line(line)) == tokens
def test_should_call_save_file_content(self):
    """save_lxml_structured_document hands the serialised root to save_file_content.

    Both ``save_file_content`` and ``etree`` are patched on the
    ``structured_document_saver`` module, so the expected content is
    whatever the patched ``etree.tostring`` returns.
    """
    saver_module = structured_document_saver
    root = E.DOCUMENT()
    with patch.object(saver_module, 'save_file_content') as save_file_content_mock, \
            patch.object(saver_module, 'etree') as etree_mock:
        save_lxml_structured_document(FILE_1, LxmlStructuredDocument(root))
        expected_content = etree_mock.tostring(root)
        save_file_content_mock.assert_called_with(FILE_1, expected_content)
def test_should_call_save_lxml_structured_document(self):
    """save_structured_document delegates lxml documents to the lxml saver.

    ``save_lxml_structured_document`` is patched on the
    ``structured_document_saver`` module; the test checks it receives the
    filename and the very same document object.
    """
    # Wrap an actual DOCUMENT element (E.DOCUMENT()), not the E.DOCUMENT
    # factory itself, so the fixture is a well-formed structured document.
    structured_document = LxmlStructuredDocument(E.DOCUMENT())
    m = structured_document_saver
    with patch.object(
            m, 'save_lxml_structured_document'
    ) as save_lxml_structured_document_mock:
        save_structured_document(FILE_1, structured_document)
        save_lxml_structured_document_mock.assert_called_with(
            FILE_1, structured_document
        )
def convert_pdf_bytes_to_structured_document(pdf_content, path=None, page_range=None):
    """Convert PDF bytes into an ``LxmlStructuredDocument``.

    Args:
        pdf_content: the PDF bytes, forwarded to ``convert_pdf_bytes_to_lxml``.
        path: forwarded unchanged to ``convert_pdf_bytes_to_lxml``.
        page_range: forwarded unchanged to ``convert_pdf_bytes_to_lxml``.

    Returns:
        An ``LxmlStructuredDocument`` wrapping the result of parsing the
        converter's output with ``etree.parse``.
    """
    lxml_content = convert_pdf_bytes_to_lxml(
        pdf_content, path=path, page_range=page_range
    )
    # NOTE(review): the parsed tree (not ``.getroot()``) is wrapped here,
    # unlike load_lxml_structured_document - confirm this is intended.
    parsed_tree = etree.parse(BytesIO(lxml_content))
    return LxmlStructuredDocument(parsed_tree)
def test_should_set_tag_with_level(self):
    """A tag set at level 2 is visible at that level but not at the default."""
    element = E.TEXT()
    structured_document = LxmlStructuredDocument(
        E.DOCUMENT(E.PAGE(E.BLOCK(element)))
    )

    structured_document.set_tag(element, TAG_1, level=2)

    assert structured_document.get_tag(element, level=2) == TAG_1
    # Without an explicit level the element must still be untagged.
    assert structured_document.get_tag(element) is None
def test_should_set_tag_with_scope(self):
    """A tag set within SCOPE_1 is visible in that scope but not unscoped."""
    element = E.TEXT()
    structured_document = LxmlStructuredDocument(
        E.DOCUMENT(E.PAGE(E.BLOCK(element)))
    )

    structured_document.set_tag(element, TAG_1, scope=SCOPE_1)

    assert structured_document.get_tag(element, scope=SCOPE_1) == TAG_1
    # Without a scope the element must still be untagged.
    assert structured_document.get_tag(element) is None