Пример #1
0
def load_lxml_structured_document(filename, page_range=None):
    """Parse the XML file at *filename* into an LxmlStructuredDocument.

    Args:
        filename: location passed to ``FileSystems.open``.
        page_range: optional pair; when truthy, only the pages in the slice
            ``[max(0, page_range[0] - 1):page_range[1]]`` are kept, i.e. the
            pair reads as a 1-based, inclusive (first, last) page selection.

    Returns:
        The document wrapping the parsed root, or a new document whose
        DOCUMENT element contains just the selected pages.
    """
    with FileSystems.open(filename) as source:
        parsed_root = etree.parse(source).getroot()
        full_document = LxmlStructuredDocument(parsed_root)
        if not page_range:
            return full_document
        all_pages = full_document.get_pages()
        # Convert the 1-based first page to a 0-based slice start, never
        # going below the first page.
        start_index = max(0, page_range[0] - 1)
        stop_index = page_range[1]
        selected_pages = all_pages[start_index:stop_index]
        return LxmlStructuredDocument(E.DOCUMENT(*selected_pages))
 def test_should_call_save_file_content(self):
     """save_file_content receives the file name and the serialized root."""
     saver_module = structured_document_saver
     document_root = E.DOCUMENT()
     with patch.object(saver_module, 'save_file_content') as save_mock, \
             patch.object(saver_module, 'etree') as etree_mock:
         save_lxml_structured_document(
             FILE_1, LxmlStructuredDocument(document_root)
         )
         # etree is replaced by a mock, so tostring() returns the same mock
         # value on every call; that value is what the saver is expected to
         # have been given.
         expected_content = etree_mock.tostring(document_root)
         save_mock.assert_called_with(FILE_1, expected_content)
Пример #3
0
 def test_should_not_fail_setting_empty_tag_to_none(self):
     """Setting a tag to None on an untagged element leaves it untagged."""
     untagged = E.TEXT()
     block = E.BLOCK(untagged)
     root = E.DOCUMENT(E.PAGE(block))
     document = LxmlStructuredDocument(root)

     # Clear both the default tag and the scoped tag; neither was ever set.
     document.set_tag(untagged, None)
     document.set_tag(untagged, None, scope=SCOPE_1)

     default_tag = document.get_tag(untagged)
     scoped_tag = document.get_tag(untagged, scope=SCOPE_1)
     assert default_tag is None
     assert scoped_tag is None
Пример #4
0
 def test_should_calculate_bounding_box_of_page_without_xy(self):
     """A page with only width/height gets a bounding box anchored at 0, 0."""
     page_attributes = {'width': '100', 'height': '101'}
     page_without_xy = E.PAGE(page_attributes)
     document = LxmlStructuredDocument(E.DOCUMENT(page_without_xy))

     expected_box = BoundingBox(0, 0, 100, 101)
     actual_box = document.get_bounding_box(page_without_xy)
     assert actual_box == expected_box
Пример #5
0
 def test_should_call_save_lxml_structured_document(self):
     """save_structured_document hands lxml documents to the lxml saver."""
     # Call the E.DOCUMENT factory so the document wraps an actual element
     # rather than the factory callable itself.
     structured_document = LxmlStructuredDocument(E.DOCUMENT())
     m = structured_document_saver
     # Replace the lxml saver so only the delegation is checked and nothing
     # is written.
     with patch.object(m, 'save_lxml_structured_document'
                       ) as save_lxml_structured_document_mock:
         save_structured_document(FILE_1, structured_document)
         save_lxml_structured_document_mock.assert_called_with(
             FILE_1, structured_document)
Пример #6
0
 def test_should_return_all_tag_by_scope(self):
     """get_tag_by_scope maps every scope (None for default) to its tag."""
     element = E.TEXT()
     block = E.BLOCK(element)
     document = LxmlStructuredDocument(E.DOCUMENT(E.PAGE(block)))

     document.set_tag(element, TAG_1)
     document.set_tag(element, TAG_2, scope=SCOPE_1)

     assert document.get_tag(element) == TAG_1
     assert document.get_tag(element, scope=SCOPE_1) == TAG_2
     # The unscoped tag is reported under the None key.
     expected_by_scope = {None: TAG_1, SCOPE_1: TAG_2}
     assert document.get_tag_by_scope(element) == expected_by_scope
Пример #7
0
def convert_pdf_bytes_to_structured_document(pdf_content,
                                             path=None,
                                             page_range=None):
    """Convert PDF bytes into an LxmlStructuredDocument.

    The PDF content is first converted by ``convert_pdf_bytes_to_lxml``
    (``path`` and ``page_range`` are forwarded unchanged); the resulting
    bytes are parsed with ``etree.parse`` and the parse result is wrapped
    as-is in an ``LxmlStructuredDocument``.
    """
    lxml_bytes = convert_pdf_bytes_to_lxml(
        pdf_content, path=path, page_range=page_range
    )
    parsed = etree.parse(BytesIO(lxml_bytes))
    # NOTE: the object returned by etree.parse is passed directly, without
    # calling getroot() on it.
    return LxmlStructuredDocument(parsed)
Пример #8
0
 def test_should_calculate_default_bounding_box(self):
     """A token's x/y/width/height attributes become its bounding box."""
     token_attributes = {
         'x': '10',
         'y': '11',
         'width': '100',
         'height': '101'
     }
     positioned_token = E.TOKEN(token_attributes)
     line = E.TEXT(positioned_token)
     document = LxmlStructuredDocument(E.DOCUMENT(E.PAGE(line)))

     expected_box = BoundingBox(10, 11, 100, 101)
     assert document.get_bounding_box(positioned_token) == expected_box
Пример #9
0
 def test_should_find_lines_of_page_with_blocks(self):
     """Lines nested inside a page's block are returned for that page only."""
     expected_lines = [E.TEXT(), E.TEXT()]
     target_page = E.PAGE(E.BLOCK(*expected_lines))
     # A second page with its own line, which must not show up in the result.
     other_page = E.PAGE(E.BLOCK(E.TEXT()))
     document = LxmlStructuredDocument(E.DOCUMENT(target_page, other_page))

     found_lines = list(document.get_lines_of_page(target_page))
     assert found_lines == expected_lines
Пример #10
0
 def test_should_be_able_to_set_bounding_box(self):
     """A bounding box set on a token replaces the one from its attributes."""
     initial_attributes = {
         'x': '20',
         'y': '21',
         'width': '200',
         'height': '201'
     }
     positioned_token = E.TOKEN(initial_attributes)
     line = E.TEXT(positioned_token)
     document = LxmlStructuredDocument(E.DOCUMENT(E.PAGE(line)))

     # Deliberately different from the values the token starts with.
     replacement_box = BoundingBox(10, 11, 100, 101)
     document.set_bounding_box(positioned_token, replacement_box)

     assert document.get_bounding_box(positioned_token) == replacement_box
Пример #11
0
 def test_should_find_pages(self):
     """get_pages yields the PAGE elements of the document, in order."""
     expected_pages = [E.PAGE(), E.PAGE()]
     root = E.DOCUMENT(*expected_pages)
     document = LxmlStructuredDocument(root)

     found_pages = list(document.get_pages())
     assert found_pages == expected_pages
Пример #12
0
 def test_should_find_tokens_of_line(self):
     """get_tokens_of_line returns only the tokens of the requested line."""
     expected_tokens = [E.TOKEN(), E.TOKEN()]
     target_line = E.TEXT(*expected_tokens)
     # A second line with its own token, which must not be returned. The
     # E.TOKEN factory itself is passed as the child here (presumably E is
     # lxml's ElementMaker, which calls callable children - TODO confirm).
     other_line = E.TEXT(E.TOKEN)
     page = E.PAGE(target_line, other_line)
     document = LxmlStructuredDocument(E.DOCUMENT(page))

     found_tokens = list(document.get_tokens_of_line(target_line))
     assert found_tokens == expected_tokens
Пример #13
0
 def test_should_set_tag_with_level(self):
     """A tag set at level 2 is readable at that level but not by default."""
     element = E.TEXT()
     block = E.BLOCK(element)
     document = LxmlStructuredDocument(E.DOCUMENT(E.PAGE(block)))

     document.set_tag(element, TAG_1, level=2)

     level_tag = document.get_tag(element, level=2)
     assert level_tag == TAG_1
     # Looking the tag up without a level must not see the level-2 tag.
     default_tag = document.get_tag(element)
     assert default_tag is None
Пример #14
0
 def test_should_set_tag_with_scope(self):
     """A tag set within a scope is readable in that scope but not outside."""
     element = E.TEXT()
     block = E.BLOCK(element)
     document = LxmlStructuredDocument(E.DOCUMENT(E.PAGE(block)))

     document.set_tag(element, TAG_1, scope=SCOPE_1)

     scoped_tag = document.get_tag(element, scope=SCOPE_1)
     assert scoped_tag == TAG_1
     # Looking the tag up without a scope must not see the scoped tag.
     unscoped_tag = document.get_tag(element)
     assert unscoped_tag is None