def test_table_data_store():
    """Check the store built by TagsToKeyValuePairExtractor with and without node content.

    The same document is tagged with ``has_flue`` in two independent pipelines;
    each resulting store is compared against its reference JSON file.
    """
    source_path = os.path.join(get_test_directory(), 'tongue_twister.kdxa')
    flue_selector = '//*[contentRegex(".*flue.*")]'

    # include_node_content=True: the store should have 3 columns.
    with_content = Pipeline(Document.from_kdxa(source_path))
    with_content.add_step(
        NodeTagger(
            selector=flue_selector,
            tag_to_apply='has_flue',
            node_only=True,
            node_tag_uuid='test',
        )
    )
    with_content.add_step(
        TagsToKeyValuePairExtractor(
            store_name='tagged_data',
            include_node_content=True,
        )
    )
    with_content_context = with_content.run()
    compare_store(with_content_context, 'tagged_data',
                  'basic_store_tagged_data1.json')

    # include_node_content=False: the store should have 2 columns.
    without_content = Pipeline(Document.from_kdxa(source_path))
    without_content.add_step(
        NodeTagger(
            selector=flue_selector,
            tag_to_apply='has_flue',
            node_only=True,
        )
    )
    without_content.add_step(
        TagsToKeyValuePairExtractor(
            store_name='tagged_data_2',
            include_node_content=False,
        )
    )
    without_content_context = without_content.run()
    compare_store(without_content_context, 'tagged_data_2',
                  'basic_store_tagged_data2.json')
def test_tagging_issue_with_html():
    """Tag a fixed character span of the document content and read it back.

    Characters 707-710 of the unstripped full content are expected to be
    ``"IIJ"``. After tagging that span with ``test_tag``, both the tag
    feature's ``value`` and the slice of the tagged node's content given by
    the feature's ``start``/``end`` must be ``"IIJ"`` as well.
    """
    # os.path.join (as used by test_table_data_store) is safe whether or not
    # the test directory string ends with a path separator.
    kdxa_doc = Document.from_kdxa(
        os.path.join(get_test_directory(), 'tagging_issue.kdxa'))

    all_content = kdxa_doc.content_node.get_all_content(strip=False)
    assert "IIJ" == all_content[707:710]

    # Now we tag the same location and try and get the content from the tag
    kdxa_doc.content_node.tag("test_tag", use_all_content=True,
                              node_only=False, fixed_position=(707, 710))

    tagged_nodes = kdxa_doc.select('//*[hasTag("test_tag")]')
    # Fail with a readable message instead of an IndexError on the [0] below.
    assert tagged_nodes, "no node carries 'test_tag' after tagging"
    node = tagged_nodes[0]

    feature = node.get_feature_value("tag", "test_tag")
    assert feature['value'] == 'IIJ'

    # Re-select the tagged node and slice its content with the feature offsets.
    reselected = kdxa_doc.select("//*[hasTag('test_tag')]")[0]
    tagged_content = reselected.get_all_content(strip=False)
    assert "IIJ" == tagged_content[feature['start']:feature['end']]
def test_spatial_doc_sample_two():
    """Run NodeTagger with a content regex over a saved document.

    This test document and this portion of code is a snippet from a test in
    the spatial actions tests. The saved doc and this section were added to
    ensure NodeTagger is tested. The only check is that the pipeline's output
    document still has a root node.
    """
    page_footer_re = r'Page \d+ of \d+$'

    # os.path.join (as used by test_table_data_store) is safe whether or not
    # the test directory string ends with a path separator.
    document = Document.from_kdxa(
        os.path.join(get_test_directory(), 'before_fail.kdxa'))

    pipeline = Pipeline(document)
    pipeline.add_step(
        NodeTagger(selector='//*[typeRegex("line.*")]',
                   content_re=page_footer_re,
                   tag_to_apply='page_footer'))
    pipeline.run()

    doc = pipeline.context.output_document
    assert doc.get_root() is not None
def _assert_node_type_counts(selectable, expected_counts):
    """Assert how many nodes of each type ``selectable.select('//<type>')`` returns.

    :param selectable: object exposing ``select`` (a document or a node)
    :param expected_counts: mapping of node type name to the expected number
        of matches; entries are checked in insertion order
    """
    for node_type, expected in expected_counts.items():
        found = len(selectable.select('//' + node_type))
        assert found == expected, (
            "expected %d '%s' nodes, found %d" % (expected, node_type, found))


def _run_rollup(document, **rollup_kwargs):
    """Run a single RollupTransformer step over *document* and return the output document.

    :param document: the document handed to the Pipeline
    :param rollup_kwargs: keyword arguments passed straight to RollupTransformer
    :return: ``pipeline.context.output_document`` after the run
    """
    rollup_pipeline = Pipeline(document)
    rollup_pipeline.add_step(RollupTransformer(**rollup_kwargs))
    rollup_pipeline.run()
    return rollup_pipeline.context.output_document


def test_rollup_of_pdf():
    """Exercise RollupTransformer on a saved PDF-derived document in three scenarios.

    1. Collapse words and lines up to their common parent.
    2. Collapse only lines (including their children's content) into the
       parent content-area.
    3. Restrict the rollup to content-areas matching a selector.

    NOTE(review): as in the original test, most post-rollup assertions are
    made against ``test_doc`` (and nodes selected from it before the run)
    rather than the pipeline's output document, so they presumably rely on
    the rollup modifying the document in place - confirm.
    """
    # os.path.join (as used by test_table_data_store) is safe whether or not
    # the test directory string ends with a path separator.
    kdxa_path = os.path.join(get_test_directory(), '20200709.kdxa')

    # ---- first test - collapsing words and lines up to their common parent
    test_doc = Document.from_kdxa(kdxa_path)

    # pre-rollup lines, words and content-areas
    _assert_node_type_counts(
        test_doc, {'line': 3824, 'word': 52903, 'content-area': 817})
    # what is the pre-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    collapsed_doc = _run_rollup(test_doc,
                                collapse_type_res=["word", "line"],
                                separator_character=' ')

    # post-rollup: no lines or words remain, content-areas are untouched
    _assert_node_type_counts(
        test_doc, {'line': 0, 'word': 0, 'content-area': 817})
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792
    assert len(
        collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # ---- second test - just collapse the line up to its parent
    # (content-area) - roll up the line's children
    test_doc = Document.from_kdxa(kdxa_path)

    collapsed_doc = _run_rollup(test_doc,
                                collapse_type_res=["line"],
                                separator_character=' ',
                                get_all_content=True)

    _assert_node_type_counts(
        test_doc, {'line': 0, 'word': 0, 'content-area': 817})
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792
    # verify that we can collapse line nodes AND include their children
    assert len(
        collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # ---- third test - select specific nodes in which we'll do the roll ups
    test_doc = Document.from_kdxa(kdxa_path)
    node_selector = "//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]"

    # verify we have 3 nodes match this selector
    node_matches = test_doc.select(node_selector)
    assert len(node_matches) == 3

    # (word count, line count, content length) expected for each matching
    # node before the rollup, in match order
    pre_rollup_expectations = [(2, 1, 14), (2, 1, 14), (71, 6, 500)]

    # before we rollup, let's make sure the matching nodes conform to known
    # expectations
    for node, (words, lines, content_length) in zip(node_matches,
                                                    pre_rollup_expectations):
        _assert_node_type_counts(
            node, {'word': words, 'line': lines, 'content-area': 1})
        assert len(node.get_all_content()) == content_length

    collapsed_doc = _run_rollup(test_doc,
                                selector=node_selector,
                                collapse_type_res=["line"],
                                separator_character=' ',
                                get_all_content=True)

    # check those matching nodes - we shouldn't have any words or lines, but
    # all other node_types should exist and the content should stay the same.
    for node, (_, _, content_length) in zip(node_matches,
                                            pre_rollup_expectations):
        _assert_node_type_counts(
            node, {'word': 0, 'line': 0, 'content-area': 1})
        assert len(node.get_all_content()) == content_length

    # document-wide: still have some lines and words, but fewer than we
    # started with; same number of content-areas
    _assert_node_type_counts(
        test_doc, {'line': 3816, 'word': 52828, 'content-area': 817})
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792
    # verify that we can collapse line nodes AND include their children
    assert len(
        collapsed_doc.select("//content-area")[12].get_all_content()) == 235
def test_parent_child():
    """Check that ``parent::page`` on a line leads back to the page it was selected from.

    The first line selected under the first page must resolve, via
    ``select_first('parent::page')``, to a node with that page's uuid.
    """
    # os.path.join (as used by test_table_data_store) is safe whether or not
    # the test directory string ends with a path separator.
    document = Document.from_kdxa(
        os.path.join(get_test_directory(), 'before_fail.kdxa'))

    page = document.select('//page')[0]
    first_line = page.select('//line')[0]

    assert first_line.select_first('parent::page').uuid == page.uuid
def test_selector_deep():
    """Check ``//line`` match counts when selecting from a page versus the whole document.

    Selecting from the first page is expected to return 63 lines, while the
    same selector on the document returns 3143.
    """
    # os.path.join (as used by test_table_data_store) is safe whether or not
    # the test directory string ends with a path separator.
    document = Document.from_kdxa(
        os.path.join(get_test_directory(), 'before_fail.kdxa'))

    first_page = document.select('//page')[0]
    assert len(first_page.select('//line')) == 63
    assert len(document.select('//line')) == 3143
def test_fax2tagging():
    """Tag a fixed span of the fax document as ``phone`` and verify the tagged text.

    Positions 146-158 of the full content are tagged; the first node carrying
    the tag must have ``'785-368-1772'`` both as its content and as the
    ``value`` of its ``phone`` tag feature.
    """
    # os.path.join (as used by test_table_data_store) is safe whether or not
    # the test directory string ends with a path separator.
    kdxa_doc = Document.from_kdxa(
        os.path.join(get_test_directory(), 'fax2.kdxa'))

    kdxa_doc.content_node.tag("phone", use_all_content=True,
                              fixed_position=[146, 158])

    phone_nodes = kdxa_doc.select("//*[hasTag('phone')]")
    # Fail with a readable message instead of an IndexError on the [0] below.
    assert phone_nodes, "no node carries the 'phone' tag after tagging"
    phone_node = phone_nodes[0]

    assert phone_node.content == '785-368-1772'
    assert phone_node.get_feature_value("tag", "phone")['value'] == '785-368-1772'