def test_table_data_store():
    """TagsToKeyValuePairExtractor column count depends on include_node_content."""
    # Testing with 'include_node_content' set to True. Should result in 3 columns
    source_doc = Document.from_kdxa(os.path.join(get_test_directory(), 'tongue_twister.kdxa'))
    pipeline = Pipeline(source_doc)
    pipeline.add_step(
        NodeTagger(selector='//*[contentRegex(".*flue.*")]',
                   tag_to_apply='has_flue', node_only=True, node_tag_uuid='test'))
    pipeline.add_step(
        TagsToKeyValuePairExtractor(store_name='tagged_data', include_node_content=True))
    context = pipeline.run()
    compare_store(context, 'tagged_data', 'basic_store_tagged_data1.json')

    # Testing with 'include_node_content' set to False. Should result in 2 columns
    source_doc2 = Document.from_kdxa(os.path.join(get_test_directory(), 'tongue_twister.kdxa'))
    pipeline2 = Pipeline(source_doc2)
    pipeline2.add_step(
        NodeTagger(selector='//*[contentRegex(".*flue.*")]',
                   tag_to_apply='has_flue', node_only=True))
    pipeline2.add_step(
        TagsToKeyValuePairExtractor(store_name='tagged_data_2', include_node_content=False))
    context2 = pipeline2.run()
    compare_store(context2, 'tagged_data_2', 'basic_store_tagged_data2.json')
def get_test_document():
    """Build a minimal two-node fixture: a 'foo' root with one 'bar' child."""
    document = Document(DocumentMetadata())
    root = document.create_node(node_type='foo')
    root.content = "cheese"
    document.content_node = root
    child = document.create_node(node_type='bar', content='fishstick')
    root.add_child(child)
    return document
def test_tagged_content():
    """Exercise tag-related selectors: hasTag, hasFeature, tagRegex, stream, intersect and union."""
    # use a context manager so the fixture file handle is closed (the original leaked it)
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    # parameterized selector: $entityName is bound from the variables dict
    all_nodes = document.content_node.select('//*[hasTag($entityName)]', {"entityName": "ORG"})
    assert len(all_nodes) == 9

    all_nodes = document.content_node.select('//p stream *[hasTag("ORG")] stream *[hasTag("ORG")]')
    assert len(all_nodes) == 7

    all_nodes = document.content_node.select('//p intersect //*[hasTag("ORG")]')
    assert len(all_nodes) == 7

    # Has any tag to start
    tagged_nodes = document.content_node.select('//*[hasTag()]')
    assert len(tagged_nodes) == 22

    feature_nodes = document.content_node.select('//*[hasFeature()]')
    assert len(feature_nodes) == 32

    all_nodes = document.content_node.select('//*[hasTag("ORG")]')
    assert len(all_nodes) == 9

    # union keeps duplicates, hence double the count
    union_nodes = document.content_node.select('//*[hasTag("ORG")] | //*[hasTag("ORG")]')
    assert len(union_nodes) == 18

    node_match = all_nodes[0].select('*[tagRegex("O.*")]')
    assert len(node_match) == 1

    node_match2 = all_nodes[0].select('*[tagRegex("CHE.*")]')
    assert len(node_match2) == 0
def test_uuid_select():
    """Selecting a paragraph by its uuid() must return the same node as a plain //p select."""
    # close the fixture file handle (the original leaked it) and drop the stray debug prints
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())
    node_uuid = document.select_first('//p').uuid
    assert document.select_first(f'//p[uuid({node_uuid})]').content == document.select_first('//p').content
def test_tag_regex():
    """typeRegex should match node types by regular expression."""
    document = Document.from_text("Hello World")

    matched = document.content_node.select('*[typeRegex("te.*")]')
    assert len(matched) == 1
    assert matched[0].content == "Hello World"

    # a pattern that matches no node type returns an empty result
    unmatched = document.content_node.select('*[typeRegex("chee.*")]')
    assert len(unmatched) == 0
def test_selector_regex():
    """Exercise hasTag(), '.', contentRegex and content() selectors on a single text node."""
    document = Document.from_text("Hello World")

    # the freshly created document has no tags yet
    matches = document.select('hasTag() = false()')
    assert len(matches) == 1
    matches = document.select('hasTag()')
    assert len(matches) == 0

    matches = document.content_node.select('.')
    assert len(matches) == 1
    assert matches[0].content == "Hello World"

    matches = document.content_node.select('*[contentRegex("Hello.*")]')
    assert len(matches) == 1
    assert matches[0].content == "Hello World"

    misses = document.content_node.select('*[contentRegex("Cheese.*")]')
    assert len(misses) == 0

    matches = document.content_node.select('*[content()="Hello World"]')
    assert len(matches) == 1
    assert matches[0].content == "Hello World"

    misses = document.content_node.select('*[contentRegex("Cheese.*",true)]')
    assert len(misses) == 0
def test_html_rollup():
    """RollupTransformer should collapse <a> nodes into their parents, merging content parts."""
    # close the fixture file handle (the original leaked it)
    with open(os.path.join(get_test_directory(), 'news.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    # before rollup
    assert document.select('//a')[0].content == 'HSBC'
    assert document.select('//a')[1].content == 'Hang Seng Index'
    assert len(document.select('//*[contentRegex(".*Hang Seng Index.*")]')[0].get_content_parts()) == 1

    # Collapse out all the <a> tags
    step = RollupTransformer(collapse_type_res=["a"])
    step.process(document)

    # after rollup
    assert len(document.select('//a')) == 0
    # see where the href rolled up
    assert document.select('//*[contentRegex(".*Hang Seng Index.*")]')[0].get_all_content() == \
           'The London-headquartered bank is a heavyweight component of the Hang Seng Index . HSBC shares in Hong Kong closed 2.78% lower.'
    # the collapsed anchors now contribute separate content parts to the parent
    assert len(document.select('//*[contentRegex(".*Hang Seng Index.*")]')[0].get_content_parts()) == 3
def test_node_only_tagging():
    """With node_only=True the tag is applied only when content_re matches the node content."""
    doc = Document.from_text("Hello World")

    doc.content_node.tag(node_only=True, content_re="Hello World", tag_to_apply="test")
    assert len(doc.content_node.get_tag_values("test")) == 1

    # a non-matching content_re must not apply any tag
    doc.content_node.tag(node_only=True, content_re="Hello Cheese", tag_to_apply="test2")
    assert len(doc.content_node.get_tag_values("test2")) == 0
def test_basic_local_document_store():
    """Documents stored in a LocalDocumentStore should survive re-opening the same path."""
    store = LocalDocumentStore(store_path='/tmp/s1', force_initialize=True)
    store.put('my-doc', Document.from_text('hello!'))
    assert len(store.list_objects()) == 1

    # re-open the same path without re-initializing; the stored document must still be there
    reopened = LocalDocumentStore(store_path='/tmp/s1')
    assert len(reopened.list_objects()) == 1
def test_html_rollup_mdoc():
    """Rollup should collapse <a> nodes in the mdoc fixture and still render to text.

    Renamed from ``test_html_rollup``: the file already defines a test with that
    name (kdxa-based), and duplicate names mean only the later definition is
    collected by pytest.
    """
    # close the fixture file handle (the original leaked it)
    with open(os.path.join(get_test_directory(), 'news.mdoc'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    # Collapse out all the <a> tags
    step = Rollup(collapse_type_res=["a"])
    result = step.process(document)
    print(DocumentRender(result).to_text())
def test_tag_key_value_include_exclude():
    """TagsToKeyValuePairExtractor should honour its include/exclude tag filters."""

    def _load_tagged_doc():
        # each scenario needs a fresh document because processing mutates it;
        # use a context manager so the file handle is closed (the original leaked four of them)
        with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
            return Document.from_msgpack(doc_file.read())

    # Testing include parameter
    include_tags = ['DATE', 'LOC']
    step = TagsToKeyValuePairExtractor(store_name='test_store', include=include_tags)
    context = PipelineContext()
    step.process(_load_tagged_doc(), context)
    assert context.get_store('test_store').count() == 11

    # Testing exclude parameter
    exclude_tags = ['DATE', 'LOC']
    step = TagsToKeyValuePairExtractor(store_name='test_store', exclude=exclude_tags)
    context = PipelineContext()
    step.process(_load_tagged_doc(), context)
    assert context.get_store('test_store').count() == 34

    # Testing both include and exclude parameters
    include_tags = ['LOC']
    exclude_tags = ['DATE']
    step = TagsToKeyValuePairExtractor(store_name='test_store', include=include_tags, exclude=exclude_tags)
    context = PipelineContext()
    step.process(_load_tagged_doc(), context)
    assert context.get_store('test_store').count() == 5

    # Testing both include - this should be the same as before as 'exclude' shouldn't have really done anything
    include_tags = ['LOC']
    step = TagsToKeyValuePairExtractor(store_name='test_store', include=include_tags)
    context = PipelineContext()
    step.process(_load_tagged_doc(), context)
    assert context.get_store('test_store').count() == 5
def test_tag_multiple_regex_matches():
    """A content_re with several matches should create one feature value per match."""
    doc_string = "Mary had a little lamb, little lamb, little lamb. Mary had a little lamb whose fleece was white as snow."

    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False))
    context = pipeline.run()

    all_tags = context.output_document.get_root().get_all_tags()
    assert len(all_tags) == 1

    # we expect 4 tags to be applied, one for each instance of the word 'little'
    size_features = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(size_features) == list and len(size_features) == 4
    assert size_features[2]['start'] == 37
    assert size_features[2]['end'] == 43

    # Because we didn't pass in a tag_uuid to the NodeTagger, each of the feature values should have a different UUID
    distinct_uuids = list(set(dic['uuid'] for dic in size_features))
    assert len(distinct_uuids) == 4

    # Run the multiple tag test again, but this time pass in a tag_uuid
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)',
                                 node_only=False, node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # Now each of the feature values should have the same UUID
    size_features = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    distinct_uuids = list(set(dic['uuid'] for dic in size_features))
    assert len(distinct_uuids) == 1

    # Now test that tagging the entire node, rather than references within the node, only produce 1 feature
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*', node_only=True))
    context = pipeline.run()

    all_tags = context.output_document.get_root().get_all_tags()
    assert len(all_tags) == 1

    # we expect one tag to be applied and there to be no start or end value
    whole_node_feature = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert whole_node_feature['start'] is None and whole_node_feature['end'] is None
def test_parent_axis():
    """The parent:: axis should navigate from a node to its enclosing element."""
    # close the fixture file handle (the original leaked it)
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    first_paragraph = document.select('(//p)[0]')
    assert len(first_paragraph) == 1
    assert len(first_paragraph[0].select('parent::div')) == 1
    assert first_paragraph[0].select('parent::div')[0].node_type == 'div'

    link = document.select('//a')[0]
    assert link.select('parent::div')[0].node_type == 'div'
def test_instance_indexes():
    """(//p)[n] indexes the result list, while //p[n] filters by each node's own index."""
    # close the fixture file handle (the original leaked it)
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    first_paragraph = document.select('(//p)[0]')
    assert len(first_paragraph) == 1

    # Note this is important - the index here is not the position in the results
    # but the index of the node itself
    first_paragraph = document.select('//p[0]')
    assert len(first_paragraph) == 18
def test_fixed_tagging_remove():
    """A tag applied at a fixed character range can be read back and then removed."""
    doc = Document.from_text("Hello Philip")

    # characters 6-12 cover 'Philip'
    doc.content_node.tag('name', fixed_position=[6, 12])
    assert doc.content_node.get_tag_values('name')[0] == 'Philip'

    doc.content_node.remove_tag('name')
    assert len(doc.content_node.get_tag_values('name')) == 0
def test_tag_key_value_mdoc():
    """ExtractTagsToKeyValuePair should populate the named store from the mdoc fixture.

    Renamed from ``test_tag_key_value``: the file defines a later test with that
    name (kdxa-based), and duplicate names mean only the later definition is
    collected by pytest.
    """
    # close the fixture file handle (the original leaked it)
    with open(os.path.join(get_test_directory(), 'news-tagged.mdoc'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    step = ExtractTagsToKeyValuePair(store_name='test_store')
    context = PipelineContext()
    result = step.process(document, context)
    print(context.get_store('test_store').rows)
def simplify_document(document: Document) -> dict:
    """Reduce a Document to a plain dict rooted at its simplified content node.

    Args:
        document: the Document to simplify.

    Returns:
        A dict with a single "content_node" key holding the simplified root node.
    """
    simplified_root = simplify_node(document.get_root())
    return {"content_node": simplified_root}
def test_tag_key_value():
    """TagsToKeyValuePairExtractor should write one row per tag into the named store."""
    # close the fixture file handle (the original leaked it)
    with open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    step = TagsToKeyValuePairExtractor(store_name='test_store')
    context = PipelineContext()
    step.process(document, context)

    assert context.get_store('test_store').count() == 45
    # spot-check a known row: tag name in column 0, tagged value in column 1
    assert context.get_store('test_store').rows[14][0] == 'LOC'
    assert context.get_store('test_store').rows[14][1] == 'Europe'
def test_selector_complex_doc_1():
    """Positional (//p)[n] selection should line up with the full //p result list."""
    # close the fixture file handle (the original leaked it)
    with open(os.path.join(get_test_directory(), 'news.kdxa'), 'rb') as doc_file:
        document = Document.from_msgpack(doc_file.read())

    all_nodes = document.content_node.select('//*')
    assert len(all_nodes) == 39

    all_ps = document.content_node.select('//p')
    assert len(all_ps) == 18

    # iterate over every paragraph (len asserted above) instead of a hard-coded 18
    for pos in range(len(all_ps)):
        selected_p = document.content_node.select(f'(//p)[{pos}]')
        assert len(selected_p) == 1
        assert selected_p[0].uuid == all_ps[pos].uuid
def get_output_document(self, execution):
    """Fetch the OUTPUT document of an execution from the cloud session.

    Returns the deserialized Document, or None when the execution has no
    OUTPUT document reference.
    """
    output_reference = None
    for document_reference in execution.documentReferences:
        if document_reference.referenceType == 'OUTPUT':
            output_reference = document_reference

    # no OUTPUT reference means there is nothing to fetch
    if not output_reference:
        return None

    response = requests.get(
        f"{self.cloud_url}/api/sessions/{self.cloud_session.id}/executions/{execution.id}/documents/{output_reference.cloudDocument.id}",
        headers={"x-access-token": self.access_token})
    return Document.from_msgpack(response.content)
def test_tagging_issue_with_html():
    """A fixed position computed against all-content should tag exactly that text."""
    kdxa_doc = Document.from_kdxa(get_test_directory() + 'tagging_issue.kdxa')
    full_text = kdxa_doc.content_node.get_all_content(strip=False)
    assert "IIJ" == full_text[707:710]

    # Now we tag the same location and try and get the content from the tag
    kdxa_doc.content_node.tag("test_tag", use_all_content=True, node_only=False,
                              fixed_position=(707, 710))
    tagged_node = kdxa_doc.select('//*[hasTag("test_tag")]')[0]
    tag_feature = tagged_node.get_feature_value("tag", "test_tag")
    assert tag_feature['value'] == 'IIJ'
    assert "IIJ" == kdxa_doc.select("//*[hasTag('test_tag')]")[0].get_all_content(strip=False)[
        tag_feature['start']:tag_feature['end']]
def test_spatial_doc_sample_two():
    """Regression snippet ensuring NodeTagger runs against line nodes of a spatial doc."""
    # This test document and this portion of code is a snippet
    # from a test in the spatial actions tests. Adding this saved doc
    # and this section to ensure NodeTagger is tested.
    page_footer_re = r'Page \d+ of \d+$'

    document = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    pipeline = Pipeline(document)
    pipeline.add_step(
        NodeTagger(selector='//*[typeRegex("line.*")]',
                   content_re=page_footer_re,
                   tag_to_apply='page_footer'))
    pipeline.run()

    tagged_doc = pipeline.context.output_document
    assert tagged_doc.get_root() is not None
def test_fixed_tagging_with_child():
    """Fixed-position tagging should span child node content joined by the separator."""
    doc = Document.from_text("Hello")
    doc.content_node.add_child_content("text", "Philip")
    doc.content_node.add_child_content("text", "Dodds")

    # Hello Philip Dodds
    # 012345678901234567
    assert doc.content_node.get_all_content(strip=False)[6:12] == 'Philip'
    assert doc.content_node.get_all_content(strip=False)[13:18] == 'Dodds'

    doc.content_node.tag('name', fixed_position=[6, 12], separator=" ")
    assert doc.content_node.get_tag_values('name', include_children=True)[0] == 'Philip'

    doc.content_node.tag('lastName', fixed_position=[13, 18], separator=" ")
    assert doc.content_node.get_tag_values('lastName', include_children=True)[0] == 'Dodds'
def test_selector_operators():
    """Combine typeRegex/contentRegex with the 'and'/'or' selector operators."""
    document = Document.from_text("Hello World")
    root = document.content_node

    # combining multiple functions
    # Feeling crazy?
    assert len(root.select('//*[typeRegex("te.*") and contentRegex("H.*D")]')) == 0
    # no dice - handle your capitalization correctly! :-)
    assert len(root.select('//*[typeRegex("te.*") or contentRegex("H.*D")]')) == 1

    # This should obviously return zero nodes, as 'Howdy' isn't in the document
    assert len(root.select('//*[typeRegex("te.*") and contentRegex("Howdy")]')) == 0

    # What about this? There's an H and a W...
    assert len(root.select('//*[typeRegex("te.*") and contentRegex("H*W")]')) == 0
    # Try that again, but modify the contentRegex
    assert len(root.select('//*[typeRegex("te.*") and contentRegex("H.*W")]')) == 1  # yea!

    # Another variation - we expect success
    assert len(root.select('//*[typeRegex("te.*") and contentRegex("H.*d")]')) == 1
def test_fax2tagging():
    """Fixed-position tagging against all content should capture a phone number exactly."""
    kdxa_doc = Document.from_kdxa(get_test_directory() + 'fax2.kdxa')
    kdxa_doc.content_node.tag("phone", use_all_content=True, fixed_position=[146, 158])

    tagged_node = kdxa_doc.select("//*[hasTag('phone')]")[0]
    assert tagged_node.content == '785-368-1772'
    assert tagged_node.get_feature_value("tag", "phone")['value'] == '785-368-1772'
def test_tag_copy():
    """Verify tag copying via node.copy_tag and the NodeTagCopy pipeline action."""
    doc_string = "Mary had a little lamb, little lamb, little lamb. Mary had a little lamb whose fleece was white as snow."

    # data setup - creating a single tag with multiple matches...and then copying it
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False))
    context = pipeline.run()

    # both existing and new tag names must be provided, and they must be different, test for that first.
    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name=None, new_tag_name='NewTagNone')
    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name=None)
    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name='SIZE')

    # verify that the only tag that exists is tag 'SIZE' and that there are only 4 feature values for it
    assert len(document.get_root().get_all_tags()) == 1
    assert 'SIZE' in document.get_root().get_all_tags()

    # now, let's copy the SIZE tags and create new ones called LAMB_INFO
    # reusing the previously tagged document and testing out NodeTagCopy action
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagCopy(selector='//*[hasTag("SIZE")]', existing_tag_name='SIZE', new_tag_name='LAMB_INFO'))
    context = pipeline.run()

    # we should now have 4 feature values for 'LAMB_INFO' and 4 feature values for 'SIZE' - all with different UUIDs
    size_features = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(size_features) == list and len(size_features) == 4
    lamb_info_features = context.output_document.get_root().get_feature_values('tag', 'LAMB_INFO')
    assert type(lamb_info_features) == list and len(lamb_info_features) == 4
    lamb_info_uuids = set(dic['uuid'] for dic in lamb_info_features)
    assert len(list(lamb_info_uuids)) == 4

    # Now test that tagging the entire node, rather than references within the node, only produce 1 feature
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*', node_only=True))
    context = pipeline.run()

    # now, let's copy the SIZE_2 tags and create new ones called LAMB_INFO_2 (using node's tag_copy)
    for n in document.select('//*[hasTag("SIZE_2")]'):
        n.copy_tag(existing_tag_name='SIZE_2', new_tag_name='LAMB_INFO_2')

    # we should now have 1 feature value for 'SIZE_2' and 1 feature value for 'LAMB_INFO_2'
    size_2_feature = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert type(size_2_feature) != list
    lamb_info_2_feature = context.output_document.get_root().get_feature_value('tag', 'LAMB_INFO_2')
    assert type(lamb_info_2_feature) != list

    # now we need to test that when features are related (indicated by the same tag_uuid), they remain related when copying
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(
        NodeTagger(selector='//*', tag_to_apply='FLEECE_INFO', content_re=r'((white|snow))', node_only=False,
                   node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # copy the FLEECE_INFO tags to WOOL_INFO via the NodeTagCopy action
    pipeline = Pipeline(document)  # reusing the previously tagged document
    pipeline.add_step(
        NodeTagCopy(selector='//*[hasTag("FLEECE_INFO")]', existing_tag_name='FLEECE_INFO',
                    new_tag_name='WOOL_INFO'))
    context = pipeline.run()

    # The feature values should have the same UUID - for both WOOL_INFO and FLEECE_INFO
    wool_values = context.output_document.get_root().get_feature_values('tag', 'WOOL_INFO')
    assert type(wool_values) == list and len(wool_values) == 2
    wool_uuids = set(dic['uuid'] for dic in wool_values)
    assert len(list(wool_uuids)) == 1
    fleece_info_values = context.output_document.get_root().get_feature_values('tag', 'FLEECE_INFO')
    assert type(fleece_info_values) == list and len(fleece_info_values) == 2
def test_rollup_of_pdf():
    """Exercise RollupTransformer on a PDF-derived document, globally and via a selector."""
    # first test - collapsing words and lines up to their common parent
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    # how many pre-rollup lines?
    assert len(test_doc.select('//line')) == 3824
    # how many pre-rollup words?
    assert len(test_doc.select('//word')) == 52903
    # how many pre-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the pre-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(collapse_type_res=["word", "line"], separator_character=' '))
    rollup_pipeline.run()
    collapsed_doc = rollup_pipeline.context.output_document

    # how many post-rollup lines?
    assert len(test_doc.select('//line')) == 0
    # how many post-rollup words?
    assert len(test_doc.select('//word')) == 0
    # how many post-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792
    assert len(collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # second test - just collapse the line up to its parent (content-area) - roll up the line's children
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')
    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(collapse_type_res=["line"], separator_character=' ', get_all_content=True))
    rollup_pipeline.run()
    collapsed_doc = rollup_pipeline.context.output_document

    # how many post-rollup lines?
    assert len(test_doc.select('//line')) == 0
    # how many post-rollup words?
    assert len(test_doc.select('//word')) == 0
    # how many post-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792
    # verify that we can collapse line nodes AND include their children
    assert len(collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # third test - select specific nodes in which we'll do the roll ups
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')
    node_selector = "//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]"

    # verify we have 3 nodes match this selector
    node_matches = test_doc.select(node_selector)
    assert len(node_matches) == 3

    # before we rollup, let's make sure the matching nodes conform to known expectations
    assert len(node_matches[0].select('//word')) == 2
    assert len(node_matches[0].select('//line')) == 1
    assert len(node_matches[0].select('//content-area')) == 1
    assert len(node_matches[0].get_all_content()) == 14

    assert len(node_matches[1].select('//word')) == 2
    assert len(node_matches[1].select('//line')) == 1
    assert len(node_matches[1].select('//content-area')) == 1
    assert len(node_matches[1].get_all_content()) == 14

    assert len(node_matches[2].select('//word')) == 71
    assert len(node_matches[2].select('//line')) == 6
    assert len(node_matches[2].select('//content-area')) == 1
    assert len(node_matches[2].get_all_content()) == 500

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(
            selector="//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]",
            collapse_type_res=["line"], separator_character=' ', get_all_content=True))
    rollup_pipeline.run()
    collapsed_doc = rollup_pipeline.context.output_document

    # check those matching nodes - we shouldn't have any words or lines, but
    # all other node_types should exist and the content should stay the same.
    assert len(node_matches[0].select('//word')) == 0
    assert len(node_matches[0].select('//line')) == 0
    assert len(node_matches[0].select('//content-area')) == 1
    assert len(node_matches[0].get_all_content()) == 14

    assert len(node_matches[1].select('//word')) == 0
    assert len(node_matches[1].select('//line')) == 0
    assert len(node_matches[1].select('//content-area')) == 1
    assert len(node_matches[1].get_all_content()) == 14

    assert len(node_matches[2].select('//word')) == 0
    assert len(node_matches[2].select('//line')) == 0
    assert len(node_matches[2].select('//content-area')) == 1
    assert len(node_matches[2].get_all_content()) == 500

    # how many post-rollup lines? (still have some lines, but fewer than we started with)
    assert len(test_doc.select('//line')) == 3816
    # how many post-rollup words? (still have some words, but fewer than we started with)
    assert len(test_doc.select('//word')) == 52828
    # how many post-rollup content-areas? (same number of content-areas)
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792
    # verify that we can collapse line nodes AND include their children
    assert len(collapsed_doc.select("//content-area")[12].get_all_content()) == 235
def test_parent_child():
    """A line's parent::page should resolve back to the page it was selected from."""
    document = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    page = document.select('//page')[0]
    first_line = page.select('//line')[0]
    assert first_line.select_first('parent::page').uuid == page.uuid
def test_selector_deep():
    """A deep // select scoped to one page returns fewer lines than the whole document."""
    document = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    first_page = document.select('//page')[0]
    assert len(first_page.select('//line')) == 63
    assert len(document.select('//line')) == 3143
def test_selector_2():
    """A bare '*' select on the content node matches just that node."""
    document = Document.from_text("Hello World")
    selected = document.content_node.select('*')
    assert len(selected) == 1
    assert selected[0].content == "Hello World"