Example #1
def test_table_data_store():
    # Testing with 'include_node_content' set to True.  Should result in 3 columns
    pipeline = Pipeline(
        Document.from_kdxa(
            os.path.join(get_test_directory(), 'tongue_twister.kdxa')))
    pipeline.add_step(
        NodeTagger(selector='//*[contentRegex(".*flue.*")]',
                   tag_to_apply='has_flue',
                   node_only=True,
                   node_tag_uuid='test'))
    pipeline.add_step(
        TagsToKeyValuePairExtractor(store_name='tagged_data',
                                    include_node_content=True))
    context = pipeline.run()

    compare_store(context, 'tagged_data', 'basic_store_tagged_data1.json')

    # Testing with 'include_node_content' set to False.  Should result in 2 columns
    pipeline2 = Pipeline(
        Document.from_kdxa(
            os.path.join(get_test_directory(), 'tongue_twister.kdxa')))
    pipeline2.add_step(
        NodeTagger(selector='//*[contentRegex(".*flue.*")]',
                   tag_to_apply='has_flue',
                   node_only=True))
    pipeline2.add_step(
        TagsToKeyValuePairExtractor(store_name='tagged_data_2',
                                    include_node_content=False))
    context2 = pipeline2.run()

    compare_store(context2, 'tagged_data_2', 'basic_store_tagged_data2.json')
Example #2
def get_test_document():
    document = Document(DocumentMetadata())
    node = document.create_node(node_type='foo')
    node.content = "cheese"
    document.content_node = node

    document.content_node.add_child(
        document.create_node(node_type='bar', content='fishstick'))
    return document
Example #3
def test_tagged_content():
    document = Document.from_msgpack(open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb').read())

    all_nodes = document.content_node.select('//*[hasTag($entityName)]', {"entityName": "ORG"})
    assert len(all_nodes) == 9

    all_nodes = document.content_node.select('//p stream *[hasTag("ORG")] stream *[hasTag("ORG")]')
    assert len(all_nodes) == 7

    all_nodes = document.content_node.select('//p intersect //*[hasTag("ORG")]')
    assert len(all_nodes) == 7

    # hasTag() with no argument matches nodes that carry any tag at all
    tagged_nodes = document.content_node.select('//*[hasTag()]')
    assert len(tagged_nodes) == 22

    feature_nodes = document.content_node.select('//*[hasFeature()]')
    assert len(feature_nodes) == 32

    all_nodes = document.content_node.select('//*[hasTag("ORG")]')
    assert len(all_nodes) == 9

    union_nodes = document.content_node.select('//*[hasTag("ORG")] | //*[hasTag("ORG")]')
    assert len(union_nodes) == 18

    node_match = all_nodes[0].select('*[tagRegex("O.*")]')
    assert len(node_match) == 1

    node_match2 = all_nodes[0].select('*[tagRegex("CHE.*")]')
    assert len(node_match2) == 0
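
A minimal end-to-end sketch of the tagging and selector functions exercised above (hasTag, tagRegex) on a small in-memory document; this assumes the same tag()/select() API used in the other examples here, with the fixed_position form of tag() borrowed from the later tests:

from kodexa import Document

doc = Document.from_text("Acme Corp announced quarterly earnings")
doc.content_node.tag('ORG', fixed_position=[0, 9])    # tag the span "Acme Corp"

assert doc.content_node.get_tag_values('ORG')[0] == 'Acme Corp'
assert len(doc.select('//*[hasTag("ORG")]')) == 1     # this specific tag
assert len(doc.select('//*[hasTag()]')) == 1          # any tag at all
assert len(doc.select('//*[hasTag("PERSON")]')) == 0  # tag not present
assert len(doc.content_node.select('*[tagRegex("OR.*")]')) == 1  # regex over tag names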
Example #4
def test_uuid_select():
    document = Document.from_msgpack(open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb').read())
    node_uuid = document.select_first('//p').uuid
    print(document.select_first('//p').uuid)
    print(document.select_first('//p').content)

    assert document.select_first(f'//p[uuid({node_uuid})]').content == document.select_first('//p').content
Example #5
def test_tag_regex():
    document = Document.from_text("Hello World")
    results = document.content_node.select('*[typeRegex("te.*")]')
    assert len(results) == 1
    assert results[0].content == "Hello World"
    results2 = document.content_node.select('*[typeRegex("chee.*")]')
    assert len(results2) == 0
Example #6
def test_selector_regex():
    document = Document.from_text("Hello World")

    results = document.select('hasTag() = false()')
    assert len(results) == 1

    results = document.select('hasTag()')
    assert len(results) == 0

    results = document.content_node.select('.')
    assert len(results) == 1
    assert results[0].content == "Hello World"

    results = document.content_node.select('*[contentRegex("Hello.*")]')
    assert len(results) == 1
    assert results[0].content == "Hello World"

    results2 = document.content_node.select('*[contentRegex("Cheese.*")]')
    assert len(results2) == 0

    results = document.content_node.select('*[content()="Hello World"]')
    assert len(results) == 1
    assert results[0].content == "Hello World"

    results2 = document.content_node.select('*[contentRegex("Cheese.*",true)]')
    assert len(results2) == 0
Example #7
def test_html_rollup():
    document = Document.from_msgpack(
        open(os.path.join(get_test_directory(), 'news.kdxa'), 'rb').read())

    # before rollup
    assert document.select('//a')[0].content == 'HSBC'
    assert document.select('//a')[1].content == 'Hang Seng Index'
    assert len(
        document.select('//*[contentRegex(".*Hang Seng Index.*")]')
        [0].get_content_parts()) == 1

    # Collapse out all the <a> tags
    step = RollupTransformer(collapse_type_res=["a"])
    step.process(document)

    # after rollup
    assert len(document.select('//a')) == 0
    # see where the link content rolled up
    assert document.select(
        '//*[contentRegex(".*Hang Seng Index.*")]'
    )[0].get_all_content(
    ) == 'The London-headquartered bank is a heavyweight component of the  Hang Seng Index . HSBC shares in Hong Kong closed 2.78% lower.'
    assert len(
        document.select('//*[contentRegex(".*Hang Seng Index.*")]')
        [0].get_content_parts()) == 3
Example #8
def test_node_only_tagging():
    doc = Document.from_text("Hello World")

    doc.content_node.tag(node_only=True, content_re="Hello World", tag_to_apply="test")
    assert len(doc.content_node.get_tag_values("test")) == 1

    doc.content_node.tag(node_only=True, content_re="Hello Cheese", tag_to_apply="test2")
    assert len(doc.content_node.get_tag_values("test2")) == 0
Example #9
def test_basic_local_document_store():
    lds = LocalDocumentStore(store_path='/tmp/s1', force_initialize=True)
    lds.put('my-doc', Document.from_text('hello!'))

    assert len(lds.list_objects()) == 1

    lds2 = LocalDocumentStore(store_path='/tmp/s1')
    assert len(lds2.list_objects()) == 1
Example #10
def test_html_rollup():
    document = Document.from_msgpack(
        open(os.path.join(get_test_directory(), 'news.mdoc'), 'rb').read())

    # Collapse out all the <a> tags
    step = Rollup(collapse_type_res=["a"])
    result = step.process(document)
    print(DocumentRender(result).to_text())
Example #11
def test_tag_key_value_include_exclude():
    # Testing include parameter
    include_tags = ['DATE', 'LOC']
    document = Document.from_msgpack(
        open(os.path.join(get_test_directory(), 'news-tagged.kdxa'),
             'rb').read())
    step = TagsToKeyValuePairExtractor(store_name='test_store',
                                       include=include_tags)
    context = PipelineContext()
    step.process(document, context)
    assert context.get_store('test_store').count() == 11

    # Testing exclude parameter
    exclude_tags = ['DATE', 'LOC']
    document = Document.from_msgpack(
        open(os.path.join(get_test_directory(), 'news-tagged.kdxa'),
             'rb').read())
    step = TagsToKeyValuePairExtractor(store_name='test_store',
                                       exclude=exclude_tags)
    context = PipelineContext()
    step.process(document, context)
    assert context.get_store('test_store').count() == 34

    # Testing both include and exclude parameters
    include_tags = ['LOC']
    exclude_tags = ['DATE']
    document = Document.from_msgpack(
        open(os.path.join(get_test_directory(), 'news-tagged.kdxa'),
             'rb').read())
    step = TagsToKeyValuePairExtractor(store_name='test_store',
                                       include=include_tags,
                                       exclude=exclude_tags)
    context = PipelineContext()
    step.process(document, context)
    assert context.get_store('test_store').count() == 5

    # Testing include on its own - the count should match the previous case,
    # since excluding 'DATE' removed nothing beyond what including only 'LOC' already filtered out
    include_tags = ['LOC']
    document = Document.from_msgpack(
        open(os.path.join(get_test_directory(), 'news-tagged.kdxa'),
             'rb').read())
    step = TagsToKeyValuePairExtractor(store_name='test_store',
                                       include=include_tags)
    context = PipelineContext()
    step.process(document, context)
    assert context.get_store('test_store').count() == 5
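
A quick arithmetic cross-check of those counts (the unfiltered extractor test later in this file asserts 45 rows in total):

total_rows = 45       # unfiltered TagsToKeyValuePairExtractor count (see test_tag_key_value below)
date_and_loc = 11     # include=['DATE', 'LOC']
loc_only = 5          # include=['LOC']
assert total_rows - date_and_loc == 34   # matches the exclude=['DATE', 'LOC'] count
assert date_and_loc - loc_only == 6      # implied number of 'DATE' rows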
Example #12
def test_tag_multiple_regex_matches():
    doc_string = "Mary had a little lamb, little lamb, little lamb.  Mary had a little lamb whose fleece was white as snow."

    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False))
    context = pipeline.run()

    tags = context.output_document.get_root().get_all_tags()
    assert len(tags) == 1

    # we expect 4 tag features to be applied, one for each instance of the word 'little'
    feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(feature_values) == list and len(feature_values) == 4
    assert feature_values[2]['start'] == 37
    assert feature_values[2]['end'] == 43

    # Because we didn't pass in a tag_uuid to the NodeTagger, each of the feature values should have a different UUID
    features_uuids = list(set(dic['uuid'] for dic in feature_values))
    assert len(features_uuids) == 4

    # Run the multiple tag test again, but this time pass in a tag_uuid
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False,
                                 node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # Now each of the feature values should have the same UUID
    feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    features_uuids = list(set(dic['uuid'] for dic in feature_values))
    assert len(features_uuids) == 1

    # Now test that tagging the entire node, rather than references within the node, produces only 1 feature
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*', node_only=True))
    context = pipeline.run()

    tags = context.output_document.get_root().get_all_tags()
    assert len(tags) == 1

    # we expect one tag to be applied and there to be no start or end value
    feature_values = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert feature_values['start'] is None and feature_values['end'] is None
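
The asserted start/end offsets line up with plain regex offsets over the source string; a quick standard-library check:

import re

doc_string = ("Mary had a little lamb, little lamb, little lamb.  "
              "Mary had a little lamb whose fleece was white as snow.")
spans = [m.span() for m in re.finditer(r'little', doc_string)]
assert len(spans) == 4        # four matches, hence four tag features
assert spans[2] == (37, 43)   # the third match - the asserted start/end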
Example #13
def test_parent_axis():
    document = Document.from_msgpack(open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb').read())
    first_paragraph = document.select('(//p)[0]')
    assert len(first_paragraph) == 1
    assert len(first_paragraph[0].select('parent::div')) == 1
    assert first_paragraph[0].select('parent::div')[0].node_type == 'div'

    link = document.select('//a')[0]
    assert link.select('parent::div')[0].node_type == 'div'
Example #14
def test_instance_indexes():
    document = Document.from_msgpack(open(os.path.join(get_test_directory(), 'news-tagged.kdxa'), 'rb').read())
    first_paragraph = document.select('(//p)[0]')
    assert len(first_paragraph) == 1

    # Note this is important - the index here is not the position in the results
    # but the index of the node itself
    first_paragraph = document.select('//p[0]')
    assert len(first_paragraph) == 18
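
A minimal sketch of that distinction on a hand-built document; this assumes, as the comment above implies, that '//p[0]' filters on each node's own index within its parent and that add_child_content assigns indexes 0, 1, ... in order:

from kodexa import Document

doc = Document.from_text("root")
doc.content_node.add_child_content('p', 'first')
doc.content_node.add_child_content('p', 'second')

assert doc.select('(//p)[0]')[0].content == 'first'   # position in the result set
assert doc.select('(//p)[1]')[0].content == 'second'
assert len(doc.select('//p[0]')) == 1                 # only the child whose own index is 0
assert len(doc.select('//p')) == 2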
Example #15
def test_fixed_tagging_remove():
    doc = Document.from_text("Hello Philip")
    doc.content_node.tag('name', fixed_position=[6, 12])

    assert doc.content_node.get_tag_values('name')[0] == 'Philip'

    doc.content_node.remove_tag('name')

    assert len(doc.content_node.get_tag_values('name')) == 0
Example #16
def test_tag_key_value():
    document = Document.from_msgpack(
        open(os.path.join(get_test_directory(), 'news-tagged.mdoc'),
             'rb').read())

    # Extract all tags into key/value pairs in the 'test_store' table store
    step = ExtractTagsToKeyValuePair(store_name='test_store')
    context = PipelineContext()
    result = step.process(document, context)
    print(context.get_store('test_store').rows)
Example #17
def simplify_document(document: Document) -> dict:
    """

    Args:
      document: Document:

    Returns:

    """
    return {"content_node": simplify_node(document.get_root())}
Example #18
def test_tag_key_value():
    document = Document.from_msgpack(
        open(os.path.join(get_test_directory(), 'news-tagged.kdxa'),
             'rb').read())
    step = TagsToKeyValuePairExtractor(store_name='test_store')
    context = PipelineContext()
    step.process(document, context)

    assert context.get_store('test_store').count() == 45
    assert context.get_store('test_store').rows[14][0] == 'LOC'
    assert context.get_store('test_store').rows[14][1] == 'Europe'
Example #19
def test_selector_complex_doc_1():
    document = Document.from_msgpack(open(os.path.join(get_test_directory(), 'news.kdxa'), 'rb').read())
    all_nodes = document.content_node.select('//*')
    assert len(all_nodes) == 39

    all_ps = document.content_node.select('//p')
    assert len(all_ps) == 18

    for pos in range(18):
        selected_p = document.content_node.select(f'(//p)[{pos}]')
        assert len(selected_p) == 1
        assert selected_p[0].uuid == all_ps[pos].uuid
Example #20
File: kodexa.py, Project: fossabot/kodexa
    def get_output_document(self, execution):
        final_reference = None
        for document_reference in execution.documentReferences:
            if document_reference.referenceType == 'OUTPUT':
                final_reference = document_reference

        if final_reference:
            doc = requests.get(
                f"{self.cloud_url}/api/sessions/{self.cloud_session.id}/executions/{execution.id}/documents/{final_reference.cloudDocument.id}",
                headers={"x-access-token": self.access_token})
            return Document.from_msgpack(doc.content)
        else:
            return None
Example #21
def test_tagging_issue_with_html():
    kdxa_doc = Document.from_kdxa(get_test_directory() + 'tagging_issue.kdxa')

    all_content = kdxa_doc.content_node.get_all_content(strip=False)
    assert "IIJ" == all_content[707:710]

    # Now we tag the same location and try to get the content from the tag
    kdxa_doc.content_node.tag("test_tag", use_all_content=True, node_only=False, fixed_position=(707, 710))

    node = kdxa_doc.select('//*[hasTag("test_tag")]')[0]
    feature = node.get_feature_value("tag", "test_tag")
    assert feature['value'] == 'IIJ'
    assert "IIJ" == kdxa_doc.select("//*[hasTag('test_tag')]")[0].get_all_content(strip=False)[
                    feature['start']:feature['end']]
Example #22
def test_spatial_doc_sample_two():
    # This test document and this portion of code are a snippet
    # from a test in the spatial actions tests. The saved doc and
    # this section were added to ensure NodeTagger is tested.
    page_footer_re = r'Page \d+ of \d+$'
    document = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    pipeline = Pipeline(document)

    pipeline.add_step(
        NodeTagger(selector='//*[typeRegex("line.*")]', content_re=page_footer_re, tag_to_apply='page_footer'))
    pipeline.run()

    doc = pipeline.context.output_document

    assert doc.get_root() is not None
Example #23
def test_fixed_tagging_with_child():
    doc = Document.from_text("Hello")
    doc.content_node.add_child_content("text", "Philip")
    doc.content_node.add_child_content("text", "Dodds")

    # Hello Philip Dodds
    # 012345678901234567

    assert doc.content_node.get_all_content(strip=False)[6:12] == 'Philip'
    assert doc.content_node.get_all_content(strip=False)[13:18] == 'Dodds'

    doc.content_node.tag('name', fixed_position=[6, 12], separator=" ")

    assert doc.content_node.get_tag_values('name', include_children=True)[0] == 'Philip'
    doc.content_node.tag('lastName', fixed_position=[13, 18], separator=" ")

    assert doc.content_node.get_tag_values('lastName', include_children=True)[0] == 'Dodds'
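
The asserted offsets follow from simple string arithmetic over the space-joined content (standard library only):

combined = " ".join(["Hello", "Philip", "Dodds"])   # mirrors get_all_content with a single-space separator
assert combined[6:12] == 'Philip'
assert combined[13:18] == 'Dodds'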
Example #24
def test_selector_operators():
    document = Document.from_text("Hello World")

    # combining multiple functions

    # Feeling crazy?
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("H.*D")]')) == 0
    # no dice - handle your capitalization correctly! :-)

    assert len(document.content_node.select('//*[typeRegex("te.*") or contentRegex("H.*D")]')) == 1

    # This should obviously return zero nodes, as 'Howdy' isn't in the document
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("Howdy")]')) == 0

    # What about this?  There's an H and a W...
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("H*W")]')) == 0

    # Try that again, but modify the contentRegex
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("H.*W")]')) == 1
    # yea!

    # Another variation - we expect success
    assert len(document.content_node.select('//*[typeRegex("te.*") and contentRegex("H.*d")]')) == 1
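
These results are consistent with contentRegex behaving like an anchored re.match rather than a search; the same patterns checked against the raw string with the standard library:

import re

text = "Hello World"
assert re.match(r'H*W', text) is None        # 'e' follows the 'H', so 'H*W' cannot match from the start
assert re.match(r'H.*W', text) is not None   # '.*' bridges 'ello ' to the 'W'
assert re.match(r'H.*D', text) is None       # capital 'D' never appears
assert re.match(r'H.*d', text) is not None   # the trailing lowercase 'd' does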
Example #25
def test_fax2tagging():
    kdxa_doc = Document.from_kdxa(get_test_directory() + 'fax2.kdxa')

    kdxa_doc.content_node.tag("phone", use_all_content=True, fixed_position=[146, 158])
    assert kdxa_doc.select("//*[hasTag('phone')]")[0].content == '785-368-1772'
    assert kdxa_doc.select("//*[hasTag('phone')]")[0].get_feature_value("tag", "phone")['value'] == '785-368-1772'
Example #26
def test_tag_copy():
    doc_string = "Mary had a little lamb, little lamb, little lamb.  Mary had a little lamb whose fleece was white as snow."
    # data setup - creating a single tag with multiple matches...and then copying it
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False))
    context = pipeline.run()

    # both existing and new tag names must be provided, and they must be different;
    # the calls below break those rules, so they should leave the tags unchanged - test for that first
    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name=None, new_tag_name='NewTagNone')

    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name=None)

    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name='SIZE')

    # verify that the only tag that exists is still 'SIZE'
    assert len(document.get_root().get_all_tags()) == 1
    assert 'SIZE' in document.get_root().get_all_tags()

    # now, let's copy the SIZE tags and create new ones called LAMB_INFO
    # reusing the previously tagged document and testing out NodeTagCopy action
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagCopy(selector='//*[hasTag("SIZE")]', existing_tag_name='SIZE', new_tag_name='LAMB_INFO'))
    context = pipeline.run()

    # we should now have 4 feature values for 'LAMB_INFO' and 4 feature values for 'SIZE' - all with different UUIDs
    size_feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(size_feature_values) == list and len(size_feature_values) == 4

    lamb_info_feature_values = context.output_document.get_root().get_feature_values('tag', 'LAMB_INFO')
    assert type(lamb_info_feature_values) == list and len(lamb_info_feature_values) == 4
    lamb_info_features_uuids = set(dic['uuid'] for dic in lamb_info_feature_values)
    assert len(list(lamb_info_features_uuids)) == 4

    # Now test that tagging the entire node, rather than references within the node, produces only 1 feature
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*', node_only=True))
    context = pipeline.run()

    # now, let's copy the SIZE_2 tags and create new ones called LAMB_INFO (using node's tag_copy)
    for n in document.select('//*[hasTag("SIZE_2")]'):
        n.copy_tag(existing_tag_name='SIZE_2', new_tag_name='LAMB_INFO_2')

    # we should now have 1 feature value for 'LAMB_INFO_2' and 1 feature value for 'SIZE_2'
    size_2_feature_values = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert type(size_2_feature_values) != list
    lamb_info_2_feature_values = context.output_document.get_root().get_feature_value('tag', 'LAMB_INFO_2')
    assert type(lamb_info_2_feature_values) != list

    # now we need to test that when features are related (indicated by the same tag_uuid), they remain related when copying
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(
        NodeTagger(selector='//*', tag_to_apply='FLEECE_INFO', content_re=r'((white|snow))', node_only=False,
                   node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # now, let's copy the FLEECE_INFO tags and create new ones called WOOL_INFO
    pipeline = Pipeline(document)  # reusing the previously tagged document & testing out the NodeTagCopy action
    pipeline.add_step(
        NodeTagCopy(selector='//*[hasTag("FLEECE_INFO")]', existing_tag_name='FLEECE_INFO', new_tag_name='WOOL_INFO'))
    context = pipeline.run()

    # The feature values should have the same UUID - for both WOOL_INFO and FLEECE_INFO
    wool_values = context.output_document.get_root().get_feature_values('tag', 'WOOL_INFO')
    assert type(wool_values) == list and len(wool_values) == 2
    wool_uuids = set(dic['uuid'] for dic in wool_values)
    assert len(list(wool_uuids)) == 1

    fleece_info_values = context.output_document.get_root().get_feature_values('tag', 'FLEECE_INFO')
    assert type(fleece_info_values) == list and len(fleece_info_values) == 2
Example #27
def test_rollup_of_pdf():
    # first test - collapsing words and lines up to their common parent
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    # how many pre-rollup lines?
    assert len(test_doc.select('//line')) == 3824
    # how many pre-rollup words?
    assert len(test_doc.select('//word')) == 52903
    # how many pre-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the pre-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(collapse_type_res=["word", "line"],
                          separator_character=' '))
    rollup_pipeline.run()

    collapsed_doc = rollup_pipeline.context.output_document

    # how many post-rollup lines?
    assert len(test_doc.select('//line')) == 0
    # how many post-rollup words?
    assert len(test_doc.select('//word')) == 0
    # how many post-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    assert len(
        collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # second test - collapse only the lines up to their parent (content-area), rolling up each line's children
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(collapse_type_res=["line"],
                          separator_character=' ',
                          get_all_content=True))
    rollup_pipeline.run()

    collapsed_doc = rollup_pipeline.context.output_document

    # how many post-rollup lines?
    assert len(test_doc.select('//line')) == 0
    # how many post-rollup words?
    assert len(test_doc.select('//word')) == 0
    # how many post-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    # verify that we can collapse line nodes AND include their children
    assert len(
        collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # third test - select specific nodes on which to perform the rollups
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    node_selector = "//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]"

    # verify we have 3 nodes match this selector
    node_matches = test_doc.select(node_selector)
    assert len(node_matches) == 3

    # before we rollup, let's make sure the matching nodes conform to known expectations
    assert len(node_matches[0].select('//word')) == 2
    assert len(node_matches[0].select('//line')) == 1
    assert len(node_matches[0].select('//content-area')) == 1
    assert len(node_matches[0].get_all_content()) == 14

    assert len(node_matches[1].select('//word')) == 2
    assert len(node_matches[1].select('//line')) == 1
    assert len(node_matches[1].select('//content-area')) == 1
    assert len(node_matches[1].get_all_content()) == 14

    assert len(node_matches[2].select('//word')) == 71
    assert len(node_matches[2].select('//line')) == 6
    assert len(node_matches[2].select('//content-area')) == 1
    assert len(node_matches[2].get_all_content()) == 500

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(
            selector="//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]",
            collapse_type_res=["line"],
            separator_character=' ',
            get_all_content=True))
    rollup_pipeline.run()

    collapsed_doc = rollup_pipeline.context.output_document

    # check those matching nodes - we shouldn't have any words or lines, but
    # all other node_types should exist and the content should stay the same.
    assert len(node_matches[0].select('//word')) == 0
    assert len(node_matches[0].select('//line')) == 0
    assert len(node_matches[0].select('//content-area')) == 1
    assert len(node_matches[0].get_all_content()) == 14

    assert len(node_matches[1].select('//word')) == 0
    assert len(node_matches[1].select('//line')) == 0
    assert len(node_matches[1].select('//content-area')) == 1
    assert len(node_matches[1].get_all_content()) == 14

    assert len(node_matches[2].select('//word')) == 0
    assert len(node_matches[2].select('//line')) == 0
    assert len(node_matches[2].select('//content-area')) == 1
    assert len(node_matches[2].get_all_content()) == 500

    # how many post-rollup lines? (still have some lines, but fewer than we started with)
    assert len(test_doc.select('//line')) == 3816
    # how many post-rollup words? (still have some words, but fewer than we started with)
    assert len(test_doc.select('//word')) == 52828
    # how many post-rollup content-areas? (same number of content-areas)
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    # verify that we can collapse line nodes AND include their children
    assert len(
        collapsed_doc.select("//content-area")[12].get_all_content()) == 235
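
The post-rollup counts in the third test follow directly from the per-match counts above: the three selected content-areas held 1 + 1 + 6 lines and 2 + 2 + 71 words, which is exactly what the selective rollup removes:

assert 3824 - (1 + 1 + 6) == 3816      # lines remaining after the selective rollup
assert 52903 - (2 + 2 + 71) == 52828   # words remaining after the selective rollup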
Example #28
def test_parent_child():
    document = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    page = document.select('//page')[0]
    assert page.select('//line')[0].select_first('parent::page').uuid == page.uuid
Example #29
def test_selector_deep():
    document = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    assert len(document.select('//page')[0].select('//line')) == 63
    assert len(document.select('//line')) == 3143
Example #30
def test_selector_2():
    document = Document.from_text("Hello World")
    results = document.content_node.select('*')
    assert len(results) == 1
    assert results[0].content == "Hello World"