示例#1
0
def test_uuid_select():
    """Check that a node can be selected again through the ``uuid()`` predicate."""
    fixture_path = os.path.join(get_test_directory(), 'news-tagged.kdxa')
    # Read through a context manager so the fixture's file handle is closed
    # deterministically instead of being left open for the rest of the run.
    with open(fixture_path, 'rb') as fixture:
        document = Document.from_msgpack(fixture.read())

    first_paragraph = document.select_first('//p')
    node_uuid = first_paragraph.uuid
    print(node_uuid)
    print(first_paragraph.content)

    # Selecting by UUID must yield a node with the same content as the
    # first paragraph that the UUID was taken from.
    assert document.select_first(f'//p[uuid({node_uuid})]').content == first_paragraph.content
示例#2
0
def test_html_rollup():
    """Check the effect of rolling up all ``a`` nodes in the news fixture.

    Before the rollup the node matching the "Hang Seng Index" regex reports
    one content part; afterwards no ``a`` nodes remain and the matching node
    reports three content parts.
    """
    fixture_path = os.path.join(get_test_directory(), 'news.kdxa')
    # Use a context manager so the fixture's file handle is always closed.
    with open(fixture_path, 'rb') as fixture:
        document = Document.from_msgpack(fixture.read())

    index_selector = '//*[contentRegex(".*Hang Seng Index.*")]'

    # before rollup
    links = document.select('//a')
    assert links[0].content == 'HSBC'
    assert links[1].content == 'Hang Seng Index'
    assert len(document.select(index_selector)[0].get_content_parts()) == 1

    # Collapse out all the <a> tags
    step = RollupTransformer(collapse_type_res=["a"])
    step.process(document)

    # after rollup
    assert len(document.select('//a')) == 0

    # see where the href rolled up
    rolled_up_node = document.select(index_selector)[0]
    assert rolled_up_node.get_all_content() == (
        'The London-headquartered bank is a heavyweight component of the  Hang Seng Index . HSBC shares in Hong Kong closed 2.78% lower.'
    )
    assert len(document.select(index_selector)[0].get_content_parts()) == 3
示例#3
0
def test_tagged_content():
    """Check the tag and feature selectors against the tagged news fixture."""
    fixture_path = os.path.join(get_test_directory(), 'news-tagged.kdxa')
    # Use a context manager so the fixture's file handle is always closed.
    with open(fixture_path, 'rb') as fixture:
        document = Document.from_msgpack(fixture.read())
    root = document.content_node

    # The tag name is supplied through a selector variable here.
    org_by_variable = root.select('//*[hasTag($entityName)]', {"entityName": "ORG"})
    assert len(org_by_variable) == 9

    streamed_nodes = root.select('//p stream *[hasTag("ORG")] stream *[hasTag("ORG")]')
    assert len(streamed_nodes) == 7

    intersected_nodes = root.select('//p intersect //*[hasTag("ORG")]')
    assert len(intersected_nodes) == 7

    # Has any tag to start
    tagged_nodes = root.select('//*[hasTag()]')
    assert len(tagged_nodes) == 22

    feature_nodes = root.select('//*[hasFeature()]')
    assert len(feature_nodes) == 32

    org_nodes = root.select('//*[hasTag("ORG")]')
    assert len(org_nodes) == 9

    # The expected count is twice the 9 ORG nodes, i.e. the union of a
    # selector with itself is expected to keep both sets of matches.
    union_nodes = root.select('//*[hasTag("ORG")] | //*[hasTag("ORG")]')
    assert len(union_nodes) == 18

    # tagRegex is evaluated relative to the first ORG-tagged node.
    node_match = org_nodes[0].select('*[tagRegex("O.*")]')
    assert len(node_match) == 1

    node_match2 = org_nodes[0].select('*[tagRegex("CHE.*")]')
    assert len(node_match2) == 0
示例#4
0
def test_tag_key_value_include_exclude():
    """Check the store row counts for the ``include`` / ``exclude`` options."""
    fixture_path = os.path.join(get_test_directory(), 'news-tagged.kdxa')

    def extracted_row_count(**filter_kwargs):
        # Every scenario runs against a freshly loaded document and a new
        # pipeline context, exactly as the individual scenarios require.
        # The context manager guarantees the fixture file is closed.
        with open(fixture_path, 'rb') as fixture:
            document = Document.from_msgpack(fixture.read())
        step = TagsToKeyValuePairExtractor(store_name='test_store',
                                           **filter_kwargs)
        context = PipelineContext()
        step.process(document, context)
        return context.get_store('test_store').count()

    # Testing include parameter
    assert extracted_row_count(include=['DATE', 'LOC']) == 11

    # Testing exclude parameter
    assert extracted_row_count(exclude=['DATE', 'LOC']) == 34

    # Testing both include and exclude parameters
    assert extracted_row_count(include=['LOC'], exclude=['DATE']) == 5

    # Testing include only - the count should match the previous scenario,
    # because excluding 'DATE' there should not have changed the result
    assert extracted_row_count(include=['LOC']) == 5
示例#5
0
def test_html_rollup():
    """Roll up all ``a`` nodes in the news fixture and print the rendered text."""
    fixture_path = os.path.join(get_test_directory(), 'news.mdoc')
    # Use a context manager so the fixture's file handle is always closed.
    with open(fixture_path, 'rb') as fixture:
        document = Document.from_msgpack(fixture.read())

    # Collapse out all the <a> tags
    step = Rollup(collapse_type_res=["a"])
    result = step.process(document)
    print(DocumentRender(result).to_text())
示例#6
0
def test_instance_indexes():
    """Contrast ``(//p)[0]`` (one node) with ``//p[0]`` (18 nodes)."""
    fixture_path = os.path.join(get_test_directory(), 'news-tagged.kdxa')
    # Use a context manager so the fixture's file handle is always closed.
    with open(fixture_path, 'rb') as fixture:
        document = Document.from_msgpack(fixture.read())

    first_paragraph = document.select('(//p)[0]')
    assert len(first_paragraph) == 1

    # Note this is important - the index here is not the position in the results
    # but the index of the node itself
    zero_index_paragraphs = document.select('//p[0]')
    assert len(zero_index_paragraphs) == 18
示例#7
0
def test_parent_axis():
    """Check that the ``parent::div`` axis resolves to a ``div`` node."""
    fixture_path = os.path.join(get_test_directory(), 'news-tagged.kdxa')
    # Use a context manager so the fixture's file handle is always closed.
    with open(fixture_path, 'rb') as fixture:
        document = Document.from_msgpack(fixture.read())

    first_paragraph = document.select('(//p)[0]')
    assert len(first_paragraph) == 1

    paragraph_parents = first_paragraph[0].select('parent::div')
    assert len(paragraph_parents) == 1
    assert paragraph_parents[0].node_type == 'div'

    # The same axis is exercised starting from the first link node.
    link = document.select('//a')[0]
    assert link.select('parent::div')[0].node_type == 'div'
示例#8
0
def test_tag_key_value():
    """Run ``ExtractTagsToKeyValuePair`` and print the rows left in ``test_store``."""
    fixture_path = os.path.join(get_test_directory(), 'news-tagged.mdoc')
    # Use a context manager so the fixture's file handle is always closed.
    with open(fixture_path, 'rb') as fixture:
        document = Document.from_msgpack(fixture.read())

    # The step is given the pipeline context; its output is read back from
    # the 'test_store' store on that context, so the return value of
    # process() is not needed here.
    step = ExtractTagsToKeyValuePair(store_name='test_store')
    context = PipelineContext()
    step.process(document, context)
    print(context.get_store('test_store').rows)
示例#9
0
def test_tag_key_value():
    """Check the row count and one known row produced by the tag extractor."""
    fixture_path = os.path.join(get_test_directory(), 'news-tagged.kdxa')
    # Use a context manager so the fixture's file handle is always closed.
    with open(fixture_path, 'rb') as fixture:
        document = Document.from_msgpack(fixture.read())

    step = TagsToKeyValuePairExtractor(store_name='test_store')
    context = PipelineContext()
    step.process(document, context)

    store = context.get_store('test_store')
    assert store.count() == 45

    # Row 14 is expected to hold the tag name followed by the tagged text.
    assert store.rows[14][0] == 'LOC'
    assert store.rows[14][1] == 'Europe'
示例#10
0
def test_selector_complex_doc_1():
    """Check node counts and positional selection of every paragraph."""
    fixture_path = os.path.join(get_test_directory(), 'news.kdxa')
    # Use a context manager so the fixture's file handle is always closed.
    with open(fixture_path, 'rb') as fixture:
        document = Document.from_msgpack(fixture.read())
    root = document.content_node

    all_nodes = root.select('//*')
    assert len(all_nodes) == 39

    all_ps = root.select('//p')
    assert len(all_ps) == 18

    # Selecting by position in the result set must return exactly the
    # paragraph found at that position in the full '//p' result.
    for pos, expected_p in enumerate(all_ps):
        selected_p = root.select(f'(//p)[{pos}]')
        assert len(selected_p) == 1
        assert selected_p[0].uuid == expected_p.uuid
示例#11
0
文件: kodexa.py 项目: fossabot/kodexa
    def get_output_document(self, execution):
        """Fetch the output document of an execution, if it has one.

        Scans ``execution.documentReferences`` for entries whose
        ``referenceType`` is ``'OUTPUT'``; when several exist the last one
        is used. The referenced document is downloaded with an HTTP GET and
        its body is passed to ``Document.from_msgpack``.

        :param execution: object exposing ``id`` and ``documentReferences``
        :return: the document built from the response body, or ``None``
            when no usable ``'OUTPUT'`` reference is found
        """
        output_references = [
            reference for reference in execution.documentReferences
            if reference.referenceType == 'OUTPUT'
        ]
        # Keep the last match, mirroring a scan that overwrites on each hit.
        final_reference = output_references[-1] if output_references else None
        if not final_reference:
            return None

        document_url = f"{self.cloud_url}/api/sessions/{self.cloud_session.id}/executions/{execution.id}/documents/{final_reference.cloudDocument.id}"
        response = requests.get(document_url,
                                headers={"x-access-token": self.access_token})
        return Document.from_msgpack(response.content)