Example #1
def test_interesting_pipeline():
    training_documents = LocalDocumentStore()

    # If we have a store, can we determine that we only want to process
    # documents that aren't already in the store?

    def test_step(document):
        return document

    def test_model_store(document, context):
        context.get_store('my-model-store').put_native(
            'cheese.txt', 'so cheesy'.encode('ascii'))
        return document

    training_prep = Pipeline.from_text("hello world", apply_lineage=False)
    training_prep.add_step(test_step)
    training_prep.add_label('training_document')
    training_prep.add_step(DocumentStoreWriter(training_documents))
    training_prep.run()

    assert training_documents.count() == 1

    model_store = LocalModelStore()
    training_pipeline = Pipeline.from_store(training_documents)
    training_pipeline.add_store('my-model-store', model_store)
    training_pipeline.add_step(test_model_store)

    training_pipeline.run()

    assert model_store.get_native('cheese.txt').read().decode(
        'ascii') == 'so cheesy'
Example #2
def get_test_pipeline(filename):
    pipeline = Pipeline(
        FolderConnector(path=str(get_test_directory()),
                        file_filter=filename + '.txt'))
    pipeline.add_step(TextParser())
    context = pipeline.run()

    # Grab the parsed output document from the pipeline context
    document = context.output_document
    return document
Example #3
def test_folder_connector_unpack_wildcard():
    document_sink = LocalDocumentStore()
    pipeline = Pipeline(
        FolderConnector(path=str(get_test_directory()) + 'folder_unpack_test',
                        file_filter='*.*',
                        unpack=True))
    # attach the sink so the unpacked documents are written to it and can be queried below
    pipeline.add_step(DocumentStoreWriter(document_sink))
    pipeline.run()

    # let's make sure we properly unpacked each document and have all ContentNodes
    for document_family in document_sink.query_families():
        doc = document_sink.get_latest_document_in_family(document_family)
        if doc.get_root().get_all_content().find('HSBC') > -1:
            assert len(doc.select("//*")) == 39
        elif doc.get_root().get_all_content().find('flea') > -1:
            assert len(doc.select("//*")) == 6
Example #4
def test_function_step_to_yaml():
    pipeline = Pipeline.from_file('test')

    def do_it(document):
        print("hello")
        return document

    pipeline.add_step(do_it)
    print(pipeline.to_yaml())
Example #5
def test_lines_of_text():
    # first test with all content being placed on root ContentNode
    pipeline = Pipeline.from_file(get_test_directory() + 'multiline_text.txt')
    pipeline.add_step(TextParser)
    context = pipeline.run()

    doc = context.output_document
    assert len(doc.get_root().get_children()) == 0
    assert len(doc.get_root().get_all_content()) > 0

    # next, test with all content being placed on the root's children
    pipeline = Pipeline.from_file(get_test_directory() + 'multiline_text.txt')
    pipeline.add_step(TextParser(lines_as_child_nodes=True))
    context = pipeline.run()

    doc = context.output_document
    assert len(doc.get_root().get_children()) > 0
    assert doc.get_root().get_content() is None
Example #6
def test_kodexa_service():
    document_sink = InMemoryDocumentSink()

    pipeline = Pipeline(FolderConnector(path=str(get_test_directory()), file_filter='*.pdf'))
    pipeline.add_step(KodexaCloudService(slug='kodexa/pdf-parse', attach_source=True))
    pipeline.set_sink(document_sink)
    pipeline.run()

    # Make sure the finders are available
    document = document_sink.get_document(0)

    assert document

    print(document.to_json())
Example #7
def get_test_pipeline(filename):
    document_sink = InMemoryDocumentSink()

    pipeline = Pipeline(
        FolderConnector(path=str(get_test_directory()),
                        file_filter=filename + '.txt'))
    pipeline.add_step(TextParser(decode=True))
    pipeline.set_sink(document_sink)
    pipeline.run()

    # Make sure the finders are available
    document = document_sink.get_document(0)
    registry.add_mixin_to_document("core", document)
    return document
Example #8
def test_spatial_doc_sample_two():
    # This test document and this portion of code are a snippet
    # from a test in the spatial actions tests.  The saved doc
    # and this section are included to ensure NodeTagger is tested.
    page_footer_re = r'Page \d+ of \d+$'
    document = Document.from_kdxa(get_test_directory() + 'before_fail.kdxa')
    pipeline = Pipeline(document)

    pipeline.add_step(
        NodeTagger(selector='//*[typeRegex("line.*")]', content_re=page_footer_re, tag_to_apply='page_footer'))
    pipeline.run()

    doc = pipeline.context.output_document

    assert doc.get_root() is not None
Example #9
def test_predefined_table_store():
    def process(document, context):
        if context.get_store('prediction-data-store'):
            document.get_root().content = 'We have a data store name'
        elif context.get_store_names() and len(context.get_store_names()) > 0:
            document.get_root().content = ' '.join(context.get_store_names())
        else:
            document.get_root().content = 'No stores on context'

        return document

    pipeline = Pipeline.from_text("Hello World")
    pipeline.add_store('prediction-data-store', TableDataStore())
    pipeline.add_step(process)

    context = pipeline.run()

    new_doc = context.output_document
    print(new_doc.content_node.content)

    assert new_doc.content_node.content == 'We have a data store name'
Example #10
def test_table_data_store():
    # Testing with 'include_node_content' set to True.  Should result in 3 columns
    pipeline = Pipeline(
        Document.from_kdxa(
            os.path.join(get_test_directory(), 'tongue_twister.kdxa')))
    pipeline.add_step(
        NodeTagger(selector='//*[contentRegex(".*flue.*")]',
                   tag_to_apply='has_flue',
                   node_only=True,
                   node_tag_uuid='test'))
    pipeline.add_step(
        TagsToKeyValuePairExtractor(store_name='tagged_data',
                                    include_node_content=True))
    context = pipeline.run()

    compare_store(context, 'tagged_data', 'basic_store_tagged_data1.json')

    # Testing with 'include_node_content' set to False.  Should result in 2 columns
    pipeline2 = Pipeline(
        Document.from_kdxa(
            os.path.join(get_test_directory(), 'tongue_twister.kdxa')))
    pipeline2.add_step(
        NodeTagger(selector='//*[contentRegex(".*flue.*")]',
                   tag_to_apply='has_flue',
                   node_only=True))
    pipeline2.add_step(
        TagsToKeyValuePairExtractor(store_name='tagged_data_2',
                                    include_node_content=False))
    context2 = pipeline2.run()

    compare_store(context2, 'tagged_data_2', 'basic_store_tagged_data2.json')
Example #11
def test_to_yaml():
    # Create the pipeline

    pipeline = Pipeline.from_file('examples/USBankSample.pdf')
    pipeline.add_step(
        RemoteStep(ref='kodexa/pdf-parser',
                   options={
                       "layout_analysis_options": {
                           "rollup": "word",
                           "space_multiplier": 1
                       },
                       "analyze_layout": True
                   },
                   attach_source=True))

    col_space_multiplier = 3.0
    page_number_re = ".*Page \d+ of \d+$"

    transactions_header_re = r'^Date\s+Description.*\s+Amount$'
    continued_re = r'^.*\(continued\)$'

    # Extract Other Deposits
    other_deposits_table_tag_name = "Other Deposits"
    other_deposits_re = '^Other Deposits$'
    total_other_deposits_re = r'^Total Other Deposits.*\d{2}$'
    balance_re = '^BALANCE YOUR ACCOUNT$'
    pipeline.add_step(
        RemoteStep(ref='kodexa/pattern-table-tagger',
                   options={
                       "col_space_multiplier": col_space_multiplier,
                       "tag_to_apply": other_deposits_table_tag_name,
                       "page_start_re": other_deposits_re,
                       "page_end_re": total_other_deposits_re,
                       "table_start_re": transactions_header_re,
                       "table_end_re": balance_re,
                       "col_marker_re": transactions_header_re,
                       "extract": True,
                       "extract_options": {
                           'store_name': other_deposits_table_tag_name,
                           'header_lines_count': 1,
                           'first_col_has_text': True
                       }
                   }))

    # Extract Card Withdrawals
    card_withdrawals_table_tag_name = "Card Withdrawals"
    card_withdrawals_re = '^Card Withdrawals$'
    subtotal_card_withdrawals_re = r'^Card \d{4} Withdrawals Subtotal.*\d{2}.$'
    total_card_withdrawals_re = r'^Total Card Withdrawals.*\d{2}.$'
    pipeline.add_step(
        RemoteStep(ref='kodexa/pattern-table-tagger',
                   options={
                       "col_space_multiplier": col_space_multiplier,
                       "tag_to_apply": card_withdrawals_table_tag_name,
                       "page_start_re": card_withdrawals_re,
                       "page_end_re": total_card_withdrawals_re,
                       "table_start_re": transactions_header_re,
                       "table_end_re": subtotal_card_withdrawals_re,
                       "col_marker_re": transactions_header_re,
                       "extract": True,
                       "extract_options": {
                           'store_name': card_withdrawals_table_tag_name,
                           'header_lines_count': 1,
                           'first_col_has_text': True
                       }
                   }))

    # Extract Other Withdrawals
    other_withdrawals_table_tag_name = "Other Withdrawals"
    other_withdrawals_re = '^Other Withdrawals$'
    total_other_withdrawals_re = r'^Total Other Withdrawals.*\d{2}.$'
    pipeline.add_step(
        RemoteStep(ref='kodexa/pattern-table-tagger',
                   options={
                       "col_space_multiplier": col_space_multiplier,
                       "tag_to_apply": other_withdrawals_table_tag_name,
                       "page_start_re": other_withdrawals_re,
                       "page_end_re": total_other_withdrawals_re,
                       "table_start_re": transactions_header_re,
                       "table_end_re": '',
                       "col_marker_re": transactions_header_re,
                       "extract": True,
                       "extract_options": {
                           'store_name': other_withdrawals_table_tag_name,
                           'header_lines_count': 1,
                           'first_col_has_text': True
                       }
                   }))

    # Extract Checks
    checks_table_tag_name = "Checks"
    check_transactions_re = '^Check Date .* Ref Number Amount$'
    checks_re = '^Checks Presented Conventionally$'
    checks_paid_re = r'.*Conventional Checks Paid.*\d{2}.$'

    pipeline.add_step(
        RemoteStep(ref='kodexa/pattern-table-tagger',
                   options={
                       "col_space_multiplier": col_space_multiplier,
                       "tag_to_apply": checks_table_tag_name,
                       "page_start_re": checks_re,
                       "page_end_re": checks_paid_re,
                       "table_start_re": check_transactions_re,
                       "table_end_re": '',
                       "col_marker_re": check_transactions_re,
                       "extract": True,
                       "extract_options": {
                           'store_name': checks_table_tag_name,
                           'header_lines_count': 1,
                           'first_col_has_text': True,
                           'tables_in_page_count': 2
                       }
                   }))

    print(pipeline.to_yaml())
Example #12
def test_rollup_of_pdf():
    # first test - collapsing words and lines up to their common parent
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    # how many pre-rollup lines?
    assert len(test_doc.select('//line')) == 3824
    # how many pre-rollup words?
    assert len(test_doc.select('//word')) == 52903
    # how many pre-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the pre-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(collapse_type_res=["word", "line"],
                          separator_character=' '))
    rollup_pipeline.run()

    collapsed_doc = rollup_pipeline.context.output_document

    # how many post-rollup lines?
    assert len(test_doc.select('//line')) == 0
    # how many post-rollup words?
    assert len(test_doc.select('//word')) == 0
    # how many post-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    assert len(
        collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # second test - just collapse the line up to its parent (content-area) - roll up the line's children
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(collapse_type_res=["line"],
                          separator_character=' ',
                          get_all_content=True))
    rollup_pipeline.run()

    collapsed_doc = rollup_pipeline.context.output_document

    # how many post-rollup lines?
    assert len(test_doc.select('//line')) == 0
    # how many post-rollup words?
    assert len(test_doc.select('//word')) == 0
    # how many post-rollup content-areas?
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    # verify that we can collapse line nodes AND include their children
    assert len(
        collapsed_doc.select("//content-area")[12].get_all_content()) == 235

    # third test - select specific nodes in which we'll do the roll ups
    test_doc = Document.from_kdxa(get_test_directory() + '20200709.kdxa')

    node_selector = "//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]"

    # verify we have 3 nodes match this selector
    node_matches = test_doc.select(node_selector)
    assert len(node_matches) == 3

    # before we rollup, let's make sure the matching nodes conform to known expectations
    assert len(node_matches[0].select('//word')) == 2
    assert len(node_matches[0].select('//line')) == 1
    assert len(node_matches[0].select('//content-area')) == 1
    assert len(node_matches[0].get_all_content()) == 14

    assert len(node_matches[1].select('//word')) == 2
    assert len(node_matches[1].select('//line')) == 1
    assert len(node_matches[1].select('//content-area')) == 1
    assert len(node_matches[1].get_all_content()) == 14

    assert len(node_matches[2].select('//word')) == 71
    assert len(node_matches[2].select('//line')) == 6
    assert len(node_matches[2].select('//content-area')) == 1
    assert len(node_matches[2].get_all_content()) == 500

    rollup_pipeline = Pipeline(test_doc)
    rollup_pipeline.add_step(
        RollupTransformer(
            selector="//content-area[contentRegex('.*LOAN AGREEMENT.*', true)]",
            collapse_type_res=["line"],
            separator_character=' ',
            get_all_content=True))
    rollup_pipeline.run()

    collapsed_doc = rollup_pipeline.context.output_document

    # check those matching nodes - we shouldn't have any words or lines, but
    # all other node_types should exist and the content should stay the same.
    assert len(node_matches[0].select('//word')) == 0
    assert len(node_matches[0].select('//line')) == 0
    assert len(node_matches[0].select('//content-area')) == 1
    assert len(node_matches[0].get_all_content()) == 14

    assert len(node_matches[1].select('//word')) == 0
    assert len(node_matches[1].select('//line')) == 0
    assert len(node_matches[1].select('//content-area')) == 1
    assert len(node_matches[1].get_all_content()) == 14

    assert len(node_matches[2].select('//word')) == 0
    assert len(node_matches[2].select('//line')) == 0
    assert len(node_matches[2].select('//content-area')) == 1
    assert len(node_matches[2].get_all_content()) == 500

    # how many post-rollup lines? (still have some lines, but fewer than we started with)
    assert len(test_doc.select('//line')) == 3816
    # how many post-rollup words? (still have some words, but fewer than we started with)
    assert len(test_doc.select('//word')) == 52828
    # how many post-rollup content-areas? (same number of content-areas)
    assert len(test_doc.select('//content-area')) == 817
    # what is the post-rollup length of ALL the content in the document?
    assert len(test_doc.get_root().get_all_content()) == 329792

    # verify that we can collapse line nodes AND include their children
    assert len(
        collapsed_doc.select("//content-area")[12].get_all_content()) == 235
Example #13
def test_tag_multiple_regex_matches():
    doc_string = "Mary had a little lamb, little lamb, little lamb.  Mary had a little lamb whose fleece was white as snow."

    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False))
    context = pipeline.run()

    tags = context.output_document.get_root().get_all_tags()
    assert len(tags) == 1

    # we expect 4 tag feature values, one for each instance of the word 'little'
    feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(feature_values) == list and len(feature_values) == 4
    assert feature_values[2]['start'] == 37
    assert feature_values[2]['end'] == 43

    # Because we didn't pass a node_tag_uuid to the NodeTagger, each of the feature values should have a different UUID
    features_uuids = list(set(dic['uuid'] for dic in feature_values))
    assert len(features_uuids) == 4

    # Run the multiple tag test again, but this time pass in a node_tag_uuid
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False,
                                 node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # Now each of the feature values should have the same UUID
    feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    features_uuids = list(set(dic['uuid'] for dic in feature_values))
    assert len(features_uuids) == 1

    # Now test that tagging the entire node, rather than references within the node, only produces 1 feature
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*', node_only=True))
    context = pipeline.run()

    tags = context.output_document.get_root().get_all_tags()
    assert len(tags) == 1

    # we expect one tag to be applied and there to be no start or end value
    feature_values = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert feature_values['start'] is None and feature_values['end'] is None
Example #14
def test_tag_copy():
    doc_string = "Mary had a little lamb, little lamb, little lamb.  Mary had a little lamb whose fleece was white as snow."
    # data setup - creating a single tag with multiple matches...and then copying it
    document = Document.from_text(doc_string)
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE', content_re=r'(little)', node_only=False))
    context = pipeline.run()

    # both existing and new tag names must be provided, and they must be different; test for that first.
    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name=None, new_tag_name='NewTagNone')

    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name=None)

    for n in document.select('//*[hasTag("SIZE")]'):
        n.copy_tag(existing_tag_name='SIZE', new_tag_name='SIZE')

    # verify that the only tag that exists is tag 'SIZE' and that there are only 4 feature values for it
    assert len(document.get_root().get_all_tags()) == 1
    assert 'SIZE' in document.get_root().get_all_tags()

    # now, let's copy the SIZE tags and create new ones called LAMB_INFO
    # reusing the previously tagged document and testing out NodeTagCopy action
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagCopy(selector='//*[hasTag("SIZE")]', existing_tag_name='SIZE', new_tag_name='LAMB_INFO'))
    context = pipeline.run()

    # we should now have 4 feature values for 'LAMB_INFO' and 4 feature values for 'SIZE' - all with different UUIDs
    size_feature_values = context.output_document.get_root().get_feature_values('tag', 'SIZE')
    assert type(size_feature_values) == list and len(size_feature_values) == 4

    lamb_info_feature_values = context.output_document.get_root().get_feature_values('tag', 'LAMB_INFO')
    assert type(lamb_info_feature_values) == list and len(lamb_info_feature_values) == 4
    lamb_info_features_uuids = set(dic['uuid'] for dic in lamb_info_feature_values)
    assert len(list(lamb_info_features_uuids)) == 4

    # Now test that tagging the entire node, rather than references within the node, only produces 1 feature
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(NodeTagger(selector='//*', tag_to_apply='SIZE_2', content_re=r'.*(little).*', node_only=True))
    context = pipeline.run()

    # now, let's copy the SIZE_2 tags and create new ones called LAMB_INFO_2 (using the node's copy_tag)
    for n in document.select('//*[hasTag("SIZE_2")]'):
        n.copy_tag(existing_tag_name='SIZE_2', new_tag_name='LAMB_INFO_2')

    # we should now have 1 feature value for 'LAMB_INFO_2' and 1 feature value for 'SIZE_2'
    size_2_feature_values = context.output_document.get_root().get_feature_value('tag', 'SIZE_2')
    assert type(size_2_feature_values) != list
    lamb_info_2_feature_values = context.output_document.get_root().get_feature_value('tag', 'LAMB_INFO_2')
    assert type(lamb_info_2_feature_values) != list

    # now we need to test that when features are related (indicated by the same tag_uuid), they remain related when copying
    document = Document.from_text(doc_string)  # starting with a clean document
    pipeline = Pipeline(document)
    pipeline.add_step(
        NodeTagger(selector='//*', tag_to_apply='FLEECE_INFO', content_re=r'((white|snow))', node_only=False,
                   node_tag_uuid=str(uuid.uuid4())))
    context = pipeline.run()

    # now, let's copy the FLEECE_INFO tags and create new ones called WOOL_INFO
    pipeline = Pipeline(document)  # reusing the previously tagged document & testing out the NodeTagCopy action
    pipeline.add_step(
        NodeTagCopy(selector='//*[hasTag("FLEECE_INFO")]', existing_tag_name='FLEECE_INFO', new_tag_name='WOOL_INFO'))
    context = pipeline.run()

    # The feature values should have the same UUID - for both WOOL_INFO and FLEECE_INFO
    wool_values = context.output_document.get_root().get_feature_values('tag', 'WOOL_INFO')
    assert type(wool_values) == list and len(wool_values) == 2
    wool_uuids = set(dic['uuid'] for dic in wool_values)
    assert len(list(wool_uuids)) == 1

    fleece_info_values = context.output_document.get_root().get_feature_values('tag', 'FLEECE_INFO')
    assert type(fleece_info_values) == list and len(fleece_info_values) == 2