def test_pipeline_example():
    """Run a pipeline with no steps over a store holding a single document.

    The run statistics are expected to report exactly one processed document.
    """
    source_store = LocalDocumentStore()
    source_store.put("test.doc", create_document())

    run_context = Pipeline(source_store).run()

    assert run_context.statistics.documents_processed == 1
def test_url_pipeline():
    """Push a URL-backed document through a text parser into a new store.

    Checks that one document is processed without exceptions and that the
    target store ends up holding exactly one document, which is then looked
    up again by the same URL.
    """
    url_document = Document.from_url("http://www.google.com")
    target_store = LocalDocumentStore()

    # Build the pipeline step by step, keeping whatever add_step() returns,
    # so the behavior matches a single fluent chain.
    pipeline = Pipeline(url_document)
    pipeline = pipeline.add_step(TextParser(encoding='ISO-8859-1'))
    pipeline = pipeline.add_step(DocumentStoreWriter(target_store))
    stats = pipeline.run().statistics

    assert stats.documents_processed == 1
    assert stats.document_exceptions == 0
    assert target_store.count() == 1

    # The content is only printed for inspection; nothing is asserted on it.
    stored_document = target_store.get_latest_document("http://www.google.com")
    print(stored_document.content_node.get_all_content())
def test_fluent_pipeline():
    """Chain plain-function steps and a store writer in one fluent expression.

    The same single-argument function is added twice, followed by a
    DocumentStoreWriter; the written document must carry the metadata value
    the function sets.
    """

    def my_function(doc):
        # Step under test: tag the document and hand it back.
        doc.metadata.cheese = "fishstick"
        logging.error("Hello")
        return doc

    source_document = create_document()
    target_store = LocalDocumentStore()

    run_context = (
        Pipeline(source_document)
        .add_step(my_function)
        .add_step(my_function)
        .add_step(DocumentStoreWriter(target_store))
        .run()
    )
    stats = run_context.statistics

    assert stats.documents_processed == 1
    assert stats.document_exceptions == 0
    assert target_store.count() == 1

    latest = target_store.get_latest_document("test.doc")
    assert latest.metadata.cheese == 'fishstick'
def test_basic_local_document_store():
    """Check that a document put into a path-backed store survives a reopen.

    A store is force-initialized at a given path and receives one document;
    a second store opened on the same path must then report a count of one.

    The store path lives inside a per-run temporary directory rather than a
    fixed "/tmp/..." location, so the test does not depend on a POSIX-only
    path, cannot collide with a concurrent run, and leaves nothing behind.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Same leaf name as before; only the parent directory changed.
        json_store = os.path.join(scratch_dir, "test-json-store.jsonkey")

        document_store = LocalDocumentStore(store_path=json_store, force_initialize=True)
        document_store.put("test.doc", create_document())

        # Reopen without force_initialize: the existing content must be read.
        new_document_store = LocalDocumentStore(store_path=json_store)
        assert new_document_store.count() == 1
def test_table_stores_with_extractor():
    """Check that a step can add a row to a named table store via the context.

    A TableDataStore is registered on the pipeline under the name 'output';
    the extractor step adds one row to it for the single document, so the
    store reached through the returned context must hold exactly one row.
    """
    source_store = LocalDocumentStore()
    source_store.put("test.doc", create_document())

    pipeline = Pipeline(source_store, stop_on_exception=False)
    pipeline.add_store('output', TableDataStore(columns=['cheese']))

    def extractor(document, context):
        # An example of how a step might extract data into a store:
        # look the store up by name on the context and add a row to it.
        output_store = context.get_store('output')
        output_store.add(['test'])
        return document

    pipeline.add_step(extractor)
    run_context = pipeline.run()

    assert run_context.get_store('output').count() == 1
def test_function_step_with_context():
    """Check that a two-argument function step receives the pipeline context.

    The step copies the context's execution id into the document metadata;
    after the run, the document written to the target store must carry the
    same execution id as the pipeline's context.
    """
    source_store = LocalDocumentStore()
    source_store.put("test.doc", create_document())
    target_store = LocalDocumentStore()

    def my_function(doc, context):
        # Record which execution handled this document.
        doc.metadata.cheese = context.execution_id
        logging.error("Hello")
        return doc

    # The target store starts out empty.
    assert target_store.count() == 0

    pipeline = Pipeline(source_store)
    pipeline.add_step(my_function)
    pipeline.add_step(DocumentStoreWriter(target_store))
    run_context = pipeline.run()
    stats = run_context.statistics

    assert stats.documents_processed == 1
    assert stats.document_exceptions == 0
    assert target_store.count() == 1

    latest = target_store.get_latest_document("test.doc")
    assert latest.metadata.cheese == pipeline.context.execution_id
def test_function_step_with_exception():
    """Check how a raising step is reported when stop_on_exception is False.

    The step always raises. The run is still expected to count the document
    as processed, report one document exception, write the document to the
    target store, and attach exactly one exception entry to it.
    """
    source_store = LocalDocumentStore()
    source_store.put("test.doc", create_document())
    target_store = LocalDocumentStore()

    def my_function(doc):
        # Mutate first, then fail, so the step raises part-way through.
        doc.metadata.cheese = "fishstick"
        raise Exception("hello world")

    # The target store starts out empty.
    assert target_store.count() == 0

    pipeline = Pipeline(source_store, stop_on_exception=False)
    pipeline.add_step(my_function)
    pipeline.add_step(DocumentStoreWriter(target_store))
    run_context = pipeline.run()
    stats = run_context.statistics

    assert stats.documents_processed == 1
    assert stats.document_exceptions == 1
    assert target_store.count() == 1

    latest = target_store.get_latest_document("test.doc")
    assert len(latest.exceptions) == 1
def test_class_step_step_with_context():
    """Check that a class-based step is called with the document and context.

    The step object exposes get_name() and process(doc, context); process()
    stores the context's execution id in the document metadata. The document
    written to the target store must carry the pipeline context's id.
    """
    source_store = LocalDocumentStore()
    source_store.put('test.doc', create_document())
    target_store = LocalDocumentStore()

    class MyProcessingStep:
        """Minimal step object used as the pipeline step under test."""

        def get_name(self):
            return "test-step"

        def process(self, doc, context):
            # Record which execution handled this document.
            doc.metadata.cheese = context.execution_id
            logging.error("Hello")
            return doc

    pipeline = Pipeline(source_store)
    pipeline.add_step(MyProcessingStep())
    pipeline.add_step(DocumentStoreWriter(target_store))
    run_context = pipeline.run()

    stats = run_context.statistics
    assert stats.documents_processed == 1
    assert stats.document_exceptions == 0

    latest = target_store.get_latest_document("test.doc")
    assert latest.metadata.cheese == pipeline.context.execution_id