def __next__(self):
    """Return the document for the next entry of ``self.files``.

    Each successful call advances ``self.index`` by one. When
    ``self.unpack`` is truthy the current file is passed to
    ``Document.from_kdxa`` and that result is returned directly. Otherwise
    a new ``Document`` is created whose metadata and ``source`` fields
    describe the file and this connector.

    Raises:
        StopIteration: when ``self.index`` has moved past the last entry
            of ``self.files``.
    """
    if self.index >= len(self.files):
        raise StopIteration

    self.index += 1
    current_file = self.files[self.index - 1]

    if self.unpack:
        return Document.from_kdxa(current_file)

    source_path = current_file
    connector_name = self.get_name()
    # guess_type returns a (type, encoding) tuple; it is stored unchanged.
    mime_type = mimetypes.guess_type(current_file)
    connector_options = {
        "path": self.path,
        "file_filter": self.file_filter,
    }
    document = Document(
        DocumentMetadata({
            "source_path": source_path,
            "connector": connector_name,
            "mime_type": mime_type,
            "connector_options": connector_options,
        }))

    document.source.original_filename = os.path.basename(current_file)
    document.source.original_path = self.path
    document.source.connector = self.get_name()
    # TODO we need to get the checksum and last_updated and created times
    return document
def get_test_document():
    """Build a small document fixture with one parent and one child node.

    The content node has type ``foo`` and content ``"cheese"``; its single
    child has type ``bar`` and content ``"fishstick"``.
    """
    doc = Document(DocumentMetadata())

    root = doc.create_node(type='foo')
    root.content = "cheese"
    doc.content_node = root

    child = doc.create_node(type='bar', content='fishstick')
    doc.content_node.add_child(child)
    return doc
def create_document():
    """Create a two-node document for use in tests.

    Returns a ``Document`` whose content node is a ``foo`` node containing
    ``"cheese"``, with a single ``bar`` child containing ``"fishstick"``.
    """
    doc = Document(DocumentMetadata())

    parent = doc.create_node(type='foo')
    parent.content = "cheese"
    doc.content_node = parent

    child = doc.create_node(type='bar')
    child.content = "fishstick"
    doc.content_node.add_child(child)

    return doc
def create_document():
    """Create a two-node document with a source filename for use in tests.

    The document's ``source.original_filename`` is set to ``"test.doc"``.
    Its content node is a ``foo`` node containing ``"cheese"``, with a single
    ``bar`` child containing ``"fishstick"``.
    """
    doc = Document(DocumentMetadata())
    doc.source.original_filename = "test.doc"

    parent = doc.create_node(node_type='foo')
    parent.content = "cheese"
    doc.content_node = parent

    child = doc.create_node(node_type='bar')
    child.content = "fishstick"
    doc.content_node.add_child(child)

    return doc
def __next__(self):
    """Return the single document describing ``self.file``, then stop.

    The first call builds a ``Document`` whose metadata records the file
    path, the connector name, the guessed MIME information and the
    connector options. Every later call raises ``StopIteration``.

    Raises:
        StopIteration: when ``self.completed`` is already set.
    """
    if self.completed:
        raise StopIteration

    # Mark the connector as exhausted before handing out the document.
    # Without this the flag is never set here and iteration would not end.
    self.completed = True
    return Document(
        DocumentMetadata({
            "source_path": self.file,
            "connector": self.get_name(),
            # guess_type returns a (type, encoding) tuple; stored unchanged.
            "mime_type": mimetypes.guess_type(self.file),
            "connector_options": {
                "file": self.file
            }
        }))
def __next__(self):
    """Return one document describing this connector's URL, then stop.

    The first call sets ``self.completed`` and returns a ``Document`` whose
    metadata carries the connector name plus the ``url`` and ``headers``
    options. Later calls raise ``StopIteration``.

    Raises:
        StopIteration: when ``self.completed`` is already set.
    """
    if self.completed:
        raise StopIteration

    self.completed = True
    connector_name = self.get_name()
    connector_options = {
        "url": self.url,
        "headers": self.headers,
    }
    metadata = DocumentMetadata({
        "connector": connector_name,
        "connector_options": connector_options,
    })
    return Document(metadata)
def get_test_document_with_three_children():
    """Build a document fixture whose content node has three children.

    The content node has type ``foo`` and content ``"cheese"``. Three ``bar``
    children are appended in order with contents ``"fishstick"``,
    ``"cheeseburger"`` and ``"beans"``.
    """
    doc = Document(DocumentMetadata())

    root = doc.create_node(type='foo')
    root.content = "cheese"
    doc.content_node = root

    for child_content in ('fishstick', 'cheeseburger', 'beans'):
        doc.content_node.add_child(
            doc.create_node(type='bar', content=child_content))

    return doc
def test_virtual_navigation_with_no_0_index():
    """Check navigation when the only child is added at index 2.

    A single child with content ``'banana2'`` is added at ``index=2``. The
    nodes reached at index 0 and via one ``next_node()`` step are expected
    to have no content, and the second ``next_node()`` step is expected to
    reach the real child.
    """
    document = Document(DocumentMetadata())
    document.add_mixin('core')

    node = document.create_node(type='loopy')
    node.content = "banana"
    document.content_node = node
    document.content_node.add_child(
        document.create_node(type='loopy', content='banana2'), index=2)

    assert document.content_node.get_node_at_index(0).content is None
    assert document.content_node.get_node_at_index(
        0).next_node().content is None
    # Compare by value: identity ("is") against a string literal depends on
    # interning and is not a reliable equality check.
    assert document.content_node.get_node_at_index(
        0).next_node().next_node().content == 'banana2'
def __next__(self):
    """Return one document describing this connector's URL, then stop.

    The first call sets ``self.completed``, builds a ``Document`` whose
    metadata carries the connector name plus the ``url`` and ``headers``
    options, and copies the connector name, URL and headers onto
    ``document.source``. Later calls raise ``StopIteration``.

    Raises:
        StopIteration: when ``self.completed`` is already set.
    """
    if self.completed:
        raise StopIteration
    self.completed = True

    connector_name = self.get_name()
    connector_options = {
        "url": self.url,
        "headers": self.headers,
    }
    metadata = DocumentMetadata({
        "connector": connector_name,
        "connector_options": connector_options,
    })
    result = Document(metadata)

    result.source.connector = self.get_name()
    result.source.original_path = self.url
    result.source.headers = self.headers
    return result
def __next__(self):
    """Return a document for the next entry of ``self.files``.

    Each successful call advances ``self.index`` by one and returns a
    ``Document`` whose metadata records the file path, the connector name,
    the guessed MIME information and the connector options.

    Raises:
        StopIteration: when ``self.index`` has moved past the last entry
            of ``self.files``.
    """
    if self.index >= len(self.files):
        raise StopIteration

    self.index += 1
    current_file = self.files[self.index - 1]

    source_path = current_file
    connector_name = self.get_name()
    # guess_type returns a (type, encoding) tuple; it is stored unchanged.
    mime_type = mimetypes.guess_type(current_file)
    connector_options = {
        "path": self.path,
        "file_filter": self.file_filter,
    }
    return Document(
        DocumentMetadata({
            "source_path": source_path,
            "connector": connector_name,
            "mime_type": mime_type,
            "connector_options": connector_options,
        }))
def __next__(self):
    """Return the single document describing ``self.file``, then stop.

    The first call builds a ``Document`` whose metadata records the file
    path, the connector name, the guessed MIME information and the
    connector options, and fills in ``document.source``. Every later call
    raises ``StopIteration``.

    Raises:
        StopIteration: when ``self.completed`` is already set.
    """
    if self.completed:
        raise StopIteration

    # Mark the connector as exhausted before handing out the document.
    # Without this the flag is never set here and iteration would not end.
    self.completed = True

    document = Document(
        DocumentMetadata({
            "source_path": self.file,
            "connector": self.get_name(),
            # guess_type returns a (type, encoding) tuple; stored unchanged.
            "mime_type": mimetypes.guess_type(self.file),
            "connector_options": {
                "file": self.file
            }
        }))

    # NOTE(review): the filename receives the full ``self.file`` value and
    # the path receives its basename, which looks swapped. Confirm against
    # consumers of ``document.source`` before changing.
    document.source.original_filename = self.file
    document.source.original_path = os.path.basename(self.file)
    document.source.connector = self.get_name()
    # TODO we need to get the checksum and last_updated and created times
    return document
def test_url_pipeline():
    """Run a one-step pipeline over a URL-sourced document into a JSON store.

    The document's metadata names the ``url`` connector and points at
    ``http://www.google.com``. A ``TextParser`` step is added, the result is
    written to a ``JsonDocumentStore`` at ``/tmp/test-json-store``, and the
    run statistics and store contents are checked.
    """
    metadata = DocumentMetadata({
        "connector": "url",
        "connector_options": {
            "url": "http://www.google.com",
        },
    })
    document = Document(metadata)

    new_document_store = JsonDocumentStore("/tmp/test-json-store",
                                           force_initialize=True)

    pipeline = Pipeline(document)
    pipeline_with_step = pipeline.add_step(TextParser(encoding='ISO-8859-1'))
    pipeline_with_sink = pipeline_with_step.set_sink(new_document_store)
    stats = pipeline_with_sink.run().statistics

    assert stats.documents_processed == 1
    assert stats.document_exceptions == 0
    assert new_document_store.count() == 1

    stored_doc = new_document_store.get_document(0)
    stored_doc.add_mixin('core')
    print(stored_doc.content_node.get_all_content())
def test_virtual_navigation_with_no_0_index():
    """Check navigation when the only child is added at index 2.

    A single child with content ``'banana2'`` is added at ``index=2``. The
    nodes reached at index 0 and via one ``next_node()`` step are expected
    to have no content, and the second ``next_node()`` step is expected to
    reach the real child. The same expectations are then checked on a copy
    produced by ``to_kddb`` followed by ``Document.from_kddb``.
    """

    def check_navigation(doc):
        # Same three assertions, in the same order, for either document.
        assert doc.content_node.get_node_at_index(0).content is None
        assert doc.content_node.get_node_at_index(
            0).next_node().content is None
        assert doc.content_node.get_node_at_index(
            0).next_node().next_node().content == 'banana2'

    document = Document(DocumentMetadata())
    root = document.create_node(node_type='loopy')
    root.content = "banana"
    document.content_node = root

    child = document.create_node(node_type='loopy', content='banana2')
    document.content_node.add_child(child, index=2)

    check_navigation(document)

    test_kddb = document.to_kddb()
    new_kddb = Document.from_kddb(test_kddb)

    check_navigation(new_kddb)