def put(self, path: str, document: Document, force_replace: bool = False) -> DocumentFamily:
    """Store a document as a new family at ``path`` unless one already exists.

    Args:
        path (str): The path of the document family.
        document (Document): The document to store.
        force_replace (bool): Accepted for callers, but never read by this
            implementation.

    Returns:
        The document family found at ``path`` once the call has finished.

    Raises:
        Exception: if no family can be looked up at ``path`` afterwards.
    """
    already_stored = self.get_family_by_path(path) is not None
    if not already_stored:
        # A family is only created when the path is still free.
        family = DocumentFamily(path=path)
        event = self.add_document(family, document)
        kdxa_path = os.path.join(self.store_path, event.content_object.id) + ".kdxa"
        document.to_kdxa(kdxa_path)

        self.metastore.append(family)
        self.write_metastore()

        # Let the listeners know about the newly added content
        self.notify_listeners(event)

    stored_family = self.get_family_by_path(path)
    if stored_family is None:
        raise Exception("Unable to get document family?")
    return stored_family
def __next__(self):
    """Return the document for the next entry of ``self.files``.

    When ``self.unpack`` is truthy the file is loaded with
    ``Document.from_kdxa``; otherwise a new Document is created whose
    metadata and source fields describe the file.

    Raises:
        StopIteration: when ``self.index`` is past the last file.
    """
    if self.index >= len(self.files):
        raise StopIteration

    # Advance first; the file served by this call is the one just passed.
    self.index += 1
    current_file = self.files[self.index - 1]

    if self.unpack:
        return Document.from_kdxa(current_file)

    metadata = DocumentMetadata({
        "source_path": current_file,
        "connector": self.get_name(),
        "mime_type": mimetypes.guess_type(current_file),
        "connector_options": {
            "path": self.path,
            "file_filter": self.file_filter
        }
    })
    result = Document(metadata)

    source = result.source
    source.original_filename = os.path.basename(current_file)
    source.original_path = self.path
    source.connector = self.get_name()

    # TODO we need to get the checksum and last_updated and created times
    return result
def get_test_document():
    """Build a fixture document: a 'foo' root with content "cheese" and one 'bar' child."""
    doc = Document(DocumentMetadata())

    root = doc.create_node(type='foo')
    root.content = "cheese"
    doc.content_node = root

    parent = doc.content_node
    child = doc.create_node(type='bar', content='fishstick')
    parent.add_child(child)
    return doc
def test_doc_from_text():
    """Check Document.from_text with and without a separator."""
    sentence = 'It is going to be a great day'

    # Without a separator the whole text sits on the root node.
    whole_root = Document.from_text(sentence).get_root()
    assert whole_root.content == sentence
    assert len(whole_root.get_children()) == 0

    # With a separator the root has no content and one child per word.
    split_root = Document.from_text(sentence, separator=' ').get_root()
    assert split_root.content is None
    words = split_root.get_children()
    assert len(words) == 8
    assert words[4].content == 'be'
def create_document():
    """Build a fixture document: a 'foo' root ("cheese") with a 'bar' child ("fishstick")."""
    doc = Document(DocumentMetadata())

    root = doc.create_node(type='foo')
    root.content = "cheese"
    doc.content_node = root

    child = doc.create_node(type='bar')
    child.content = "fishstick"
    doc.content_node.add_child(child)

    return doc
def create_document():
    """Build a fixture document named "test.doc": a 'foo' root with one 'bar' child."""
    doc = Document(DocumentMetadata())
    doc.source.original_filename = "test.doc"

    root = doc.create_node(node_type='foo')
    root.content = "cheese"
    doc.content_node = root

    child = doc.create_node(node_type='bar')
    child.content = "fishstick"
    doc.content_node.add_child(child)

    return doc
def replace_content_object(self, document_family: DocumentFamily, content_object_id: str,
                           document: Document) -> Optional[DocumentFamily]:
    """Overwrite the stored document behind one content object of a family.

    Args:
        document_family: The family whose content objects are searched.
        content_object_id: The id of the content object to replace.
        document: The document written in its place.

    Returns:
        The family when a content object with that id was found, else None.
    """
    target = next(
        (candidate for candidate in document_family.content_objects
         if candidate.id == content_object_id),
        None,
    )
    if target is None:
        return None

    kdxa_path = os.path.join(self.store_path, content_object_id) + ".kdxa"
    document.to_kdxa(kdxa_path)

    # Mirror the document's labels and classes onto the content object.
    target.labels = document.labels
    target.classes = document.classes
    self.write_metastore()
    return document_family
def process(self, document: Document):
    """Add ``self.label`` to the document, or remove it when ``self.remove`` is set.

    Args:
        document: The document to update.

    Returns:
        The same document that was passed in.
    """
    change_label = document.remove_label if self.remove else document.add_label
    change_label(self.label)
    return document
def put(self, path: str, document: Document) -> DocumentFamily:
    """Upload a document to this store at ``path`` through the platform API.

    Args:
        path: The path to store the document under.
        document: The document to upload.

    Returns:
        The DocumentFamily parsed from the response body.

    Raises:
        Exception: when the response status is not 200.
        JSONDecodeError: re-raised after logging when the response body
            cannot be decoded.
    """
    from kodexa import KodexaPlatform

    try:
        logger.info(f"Putting document to path {path}")

        upload = {"file": document.to_kddb()}
        form = {
            "path": path,
            "documentVersion": document.version,
            "document": True
        }
        endpoint = f"{KodexaPlatform.get_url()}/api/stores/{self.ref.replace(':', '/')}/fs"
        response = requests.post(
            endpoint,
            params={"path": path},
            headers={"x-access-token": KodexaPlatform.get_access_token()},
            files=upload,
            data=form)

        if response.status_code != 200:
            msg = f"Document family create failed [{response.text}], response {response.status_code}"
            logger.warning(msg)
            raise Exception(msg)

        return DocumentFamily.parse_obj(response.json())
    except JSONDecodeError:
        logger.warning("Unable to decode the JSON response")
        raise
def replace_content_object(self, document_family: DocumentFamily, content_object_id: str,
                           document: Document) -> DocumentFamily:
    """Replace the content of one content object through the platform API.

    Args:
        document_family: The family that owns the content object.
        content_object_id: The id of the content object to replace.
        document: The document uploaded as the new content.

    Returns:
        The DocumentFamily parsed from the response body.

    Raises:
        Exception: when the response status is not 200.
        JSONDecodeError: re-raised after logging when the response body
            cannot be decoded.
    """
    from kodexa import KodexaPlatform

    try:
        logger.info(
            f"Replacing document in family {document_family.id} content object {content_object_id}"
        )

        upload = {"file": document.to_kddb()}
        endpoint = f"{KodexaPlatform.get_url()}/api/stores/{self.ref.replace(':', '/')}/families/{document_family.id}/objects/{content_object_id}/content"
        response = requests.put(
            endpoint,
            headers={"x-access-token": KodexaPlatform.get_access_token()},
            files=upload)

        if response.status_code != 200:
            msg = f"Document replace failed [{response.text}], response {response.status_code}"
            logger.warning(msg)
            raise Exception(msg)

        return DocumentFamily.parse_obj(response.json())
    except JSONDecodeError:
        logger.warning("Unable to decode the JSON response")
        raise
def add_related_document_to_family(self, document_family_id: str, transition: DocumentTransition,
                                   document: Document) -> ContentObject:
    """Upload a document into an existing family through the platform API.

    Args:
        document_family_id: The id of the family to add the document to.
        transition: Supplies the transition type and the source content
            object id sent with the upload.
        document: The document to upload.

    Returns:
        The ContentObject parsed from the response body.

    Raises:
        Exception: when the response status is not 200.
        JSONDecodeError: re-raised after logging when the response body
            cannot be decoded.
    """
    from kodexa import KodexaPlatform

    try:
        logger.info(f"Putting document to family id {document_family_id}")

        form = {
            'transitionType': transition.transition_type.value,
            'documentVersion': document.version,
            'document': True,
            'sourceContentObjectId': transition.source_content_object_id
        }
        upload = {"file": document.to_kddb()}
        endpoint = f"{KodexaPlatform.get_url()}/api/stores/{self.ref.replace(':', '/')}/families/{document_family_id}/objects"
        response = requests.post(
            endpoint,
            headers={"x-access-token": KodexaPlatform.get_access_token()},
            data=form,
            files=upload)

        if response.status_code != 200:
            msg = f"Document family create failed [{response.text}], response {response.status_code}"
            logger.warning(msg)
            raise Exception(msg)

        return ContentObject.parse_obj(response.json())
    except JSONDecodeError:
        logger.warning("Unable to decode the JSON response")
        raise
def get_document_by_content_object(self, document_family: DocumentFamily,
                                   content_object: ContentObject) -> Optional[Document]:
    """Fetch the document stored for a content object through the platform client.

    Args:
        document_family: The family that owns the content object.
        content_object: The content object whose content is fetched.

    Returns:
        The document built from the response content, or None when the
        client returns None.
    """
    from kodexa import KodexaPlatform

    client = KodexaPlatform.get_client()
    response = client.get(
        f"api/stores/{self.ref.replace(':', '/')}/families/{document_family.id}/objects/{content_object.id}/content"
    )
    if response is None:
        return None
    return Document.from_kddb(response.content)
def test_kbbd():
    """Check that content and tag features survive a to_kddb / from_kddb round trip."""
    sentence = 'It is going to be a great day'
    original = Document.from_text(sentence)

    for tag_name, span in (('cheese', [1, 2]), ('foo', [3, 4])):
        original.content_node.tag(tag_name, fixed_position=span)

    restored = original.from_kddb(original.to_kddb())

    assert restored.content_node.get_all_content() == sentence
    assert len(restored.content_node.get_features()) == 2
def load(self, document_id: str):
    """Load the document with the given ID from ``<store_path>/<document_id>.json``.

    Args:
        document_id: The ID of the document to load.

    Returns:
        The document built from the file's JSON text.
    """
    json_path = os.path.join(self.store_path, document_id + '.json')
    with open(json_path, encoding='utf8') as handle:
        serialized = handle.read()
    return Document.from_json(serialized)
def put_native(self, path: str, content: Any, force_replace=False):
    """Store native content at ``path`` together with a document that references it.

    The family at ``path`` is looked up (and created if missing), a NATIVE
    content object is appended to it and its bytes are written to the
    store, then a new document pointing at that content object is added to
    the same family and saved as a ``.kdxa`` file.

    Args:
        path (str): The path to the native file.
        content (Any): The content to store; written to a file opened in
            binary mode.
        force_replace (bool): Accepted for callers, but never read by this
            implementation.
    """
    family = self.get_family_by_path(path)
    if family is None:
        family = DocumentFamily(path=path)
        self.metastore.append(family)

    # Content object describing the raw native payload
    native_object = ContentObject(**{'contentType': 'NATIVE'})
    native_object.id = str(uuid.uuid4()).replace("-", "")
    native_object.created_on = datetime.now()

    if family.content_objects is None:
        family.content_objects = []
    family.content_objects.append(native_object)

    native_path = os.path.join(self.store_path, native_object.id)
    with open(native_path, 'wb') as native_file:
        native_file.write(content)

    # Document whose source headers point back at the native content object
    reference_document = Document()
    reference_document.source.connector = "document-store"
    reference_document.source.headers = {
        "ref": family.store_ref,
        "family": family.id,
        "id": native_object.id
    }

    event = self.add_document(family, reference_document)
    kdxa_path = os.path.join(self.store_path, event.content_object.id) + ".kdxa"
    reference_document.to_kdxa(kdxa_path)
def load_kdxa(self, path: str):
    """Read a document from a KDXA file and put it in this store under its own uuid.

    Args:
        path: The path of the KDXA file to read.
    """
    loaded = Document.from_kdxa(path)
    self.put(loaded.uuid, loaded)
def add_related_document_to_family(self, document_family_id: str, transition: DocumentTransition,
                                   document: Document):
    """Add a document to an existing family in the local store.

    The metastore is re-read, and for every family whose id matches the
    document is added with the given transition, written to the store as a
    ``.kdxa`` file, and the metastore is saved.

    Args:
        document_family_id: The id of the family to add the document to.
        transition: The transition passed on to ``add_document``.
        document: The document to add.

    Returns:
        The content object created for the document, or None when no
        family with the given id is found. Earlier versions returned
        nothing, so callers that ignore the result are unaffected.
    """
    self.read_metastore()

    added_content_object = None
    for family in self.metastore:
        if family.id != document_family_id:
            continue

        new_event = self.add_document(family, document, transition)
        kdxa_path = os.path.join(self.store_path, new_event.content_object.id) + ".kdxa"
        document.to_kdxa(kdxa_path)
        self.write_metastore()

        # Hand the new content object back, as the remote store's method
        # of the same name does.
        added_content_object = new_event.content_object

    return added_content_object
def test_url_pipeline():
    """Run a URL document through a parse-and-store pipeline and check the statistics."""
    url = "http://www.google.com"
    source_document = Document.from_url(url)
    store = LocalDocumentStore()

    pipeline = Pipeline(source_document)
    pipeline = pipeline.add_step(TextParser(encoding='ISO-8859-1'))
    pipeline = pipeline.add_step(DocumentStoreWriter(store))
    stats = pipeline.run().statistics

    assert stats.documents_processed == 1
    assert stats.document_exceptions == 0
    assert store.count() == 1

    stored_document = store.get_latest_document(url)
    print(stored_document.content_node.get_all_content())
def __next__(self):
    """Return the single document describing ``self.file``, then stop.

    Returns:
        A Document whose metadata records the source path, the connector
        name, the guessed mime type and the connector options.

    Raises:
        StopIteration: on every call after the document has been returned.
    """
    if self.completed:
        raise StopIteration

    # Mark the connector as exhausted before handing out the document.
    # Without this the guard above never fires and iteration never ends.
    self.completed = True

    return Document(
        DocumentMetadata({
            "source_path": self.file,
            "connector": self.get_name(),
            # mimetypes.guess_type returns a (type, encoding) tuple; the
            # tuple is stored as-is.
            "mime_type": mimetypes.guess_type(self.file),
            "connector_options": {
                "file": self.file
            }
        }))
def __next__(self):
    """Return the single document for ``self.url``, then stop.

    Raises:
        StopIteration: on every call after the first.
    """
    if self.completed:
        raise StopIteration

    self.completed = True
    connector_name = self.get_name()
    options = {"url": self.url, "headers": self.headers}
    metadata = DocumentMetadata({
        "connector": connector_name,
        "connector_options": options
    })
    return Document(metadata)
def from_file(file_path: str, *args, **kwargs) -> Pipeline:
    """Create a new pipeline that uses a file as its source.

    Args:
        file_path: The path to the file.
        *args: Extra positional arguments passed on to ``Pipeline``.
        **kwargs: Extra keyword arguments passed on to ``Pipeline``.

    Returns:
        Pipeline: A new pipeline.
    """
    source_document = Document.from_file(file_path)
    return Pipeline(source_document, *args, **kwargs)
def from_text(text: str, *args, **kwargs) -> Pipeline:
    """Create a new pipeline whose source document is built from text.

    Args:
        text: The text used to create the document.
        *args: Extra positional arguments passed on to ``Pipeline``.
        **kwargs: Extra keyword arguments passed on to ``Pipeline``.

    Returns:
        Pipeline: A new pipeline.
    """
    source_document = Document.from_text(text)
    return Pipeline(source_document, *args, **kwargs)
def from_url(url, headers=None, *args, **kwargs):
    """Create a new pipeline whose source document is created from a URL.

    Args:
        url: The URL, e.g. https://www.google.com
        headers: A dictionary of headers (Default value = None)
        *args: Extra positional arguments passed on to ``Pipeline``.
        **kwargs: Extra keyword arguments passed on to ``Pipeline``.

    Returns:
        A new instance of a pipeline.
    """
    source_document = Document.from_url(url, headers)
    return Pipeline(source_document, *args, **kwargs)
def get_document_by_content_object(self, document_family: DocumentFamily,
                                   content_object: ContentObject) -> Document:
    """Load the document stored for a content object from the local store.

    Args:
        document_family (DocumentFamily): The document family (not read here).
        content_object (ContentObject): The content object whose id names
            the ``.kdxa`` file.

    Returns:
        The document loaded from ``<store_path>/<content_object.id>.kdxa``.
    """
    kdxa_path = os.path.join(self.store_path, content_object.id) + ".kdxa"
    return Document.from_kdxa(kdxa_path)
def get_by_path(self, path: str) -> Optional[Document]:
    """Return the latest document of the first family stored at ``path``.

    Args:
        path: The family path to look for.

    Returns:
        The document loaded from the family's latest content object, or
        None when no family in the metastore has that path.
    """
    for family in self.metastore:
        if family.path != path:
            continue
        latest_id = family.get_latest_content().id
        kdxa_path = os.path.join(self.store_path, latest_id) + ".kdxa"
        return Document.from_kdxa(kdxa_path)
    return None
def get_test_document_with_three_children():
    """Build a fixture document: a 'foo' root ("cheese") with three 'bar' children."""
    doc = Document(DocumentMetadata())

    root = doc.create_node(type='foo')
    root.content = "cheese"
    doc.content_node = root

    # Children are added in this order.
    for child_text in ('fishstick', 'cheeseburger', 'beans'):
        doc.content_node.add_child(
            doc.create_node(type='bar', content=child_text))

    return doc
def __next__(self):
    """Return the single document for ``self.url``, then stop.

    The document's metadata and its source both record the connector name,
    the URL and the headers.

    Raises:
        StopIteration: on every call after the first.
    """
    if self.completed:
        raise StopIteration
    self.completed = True

    metadata = DocumentMetadata({
        "connector": self.get_name(),
        "connector_options": {
            "url": self.url,
            "headers": self.headers
        }
    })
    result = Document(metadata)

    result.source.connector = self.get_name()
    result.source.original_path = self.url
    result.source.headers = self.headers
    return result
def get_by_uuid(self, uuid: str) -> Optional[Document]:
    """Return the document of the first content object whose id equals ``uuid``.

    Args:
        uuid: The content object id to look for.

    Returns:
        The document loaded from ``<store_path>/<id>.kdxa``, or None when
        no content object in any family has that id.
    """
    matching_objects = (
        content_object
        for family in self.metastore
        for content_object in family.content_objects
        if content_object.id == uuid
    )
    found = next(matching_objects, None)
    if found is None:
        return None

    kdxa_path = os.path.join(self.store_path, found.id) + ".kdxa"
    return Document.from_kdxa(kdxa_path)
def test_virtual_navigation_with_no_0_index():
    """Check navigation across the gap left when the only child is added at index 2.

    The nodes reached at index 0 and by one ``next_node()`` step are
    expected to have no content; the second step reaches the real child.
    """
    document = Document(DocumentMetadata())
    document.add_mixin('core')

    node = document.create_node(type='loopy')
    node.content = "banana"
    document.content_node = node

    document.content_node.add_child(
        document.create_node(type='loopy', content='banana2'), index=2)

    assert document.content_node.get_node_at_index(0).content is None
    assert document.content_node.get_node_at_index(
        0).next_node().content is None
    # Compare by value: `is` against a string literal tests identity, which
    # depends on interning and raises a SyntaxWarning on Python 3.8+.
    assert document.content_node.get_node_at_index(
        0).next_node().next_node().content == 'banana2'
def __next__(self):
    """Return a new document describing the next entry of ``self.files``.

    Raises:
        StopIteration: when ``self.index`` is past the last file.
    """
    if self.index >= len(self.files):
        raise StopIteration

    # Advance first; the file served by this call is the one just passed.
    self.index += 1
    current_file = self.files[self.index - 1]

    metadata = DocumentMetadata({
        "source_path": current_file,
        "connector": self.get_name(),
        "mime_type": mimetypes.guess_type(current_file),
        "connector_options": {
            "path": self.path,
            "file_filter": self.file_filter
        }
    })
    return Document(metadata)