def ingest(document_id, file_path=None):
    """Process a given document by extracting its contents.

    This may include creating or updating child documents.

    :param document_id: primary key of the ``Document`` row to process.
    :param file_path: optional local path to the raw file; when ``None``
        the file is loaded from the archive by its content hash.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)
    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        # Re-use the manager bound above; the original redundantly
        # called get_manager() a second time here.
        manager.ingest(file_path, result=result, work_path=work_path)
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)
        db.session.commit()
        process_document(document)
    except Exception:
        db.session.rollback()
        # Re-fetch after rollback: the previous instance may be expired
        # or detached from the session.
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)
def handle_child(self, parent, file_path, title=None, mime_type=None,
                 id=None, file_name=None):
    """Create or update a child document discovered while ingesting
    *parent*, then ingest it and return its result wrapper.

    The ``id`` keyword is the child's foreign id within the parent's
    collection and is required.
    """
    file_path = decode_path(file_path)
    assert id is not None, (parent, file_path)
    parent_doc = parent.document
    doc = Document.by_keys(parent_id=parent_doc.id,
                           collection=parent_doc.collection,
                           foreign_id=id)
    # Explicit arguments win; fall back to previously stored metadata.
    doc.title = title or doc.meta.get('title')
    doc.file_name = file_name or doc.meta.get('file_name')
    doc.mime_type = mime_type or doc.meta.get('mime_type')
    # Imported locally to avoid a circular import at module load time.
    from aleph.logic.documents.ingest import ingest_document
    ingest_document(doc, file_path, role_id=parent_doc.uploader_id)
    return DocumentResult(self, doc, file_path=file_path)
def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.

    This may include creating or updating child documents.

    :param document_id: primary key of the ``Document`` row to process.
    :param file_path: optional local path to the raw file; when ``None``
        the file is loaded from the archive by its content hash.
    :param refresh: when ``True``, suppress the new-document event
        notification (used when re-processing an existing document).
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)
    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        # Re-use the manager bound above; the original redundantly
        # called get_manager() a second time here.
        manager.ingest(file_path, result=result, work_path=work_path)
        document.status = Document.STATUS_SUCCESS
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)
        # Only notify on first-time ingest of casefile documents.
        if document.collection.casefile and not refresh:
            params = {
                'collection': document.collection,
                'document': document,
            }
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)
        db.session.commit()
    except Exception:
        db.session.rollback()
        # Re-fetch after rollback: the previous instance may be expired
        # or detached from the session.
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
        document.status = Document.STATUS_FAIL
        db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

    # Post-processing runs regardless of ingest success so the document's
    # (possibly failed) state is reflected in the search index.
    extract_document_tags(document)
    # delete_entity(document.id, exclude=document.schema)
    index_document(document)
    refresh_entity(document)