def load_fixtures(self, file_name):
    """Load a fixture file into the database, then index every document.

    Commits the session before indexing so indexers see persisted rows.
    """
    fixture_path = self.get_fixture_path(file_name)
    load_fixtures(db, loaders.load(fixture_path))
    db.session.commit()
    # Re-index everything the fixtures created:
    for doc in Document.all():
        index_document(doc)
    self.update_index()
def process_document(document):
    """Perform post-ingest tasks like analysis and indexing."""
    analyze_document(document)
    # Index the document itself, then its extracted records.
    for indexer in (index_document, index_records):
        indexer(document)
    collection = document.collection
    if collection.casefile:
        # Casefile collections are kept fresh in the index immediately.
        index_collection(collection)
def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents.

    :param document_id: primary key of the Document to process.
    :param file_path: optional local path to the source data; when None,
        the file is fetched from the archive into a temporary work path.
    :param refresh: when True, skip publishing the ingest notification
        (used for re-processing existing documents).
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)
    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        # FIX: the original called get_manager() a second time here,
        # discarding the `manager` it had just obtained for the result.
        manager.ingest(file_path, result=result, work_path=work_path)
        document.status = Document.STATUS_SUCCESS
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)

        if document.collection.casefile and not refresh:
            # Notify the uploader's followers about the new document.
            params = {'collection': document.collection,
                      'document': document}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

        db.session.commit()
    except Exception:
        # Roll back and re-fetch the document so the failure status is
        # written against a clean session state.
        db.session.rollback()
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
        document.status = Document.STATUS_FAIL
        db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

    # Post-processing runs regardless of ingest success, so failed
    # documents are still searchable with their failure status.
    extract_document_tags(document)
    # delete_entity(document.id, exclude=document.schema)
    index_document(document)
    refresh_entity(document)
def ingest_document(document, file_path, role_id=None):
    """Given a stub document and file path, extract information.
    This does not attempt to infer metadata such as a file name."""
    document.status = Document.STATUS_PENDING
    if role_id is not None:
        document.uploader_id = role_id

    # Directories cannot be archived first and then processed
    # later. So they are effectively sent into a short-cut here
    if file_path is not None and os.path.isdir(file_path):
        db.session.commit()
        return ingest(document.id, file_path=file_path)

    if file_path is not None:
        document.content_hash = archive.archive_file(file_path)

    db.session.commit()
    index_document(document)
    # Casefile uploads jump the queue ahead of bulk processing.
    priority = 5 if document.collection.casefile else 3
    ingest.apply_async(args=[document.id], priority=priority)
def update_document(document):
    """Queue analysis and re-index a document.

    These are operations that should be executed after each
    write to a document or its metadata.
    """
    analyze_document_id.apply_async([document.id],
                                    queue=USER_QUEUE,
                                    routing_key=USER_ROUTING_KEY)
    index.index_document(document)
def process_document(document):
    """Perform post-ingest tasks like analysis and indexing."""
    analyze_document(document)
    # Index the document, then its extracted records, in that order.
    for step in (index.index_document, index.index_records):
        step(document)
def update_document(document):
    """Kick off async processing and re-index a document.

    These are operations that should be executed after each
    write to a document or its metadata.
    """
    process_document_id.delay(document.id)
    index.index_document(document)
def process_document(document):
    """Perform post-ingest tasks like analysis and indexing."""
    # Run the pipeline stages in order: tag extraction, entity
    # refresh, then indexing.
    for stage in (extract_document_tags, refresh_entity, index_document):
        stage(document)
def update_document(document):
    """Re-index a document after it has been written."""
    # These are operations that should be executed after each
    # write to a document or its metadata.
    index.index_document(document)
def update_document(document, shallow=False, sync=False):
    """Index a document and refresh its collection.

    These are operations that should be executed after each
    write to a document or its metadata. Returns the indexed
    document data.
    """
    indexed = index.index_document(document, shallow=shallow, sync=sync)
    # Keep collection-level metadata (counts, etc.) in step with the index.
    refresh_collection(document.collection, sync=sync)
    return indexed
def index_document_id(document_id):
    """Index the document with the given id, if it still exists."""
    document = Document.by_id(document_id)
    if document is not None:
        index.index_document(document, shallow=False, sync=False)
    else:
        # The document may have been deleted since the task was queued.
        log.info("Could not find document: %r", document_id)
def update_document(document, shallow=False, sync=False):
    """Refresh and re-index a document.

    These are operations that should be executed after each
    write to a document or its metadata.
    """
    refresh_entity(document, sync=sync)
    result = index.index_document(document, shallow=shallow, sync=sync)
    return result
def process_document(document):
    """Perform post-ingest tasks like analysis and indexing."""
    for task in (extract_document_tags, index_document):
        task(document)
    # Invalidate cached statistics for the parent collection.
    flush_collection_stats(document.collection_id)