def ingest_document(document, file_path, role_id=None, shallow=False):
    """Start ingesting the content at ``file_path`` into a stub document.

    No metadata (such as a file name) is inferred here. The document is
    flagged as pending and then handled according to ``is_file(file_path)``:

    * a file: it is archived, the resulting content hash is stored on the
      document, the session is committed and the ``ingest`` task is queued
      with the document id and ``role_id``.
    * anything else: the ingest manager processes the path directly,
      receiving ``role_id`` and ``shallow``.

    ``shallow`` is only forwarded to the manager; the queued task does not
    receive it.
    """
    document.status = Document.STATUS_PENDING

    if is_file(file_path):
        document.content_hash = archive.archive_file(file_path)
        db.session.commit()

        # Documents in casefile collections are queued with priority 5,
        # all others with priority 3.
        if document.collection.casefile:
            priority = 5
        else:
            priority = 3

        ingest.apply_async(
            args=[document.id],
            kwargs={'role_id': role_id},
            priority=priority,
        )
        return

    get_manager().ingest_document(
        document,
        file_path=file_path,
        role_id=role_id,
        shallow=shallow,
    )
def ingest_document(document, file_path, role_id=None):
    """Start ingesting the content at ``file_path`` into a stub document.

    No metadata (such as a file name) is inferred here. The document is
    flagged as pending and then handled according to ``is_file(file_path)``:

    * a file: it is archived, the resulting content hash is stored on the
      document, the session is committed and the ``ingest`` task is queued
      with the document id and ``role_id``.
    * anything else: the ingest manager processes the path directly.
    """
    document.status = Document.STATUS_PENDING

    if is_file(file_path):
        document.content_hash = archive.archive_file(file_path)
        db.session.commit()

        # Tasks carrying a role id go to the user queue; tasks without one
        # go to the worker queue.
        if role_id is None:
            queue, routing_key = WORKER_QUEUE, WORKER_ROUTING_KEY
        else:
            queue, routing_key = USER_QUEUE, USER_ROUTING_KEY

        ingest.apply_async(
            args=[document.id],
            kwargs={'role_id': role_id},
            queue=queue,
            routing_key=routing_key,
        )
        return

    get_manager().ingest_document(
        document,
        file_path=file_path,
        role_id=role_id,
    )
def checksum_file(self, result, file_path):
    """Fill in ``result.checksum`` and ``result.size`` for ``file_path``.

    Nothing happens when ``is_file(file_path)`` is false. If the result has
    no checksum yet, the file is read in 8192-byte chunks; the SHA-1 hex
    digest becomes ``result.checksum`` and the number of bytes read becomes
    ``result.size``. If the size is still ``None`` afterwards, it is taken
    from ``os.path.getsize``.
    """
    if not is_file(file_path):
        return

    if result.checksum is None:
        digest = hashlib.sha1()
        total = 0
        with open(file_path, 'rb') as stream:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: stream.read(8192), b''):
                total += len(chunk)
                digest.update(chunk)
        result.checksum = digest.hexdigest()
        result.size = total

    if result.size is None:
        result.size = os.path.getsize(file_path)
def auction(self, file_path, result):
    """Choose the ingestor class that should handle ``file_path``.

    When ``is_file(file_path)`` is false, the result's MIME type is set to
    ``DirectoryIngestor.MIME_TYPE`` and ``DirectoryIngestor`` is returned.

    Otherwise, if ``useful_mimetype`` rejects the result's current MIME
    type, it is replaced by the normalized type detected via ``self.MAGIC``.
    Every class in ``self.ingestors`` is then asked for a match score; the
    first class with the highest score above zero is returned.

    Raises:
        ProcessingException: if no ingestor scores above zero.
    """
    if not is_file(file_path):
        result.mime_type = DirectoryIngestor.MIME_TYPE
        return DirectoryIngestor

    if not useful_mimetype(result.mime_type):
        result.mime_type = normalize_mimetype(self.MAGIC.from_file(file_path))

    winner, top_score = None, 0
    for candidate in self.ingestors:
        # Set before each match call, exactly as the candidates are polled.
        result.manager = self
        candidate_score = candidate.match(file_path, result=result)
        # Strict comparison: an earlier candidate wins ties.
        if candidate_score > top_score:
            winner, top_score = candidate, candidate_score

    if winner is not None:
        return winner
    raise ProcessingException("Format not supported: %s" % result.mime_type)