Пример #1
0
def ingest_document(document, file_path, role_id=None, shallow=False):
    """Kick off ingestion of ``file_path`` into the stub ``document``.

    The document is flagged as pending first. If ``is_file`` accepts the
    path, the file is archived, the resulting content hash is stored on the
    document, the database session is committed and the ``ingest`` task is
    queued with the document id. Any other path is passed straight to the
    ingest manager together with ``role_id`` and ``shallow``.

    Metadata such as a file name is not inferred here. Note that ``shallow``
    is only forwarded on the manager path, not to the queued task.
    """
    document.status = Document.STATUS_PENDING

    if is_file(file_path):
        document.content_hash = archive.archive_file(file_path)
        db.session.commit()
        # Documents in a casefile collection are queued with priority 5,
        # everything else with priority 3.
        if document.collection.casefile:
            task_priority = 5
        else:
            task_priority = 3
        ingest.apply_async(args=[document.id],
                           kwargs={'role_id': role_id},
                           priority=task_priority)
        return

    # Not a plain file: let the manager process the path directly.
    get_manager().ingest_document(document,
                                  file_path=file_path,
                                  role_id=role_id,
                                  shallow=shallow)
Пример #2
0
def ingest_document(document, file_path, role_id=None):
    """Kick off ingestion of ``file_path`` into the stub ``document``.

    The document is flagged as pending first. If ``is_file`` accepts the
    path, the file is archived, its content hash is stored on the document,
    the database session is committed and the ``ingest`` task is queued.
    Any other path is passed straight to the ingest manager.

    The task goes to the user queue when a ``role_id`` is supplied and to
    the worker queue otherwise. Metadata such as a file name is not
    inferred here.
    """
    document.status = Document.STATUS_PENDING

    if is_file(file_path):
        document.content_hash = archive.archive_file(file_path)
        db.session.commit()

        # Pick the queue / routing key pair based on whether a role was given.
        if role_id is None:
            target_queue = WORKER_QUEUE
            target_key = WORKER_ROUTING_KEY
        else:
            target_queue = USER_QUEUE
            target_key = USER_ROUTING_KEY

        ingest.apply_async(args=[document.id],
                           kwargs={'role_id': role_id},
                           queue=target_queue,
                           routing_key=target_key)
        return

    # Not a plain file: let the manager process the path directly.
    get_manager().ingest_document(document,
                                  file_path=file_path,
                                  role_id=role_id)
Пример #3
0
    def checksum_file(self, result, file_path):
        """Record the SHA-1 checksum and byte size of ``file_path`` on ``result``.

        Does nothing when ``is_file`` rejects the path. The checksum is only
        computed if ``result.checksum`` is still ``None``; in that case the
        size is taken from the number of bytes read. If the size is still
        unset afterwards, it is read from the file system instead.
        """
        if not is_file(file_path):
            return

        if result.checksum is None:
            digest = hashlib.sha1()
            total_bytes = 0
            with open(file_path, 'rb') as stream:
                # Read in 8 KiB chunks until read() returns an empty result.
                for chunk in iter(lambda: stream.read(8192), b''):
                    total_bytes += len(chunk)
                    digest.update(chunk)

            result.checksum = digest.hexdigest()
            result.size = total_bytes

        if result.size is None:
            result.size = os.path.getsize(file_path)
Пример #4
0
    def auction(self, file_path, result):
        """Choose the ingestor class that should handle ``file_path``.

        Paths rejected by ``is_file`` are assigned ``DirectoryIngestor``
        and its MIME type. For files, the MIME type on ``result`` is
        re-detected via ``self.MAGIC`` when ``useful_mimetype`` rejects the
        current one. Each class in ``self.ingestors`` then scores the file
        through ``match``; the first class with the highest score above zero
        is returned.

        Raises:
            ProcessingException: if no ingestor scores above zero.
        """
        if not is_file(file_path):
            result.mime_type = DirectoryIngestor.MIME_TYPE
            return DirectoryIngestor

        if not useful_mimetype(result.mime_type):
            detected = self.MAGIC.from_file(file_path)
            result.mime_type = normalize_mimetype(detected)

        winner = None
        top_score = 0
        for candidate in self.ingestors:
            # The manager is (re)attached before every match call.
            result.manager = self
            bid = candidate.match(file_path, result=result)
            # Strict comparison: ties keep the earlier candidate.
            if bid > top_score:
                top_score, winner = bid, candidate

        if winner is not None:
            return winner

        raise ProcessingException("Format not supported: %s" %
                                  result.mime_type)