示例#1
0
 def __init__(self, **kwargs):
     """Initialise the result from optional keyword metadata.

     Recognised keywords are looked up with ``kwargs.get``, so an omitted
     keyword is read as ``None``; ``keywords``, ``emails`` and ``entities``
     become an empty list instead. ``file_name`` falls back to the base
     name of ``file_path`` and ``id`` falls back to ``file_path``.
     """
     option = kwargs.get

     self.status = None
     self.flags = set()

     # Path-derived identity of the result.
     self.file_path = decode_path(option('file_path'))
     self.file_name = decode_path(
         option('file_name') or os.path.basename(self.file_path))
     self.id = option('id') or self.file_path

     # Descriptive metadata, copied over as given.
     for field in ('title', 'summary', 'date', 'created_at', 'modified_at',
                   'published_at', 'author', 'generator'):
         setattr(self, field, option(field))

     # List-valued metadata: a missing or empty value becomes a new list.
     for field in ('keywords', 'emails', 'entities'):
         setattr(self, field, option(field) or [])

     for field in ('mime_type', 'encoding', 'languages', 'headers'):
         setattr(self, field, option(field))

     self.error_message = None
     self.checksum = option('checksum')
     self.size = option('size')

     # Attributes that are never taken from kwargs and always start blank.
     self.pages = []
     self.body_text = None
     self.body_html = None
     self.rows = []
     self.children = []
     self.pdf_path = None
示例#2
0
def crawldir(path, language=None, country=None, foreign_id=None):
    """Ingest the directory at ``path`` into a managed collection.

    The collection is looked up by ``foreign_id`` and created when it does
    not exist yet. Without an explicit ``foreign_id`` one is derived from
    the slugified absolute path. ``language`` and ``country``, when given,
    overwrite the collection's languages and countries with a one-element
    list. An invalid path is logged and the function returns ``None``.
    """
    path = decode_path(path)
    path_is_valid = path is not None and os.path.exists(path)
    if not path_is_valid:
        log.error("Invalid path: %r", path)
        return

    path = os.path.abspath(os.path.normpath(path))
    path_name = os.path.basename(path)
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)

    # Look up the target collection, creating it on first use.
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        attributes = {
            'foreign_id': foreign_id,
            'label': path_name,
            'managed': True
        }
        collection = Collection.create(attributes)

    # Only override the settings the caller actually supplied.
    overrides = (('languages', language), ('countries', country))
    for attribute, value in overrides:
        if value is not None:
            setattr(collection, attribute, [value])
    db.session.commit()
    update_collection(collection)

    log.info('Crawling %r to %r...', path, collection.foreign_id)
    root = Document.by_keys(collection=collection, foreign_id=path)
    root.file_name = path_name
    ingest_document(root, path)
示例#3
0
 def create_temp_dir(self, *args, **kwargs):
     """Yield a new temporary directory and delete it afterwards.

     All arguments are forwarded to ``tempfile.mkdtemp``. The decoded
     directory path is yielded once; the directory is removed when the
     generator finishes or is closed, including when an error is raised
     at the ``yield``.
     """
     scratch_dir = tempfile.mkdtemp(*args, **kwargs)
     try:
         yield decode_path(scratch_dir)
     finally:
         # Clean up with the path exactly as mkdtemp returned it.
         remove_directory(scratch_dir)
示例#4
0
    def handle_child(self, parent, file_path, title=None, mime_type=None,
                     id=None, file_name=None):
        """Create or fetch the document for a child file and ingest it.

        The document is keyed on the parent document, its collection, the
        given ``id`` (as foreign id) and a checksum of the file. ``title``,
        ``file_name`` and ``mime_type`` fall back to the values stored in
        the document's ``meta`` when not supplied.
        """
        file_path = decode_path(file_path)
        file_name = decode_path(file_name)
        if not file_name:
            file_name = os.path.basename(file_path)

        # Directories are not checksummed.
        is_directory = os.path.isdir(file_path)
        content_hash = None if is_directory else checksum(file_path)

        child = Document.by_keys(
            parent_id=parent.document.id,
            collection=parent.document.collection,
            foreign_id=id,
            content_hash=content_hash)

        # Explicit arguments win; otherwise use what ``meta`` already holds.
        if not title:
            title = child.meta.get('title')
        child.title = title
        if not file_name:
            file_name = child.meta.get('file_name')
        child.file_name = file_name
        if not mime_type:
            mime_type = child.meta.get('mime_type')
        child.mime_type = mime_type

        # Imported at call time, as in the original
        # (presumably to avoid a circular import - TODO confirm).
        from aleph.ingest import ingest_document
        ingest_document(child, file_path, user_queue=parent.user_queue)
示例#5
0
    def ingest(self, file_path):
        """Flag the result as a directory and hand each entry to the manager.

        Nothing more happens when ``file_path`` decodes to ``None`` or is
        not a directory. Entries listed in ``SKIP_ENTRIES`` are ignored.
        """
        self.result.flag(self.result.FLAG_DIRECTORY)
        file_path = decode_path(file_path)

        is_directory = file_path is not None and os.path.isdir(file_path)
        if not is_directory:
            return

        entries = (decode_path(raw) for raw in os.listdir(file_path))
        for entry in entries:
            if entry not in self.SKIP_ENTRIES:
                # The child's id extends the parent's id with the entry name.
                self.manager.handle_child(
                    self.result,
                    join_path(file_path, entry),
                    file_name=entry,
                    id=join_path(self.result.id, entry))
示例#6
0
    def handle_child(self, parent, file_path, title=None, mime_type=None,
                     id=None, file_name=None):
        """Create or fetch the child document, ingest it and wrap it.

        ``id`` is required and is used as the document's foreign id within
        the parent's collection. ``title``, ``file_name`` and ``mime_type``
        fall back to the document's ``meta`` when not supplied. Returns a
        ``DocumentResult`` for the child.
        """
        file_path = decode_path(file_path)
        assert id is not None, (parent, file_path)

        child = Document.by_keys(
            parent_id=parent.document.id,
            collection=parent.document.collection,
            foreign_id=id)

        # Explicit arguments win; otherwise use what ``meta`` already holds.
        supplied = (
            ('title', title),
            ('file_name', file_name),
            ('mime_type', mime_type),
        )
        for field, value in supplied:
            setattr(child, field, value or child.meta.get(field))

        # Imported at call time, as in the original
        # (presumably to avoid a circular import - TODO confirm).
        from aleph.logic.documents.ingest import ingest_document
        ingest_document(child, file_path,
                        role_id=parent.document.uploader_id)
        return DocumentResult(self, child, file_path=file_path)
示例#7
0
    def ingest(self, file_path):
        """Hand every entry of the directory ``file_path`` to the manager.

        Raises ``ProcessingException`` when ``file_path`` is not a
        directory. Otherwise the result is flagged as a directory and each
        entry not listed in ``SKIP_ENTRIES`` is passed to
        ``manager.handle_child``.
        """
        if not os.path.isdir(file_path):
            raise ProcessingException("Not a directory.")

        self.result.flag(self.result.FLAG_DIRECTORY)

        for raw_name in os.listdir(file_path):
            entry = decode_path(raw_name)
            if entry not in self.SKIP_ENTRIES:
                # The child's id extends the parent's id with the entry name.
                self.manager.handle_child(
                    self.result,
                    join_path(file_path, entry),
                    file_name=entry,
                    id=join_path(self.result.id, entry))
示例#8
0
def crawldir(path, language=None, foreign_id=None):
    """Crawl the given directory.

    Args:
        path: Directory (or file) to ingest. It is normalised to an
            absolute path before use.
        language: Stored as given under the collection's ``'languages'``
            key. NOTE(review): confirm whether ``create_collection``
            expects a list here.
        foreign_id: Identifier of the target collection. Defaults to
            ``'directory:<slugified absolute path>'``.

    Returns:
        None. A missing or non-existent path is logged as an error and
        nothing is ingested.
    """
    # Guard against None before normalising: os.path.normpath raises
    # TypeError on None, so the "Invalid path" branch below could never be
    # reached for a missing path.
    if path is not None:
        path = decode_path(os.path.abspath(os.path.normpath(path)))
    if path is None or not os.path.exists(path):
        log.error("Invalid path: %r", path)
        return
    path_name = os.path.basename(path)

    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)

    collection = create_collection(foreign_id, {
        'label': path_name,
        'languages': language
    })
    log.info('Crawling %s to %s...', path, foreign_id)
    document = Document.by_keys(collection=collection, foreign_id=path)
    document.file_name = path_name
    # Commit the document before handing it to the ingest step.
    db.session.commit()
    ingest_document(document, path)