def __init__(self, **kwargs):
    """Initialise the result from optional keyword metadata.

    Every value is read with ``kwargs.get``, so any key may be omitted.
    Keys read: ``file_path``, ``file_name``, ``id``, ``title``, ``summary``,
    ``date``, ``created_at``, ``modified_at``, ``published_at``, ``author``,
    ``generator``, ``keywords``, ``emails``, ``entities``, ``mime_type``,
    ``encoding``, ``languages``, ``headers``, ``checksum`` and ``size``.

    Fallbacks:
        * ``file_name`` defaults to the base name of ``file_path``.
        * ``id`` defaults to ``file_path``.
        * ``keywords``, ``emails`` and ``entities`` default to a new
          empty list when missing or falsy.

    The remaining attributes (status, flags, error message, pages, body
    text/HTML, rows, children, PDF path) always start out empty.
    """
    self.status = None
    self.flags = set()

    # Location and identity of the file.
    self.file_path = decode_path(kwargs.get('file_path'))
    file_name = kwargs.get('file_name')
    if not file_name and self.file_path is not None:
        # Only derive a name when there is a path to derive it from:
        # os.path.basename() raises when handed None.
        file_name = os.path.basename(self.file_path)
    self.file_name = decode_path(file_name)
    self.id = kwargs.get('id') or self.file_path

    # Descriptive metadata supplied by the caller.
    self.title = kwargs.get('title')
    self.summary = kwargs.get('summary')
    self.date = kwargs.get('date')
    self.created_at = kwargs.get('created_at')
    self.modified_at = kwargs.get('modified_at')
    self.published_at = kwargs.get('published_at')
    self.author = kwargs.get('author')
    self.generator = kwargs.get('generator')

    # ``or []`` gives each instance its own list.
    self.keywords = kwargs.get('keywords') or []
    self.emails = kwargs.get('emails') or []
    self.entities = kwargs.get('entities') or []

    # Technical metadata.
    self.mime_type = kwargs.get('mime_type')
    self.encoding = kwargs.get('encoding')
    self.languages = kwargs.get('languages')
    self.headers = kwargs.get('headers')
    self.error_message = None
    self.checksum = kwargs.get('checksum')
    self.size = kwargs.get('size')

    # Content containers; never taken from kwargs.
    self.pages = []
    self.body_text = None
    self.body_html = None
    self.rows = []
    self.children = []
    self.pdf_path = None
def crawldir(path, language=None, country=None, foreign_id=None):
    """Crawl the given directory.

    The path is decoded and must exist; otherwise an error is logged and
    the function returns without doing anything else. The collection is
    looked up by ``foreign_id`` and created (labelled with the base name of
    the path, ``managed`` set) when no match is found.

    :param path: file system location to crawl.
    :param language: when not None, stored as the collection's only
        language.
    :param country: when not None, stored as the collection's only
        country.
    :param foreign_id: collection key; defaults to ``directory:`` followed
        by the slugified absolute path.
    """
    path = decode_path(path)
    path_is_usable = path is not None and os.path.exists(path)
    if not path_is_usable:
        log.error("Invalid path: %r", path)
        return

    # The normalised absolute path is used both as the document's
    # foreign id and as the input to the default collection key.
    path = os.path.abspath(os.path.normpath(path))
    path_name = os.path.basename(path)
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)

    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        settings = {
            'foreign_id': foreign_id,
            'label': path_name,
            'managed': True
        }
        collection = Collection.create(settings)

    if language is not None:
        collection.languages = [language]
    if country is not None:
        collection.countries = [country]
    db.session.commit()
    update_collection(collection)

    log.info('Crawling %r to %r...', path, collection.foreign_id)
    root_document = Document.by_keys(collection=collection,
                                     foreign_id=path)
    root_document.file_name = path_name
    ingest_document(root_document, path)
def create_temp_dir(self, *args, **kwargs):
    """Yield a freshly created temporary directory, then delete it.

    All positional and keyword arguments are forwarded unchanged to
    ``tempfile.mkdtemp``. The decoded path is yielded once; the ``finally``
    clause removes the directory when the generator finishes, including
    when an exception is thrown into it.

    NOTE(review): presumably wrapped as a context manager by a decorator
    that is not visible here -- confirm.
    """
    scratch_dir = tempfile.mkdtemp(*args, **kwargs)
    try:
        yield decode_path(scratch_dir)
    finally:
        # Clean up using the raw path exactly as mkdtemp returned it.
        remove_directory(scratch_dir)
def handle_child(self, parent, file_path, title=None, mime_type=None,
                 id=None, file_name=None):
    """Register a file found inside ``parent`` as a document and ingest it.

    :param parent: result whose ``document`` becomes the parent and whose
        ``user_queue`` is passed on to the ingest call.
    :param file_path: location of the child on disk.
    :param title: title to store; falls back to the document's metadata.
    :param mime_type: MIME type to store; falls back to the metadata.
    :param id: foreign id used to look the document up.
    :param file_name: name to store; falls back to the base name of
        ``file_path`` and then to the document's metadata.
    """
    file_path = decode_path(file_path)
    decoded_name = decode_path(file_name)
    if not decoded_name:
        decoded_name = os.path.basename(file_path)
    file_name = decoded_name

    # Directories are not hashed; only non-directory paths get a checksum.
    content_hash = None if os.path.isdir(file_path) else checksum(file_path)

    lookup = {
        'parent_id': parent.document.id,
        'collection': parent.document.collection,
        'foreign_id': id,
        'content_hash': content_hash,
    }
    document = Document.by_keys(**lookup)

    def _or_meta(value, key):
        # The metadata is consulted only when no explicit value was given.
        return value if value else document.meta.get(key)

    document.title = _or_meta(title, 'title')
    document.file_name = _or_meta(file_name, 'file_name')
    document.mime_type = _or_meta(mime_type, 'mime_type')

    # Imported at call time, as in the original.
    from aleph.ingest import ingest_document
    ingest_document(document, file_path, user_queue=parent.user_queue)
def ingest(self, file_path):
    """Ingestor implementation.

    Flags the result as a directory, then hands every entry of
    ``file_path`` that is not listed in ``SKIP_ENTRIES`` to the manager as
    a child. Returns silently, after flagging, when the decoded path is
    None or is not a directory.
    """
    result = self.result
    result.flag(result.FLAG_DIRECTORY)

    file_path = decode_path(file_path)
    if file_path is None:
        return
    if not os.path.isdir(file_path):
        return

    # Entries are decoded one at a time, as they are consumed.
    decoded_names = (decode_path(raw) for raw in os.listdir(file_path))
    for name in decoded_names:
        if name not in self.SKIP_ENTRIES:
            sub_path = join_path(file_path, name)
            # The child id is the parent's id joined with the entry name.
            child_id = join_path(result.id, name)
            self.manager.handle_child(result, sub_path,
                                      file_name=name, id=child_id)
def handle_child(self, parent, file_path, title=None, mime_type=None,
                 id=None, file_name=None):
    """Register a child of ``parent`` as a document, ingest it, wrap it.

    :param parent: result whose ``document`` supplies the parent id, the
        collection and the uploader id.
    :param file_path: location of the child on disk.
    :param title: title to store; falls back to the document's metadata.
    :param mime_type: MIME type to store; falls back to the metadata.
    :param id: foreign id of the child; must not be None.
    :param file_name: name to store; falls back to the metadata.
    :returns: a ``DocumentResult`` for the child document.
    """
    file_path = decode_path(file_path)
    assert id is not None, (parent, file_path)

    lookup = {
        'parent_id': parent.document.id,
        'collection': parent.document.collection,
        'foreign_id': id,
    }
    document = Document.by_keys(**lookup)

    # Explicit arguments win; the metadata is read only when an argument
    # is missing or falsy.
    if title:
        document.title = title
    else:
        document.title = document.meta.get('title')
    if file_name:
        document.file_name = file_name
    else:
        document.file_name = document.meta.get('file_name')
    if mime_type:
        document.mime_type = mime_type
    else:
        document.mime_type = document.meta.get('mime_type')

    # Imported at call time, as in the original.
    from aleph.logic.documents.ingest import ingest_document
    uploader_id = parent.document.uploader_id
    ingest_document(document, file_path, role_id=uploader_id)
    return DocumentResult(self, document, file_path=file_path)
def ingest(self, file_path):
    """Ingestor implementation.

    Flags the result as a directory and hands every entry of ``file_path``
    that is not listed in ``SKIP_ENTRIES`` to the manager as a child.

    :raises ProcessingException: when ``file_path`` is not a directory;
        the result is not flagged in that case.
    """
    is_directory = os.path.isdir(file_path)
    if not is_directory:
        raise ProcessingException("Not a directory.")

    result = self.result
    result.flag(result.FLAG_DIRECTORY)

    # Entries are decoded one at a time, as they are consumed.
    decoded_names = (decode_path(raw) for raw in os.listdir(file_path))
    for name in decoded_names:
        if name not in self.SKIP_ENTRIES:
            sub_path = join_path(file_path, name)
            # The child id is the parent's id joined with the entry name.
            child_id = join_path(result.id, name)
            self.manager.handle_child(result, sub_path,
                                      file_name=name, id=child_id)
def crawldir(path, language=None, foreign_id=None):
    """Crawl the given directory.

    The path is normalised, made absolute and decoded. If no path was
    given or it does not exist, an error is logged and the function
    returns without creating anything.

    :param path: file system location to crawl; may be None.
    :param language: passed through as the collection's ``languages``
        value.
    :param foreign_id: collection key; defaults to ``directory:`` followed
        by the slugified path.
    """
    # Normalise only when a path was actually supplied: os.path.normpath()
    # raises on None, which previously made the None check below
    # unreachable for that input.
    if path is not None:
        path = decode_path(os.path.abspath(os.path.normpath(path)))
    if path is None or not os.path.exists(path):
        log.error("Invalid path: %r", path)
        return

    path_name = os.path.basename(path)
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)

    collection = create_collection(foreign_id, {
        'label': path_name,
        'languages': language
    })
    log.info('Crawling %s to %s...', path, foreign_id)

    document = Document.by_keys(collection=collection,
                                foreign_id=path)
    document.file_name = path_name
    # The session is committed before the document is handed to ingest.
    db.session.commit()
    ingest_document(document, path)