def crawldir(directory, language=None, country=None, foreign_id=None):
    """Crawl the given directory."""
    if directory is None or not os.path.exists(directory):
        log.error("Invalid directory: %r", directory)
        return
    directory = os.path.abspath(os.path.normpath(directory))
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(directory)
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        collection = Collection.create({
            'foreign_id': foreign_id,
            'label': directory,
            'managed': True
        })
    if language is not None:
        collection.languages = [language]
    if country is not None:
        collection.countries = [country]
    db.session.commit()
    update_collection(collection)
    log.info('Crawling %r to %r...', directory, collection.foreign_id)
    document = Document.by_keys(collection=collection,
                                foreign_id=directory)
    ingest_document(document, directory)

def crawldir(path, language=None, country=None, foreign_id=None):
    """Crawl the given directory."""
    path = decode_path(path)
    if path is None or not os.path.exists(path):
        log.error("Invalid path: %r", path)
        return
    path = os.path.abspath(os.path.normpath(path))
    path_name = os.path.basename(path)
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        collection = Collection.create({
            'foreign_id': foreign_id,
            'label': path_name,
            'managed': True
        })
    if language is not None:
        collection.languages = [language]
    if country is not None:
        collection.countries = [country]
    db.session.commit()
    update_collection(collection)
    log.info('Crawling %r to %r...', path, collection.foreign_id)
    document = Document.by_keys(collection=collection, foreign_id=path)
    document.file_name = path_name
    ingest_document(document, path)

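For context, a minimal sketch of how crawldir might be wired up as a command-line entry point. The click-based wiring, command name, and option help texts are assumptions for illustration only; the snippets here do not show how crawldir is actually invoked.

# Hypothetical CLI wiring for crawldir(); the click usage and option
# names are assumptions, not taken from the snippets above.
import click


@click.command('crawldir')
@click.option('--language', default=None, help='ISO 639 language code')
@click.option('--country', default=None, help='ISO 3166 country code')
@click.option('--foreign_id', default=None, help='Collection foreign_id')
@click.argument('path')
def crawldir_command(path, language, country, foreign_id):
    """Crawl a directory recursively into a collection."""
    crawldir(path, language=language, country=country,
             foreign_id=foreign_id)
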
def ingest_upload(id):
    collection = obj_or_404(Collection.by_id(id))
    require(request.authz.can_write(collection.id))
    try:
        meta = json.loads(request.form.get('meta', '{}'))
    except Exception as ex:
        raise BadRequest(unicode(ex))
    validate_data(meta, DocumentSchema)
    documents = []
    for storage in request.files.values():
        sec_fn = os.path.join(upload_folder,
                              secure_filename(storage.filename))
        storage.save(sec_fn)
        content_hash = checksum(sec_fn)
        document = Document.by_keys(collection=collection,
                                    content_hash=content_hash)
        document.mime_type = storage.mimetype
        document.file_name = storage.filename
        document.update(meta)
        ingest_document(document, sec_fn, role_id=request.authz.id)
        os.unlink(sec_fn)
        documents.append(document)
    return jsonify({
        'status': 'ok',
        'documents': [DocumentSchema().dump(d).data for d in documents]
    })

def ingest_upload(collection_id):
    collection = obj_or_404(Collection.by_id(collection_id))
    request.authz.require(request.authz.collection_write(collection.id))
    log_event(request)
    crawler_run = make_textid()
    # Parse and validate the request metadata once, before any of the
    # uploaded files are processed.
    try:
        meta = json.loads(request.form.get('meta', '{}'))
        validate(meta, 'metadata.json#')
    except Exception as ex:
        raise BadRequest(unicode(ex))
    documents = []
    for storage in request.files.values():
        sec_fn = os.path.join(upload_folder,
                              secure_filename(storage.filename))
        storage.save(sec_fn)
        content_hash = checksum(sec_fn)
        document = Document.by_keys(collection=collection,
                                    content_hash=content_hash)
        document.crawler = 'user_upload:%s' % request.authz.role.id
        document.crawler_run = crawler_run
        document.mime_type = storage.mimetype
        document.file_name = storage.filename
        document.meta.update(meta)
        ingest_document(document, sec_fn, user_queue=True)
        os.unlink(sec_fn)
        documents.append(document)
    return jsonify({'status': 'ok', 'documents': documents})

def test_load_sample_directory(self):
    samples_path = self.get_fixture_path('samples')
    document = Document.by_keys(collection=self.collection,
                                foreign_id='samples')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, samples_path)
    assert Document.all().count() == 5, Document.all().count()

def test_load_pdf_file(self):
    pdf_path = self.get_fixture_path('demo.pdf')
    document = Document.by_keys(collection=self.collection,
                                foreign_id='demo.pdf')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, pdf_path)
    assert Document.all().count() == 1, Document.all().count()

def emit_file(self, document, file_path, move=False):
    if isinstance(document, CrawlerMetadata):
        doc = self.create_document(foreign_id=document.foreign_id,
                                   content_hash=document.content_hash)
        doc.meta.update(document.meta)
        document = doc
    ingest_document(document, file_path)
    self.increment_count()

def ingest_document(self, document, file_path=None, role_id=None,
                    shallow=False):
    """Ingest a database-backed document.

    First retrieve its data and then call the actual ingestor.
    """
    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    content_hash = document.content_hash
    if file_path is None and content_hash is not None:
        file_name = document.safe_file_name
        file_path = self.archive.load_file(content_hash,
                                           file_name=file_name,
                                           temp_path=work_path)
    if file_path is not None and not os.path.exists(file_path):
        # Probably indicative of file system encoding issues.
        log.error("Invalid path [%r]: %s", document, file_path)
        return
    try:
        if not len(document.languages):
            document.languages = document.collection.languages
        if not len(document.countries):
            document.countries = document.collection.countries
        result = DocumentResult(self, document,
                                file_path=file_path,
                                role_id=role_id)
        self.ingest(file_path, result=result)
        if not shallow and file_path is None:
            # When a directory is ingested, the data is not stored. Thus,
            # try to recurse on the database-backed known children.
            from aleph.ingest import ingest_document
            for child in Document.by_parent(document):
                ingest_document(child, None, role_id=role_id)
    finally:
        db.session.rollback()
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

def ingest_upload(id):
    collection = obj_or_404(Collection.by_id(id))
    require(request.authz.can_write(collection.id))
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp()
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename)
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.mime_type = storage.mimetype
            if storage.filename:
                document.file_name = os.path.basename(storage.filename)
            document.update(meta)
            ingest_document(document, path, role_id=request.authz.id)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.update(meta)
            ingest_document(document, upload_dir,
                            role_id=request.authz.id)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.delay(parent_id)

    return jsonify({
        'status': 'ok',
        'documents': [DocumentSchema().dump(d).data for d in documents]
    })

def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            ingest_document(document, path, role_id=request.authz.id)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            ingest_document(document, None,
                            role_id=request.authz.id,
                            shallow=True)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })

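A hedged client-side sketch of calling an upload endpoint like the one above with the requests library. The URL path, auth header, and metadata fields are assumptions; all the handler itself shows is that it reads multipart files from request.files plus a 'meta' form field containing a JSON object.

# Client sketch for the upload endpoint; the URL path, auth header and
# metadata fields are assumed for illustration.
import json
import os

import requests


def upload_document(base_url, collection_id, file_path, api_key):
    meta = {'file_name': os.path.basename(file_path)}  # assumed fields
    with open(file_path, 'rb') as fh:
        res = requests.post(
            '%s/collections/%s/ingest' % (base_url, collection_id),
            headers={'Authorization': 'ApiKey %s' % api_key},
            data={'meta': json.dumps(meta)},
            files={'file': fh})
    res.raise_for_status()
    return res.json().get('documents')
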
def handle_child(self, parent, file_path, title=None, mime_type=None,
                 id=None, file_name=None):
    file_path = decode_path(file_path)
    assert id is not None, (parent, file_path)
    doc = Document.by_keys(parent_id=parent.document.id,
                           collection=parent.document.collection,
                           foreign_id=id)
    doc.title = title or doc.meta.get('title')
    doc.file_name = file_name or doc.meta.get('file_name')
    doc.mime_type = mime_type or doc.meta.get('mime_type')
    from aleph.ingest import ingest_document
    ingest_document(doc, file_path, role_id=parent.role_id)
    return DocumentResult(self, doc,
                          file_path=file_path,
                          role_id=parent.role_id)

def handle_child(self, parent, file_path, title=None, mime_type=None,
                 id=None, file_name=None):
    file_path = decode_path(file_path)
    file_name = decode_path(file_name) or os.path.basename(file_path)
    content_hash = None
    if not os.path.isdir(file_path):
        content_hash = checksum(file_path)
    document = Document.by_keys(parent_id=parent.document.id,
                                collection=parent.document.collection,
                                foreign_id=id,
                                content_hash=content_hash)
    document.title = title or document.meta.get('title')
    document.file_name = file_name or document.meta.get('file_name')
    document.mime_type = mime_type or document.meta.get('mime_type')
    from aleph.ingest import ingest_document
    ingest_document(document, file_path, user_queue=parent.user_queue)

def test_load_csv_file(self):
    csv_path = self.get_fixture_path('experts.csv')
    document = Document.by_keys(collection=self.collection,
                                foreign_id='experts.csv')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, csv_path)
    assert Document.all().count() == 1, Document.all().count()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    rec0 = records[0]
    assert str(rec0.id) in repr(rec0), repr(rec0)
    assert 'nationality' in rec0.data, rec0.data
    assert 'name' in rec0.data, rec0.data
    doc = rec0.document
    doc.delete_records()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 0, len(records)

def ingest_document(self, document, file_path=None, role_id=None,
                    shallow=False):
    """Ingest a database-backed document.

    First retrieve its data and then call the actual ingestor.
    """
    content_hash = document.content_hash
    if file_path is None and content_hash is not None:
        file_path = self.archive.load_file(content_hash,
                                           file_name=document.safe_file_name)  # noqa
    if file_path is not None and not os.path.exists(file_path):
        # Probably indicative of file system encoding issues.
        log.error("Invalid path [%r]: %s", document, file_path)
        return
    try:
        if document.collection is not None:
            if not len(document.languages):
                document.languages = document.collection.languages
            if not len(document.countries):
                document.countries = document.collection.countries
        result = DocumentResult(self, document,
                                file_path=file_path,
                                role_id=role_id)
        self.ingest(file_path, result=result)
        if not shallow and file_path is None:
            # When a directory is ingested, the data is not stored. Thus,
            # try to recurse transparently.
            from aleph.ingest import ingest_document
            for child in Document.by_parent(document):
                ingest_document(child, None, role_id=role_id)
    finally:
        db.session.rollback()
        if content_hash is not None:
            self.archive.cleanup_file(content_hash)
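
Several snippets above defer-import a module-level ingest_document from aleph.ingest whose body is not shown here. A minimal sketch of what such a wrapper could look like, assuming a hypothetical get_manager() factory that resolves the ingest manager whose ingest_document method appears above:

# Sketch of the module-level wrapper the snippets import; get_manager()
# is an assumed factory name, not shown in the snippets above.
def ingest_document(document, file_path, role_id=None, shallow=False):
    manager = get_manager()  # assumed: returns the ingest manager
    manager.ingest_document(document,
                            file_path=file_path,
                            role_id=role_id,
                            shallow=shallow)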