def export_entities(export_id):
    """Run a stored entity export: stream all entities matching the export's
    query into a ZIP archive (documents plus an Excel summary sheet), then
    mark the export complete.

    On any failure the export is flagged FAILED; the scratch directory is
    always removed.

    :param export_id: primary key of the ``Export`` row describing the job.
    """
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    # Scratch space for the archive; cleaned up in the finally block.
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    # Per-run memo of collection metadata so each collection is fetched once.
    collections = {}
    try:
        # Fall back to a match-nothing query if the export has none stored.
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("query-export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_path = export_dir.joinpath(EXCEL_FILE)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for entity in iter_proxies(filters=filters):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    # Skip entities whose collection cannot be resolved.
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= Export.MAX_FILE_SIZE:
                    # FIX: logging.Logger.warn is a deprecated alias of
                    # warning(); use the canonical method.
                    log.warning("Export too large: %r", export)
                    break
            # Finalize the spreadsheet and ship it inside the archive.
            exporter.finalize()
            zf.write(excel_path, arcname=EXCEL_FILE)
        complete_export(export_id, file_path)
    except Exception:
        # Boundary handler: record failure state rather than crash the worker.
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
def get_collection(collection_id):
    """Return collection metadata by id, consulting the object cache first.

    On a cache miss the collection is loaded from the search index and the
    result stored back with the standard expiry.
    """
    key = cache.object_key(Collection, collection_id)
    cached = cache.get_complex(key)
    if cached is not None:
        return cached
    # Cache miss: hit the index and remember the answer.
    data = index.get_collection(collection_id)
    cache.set_complex(key, data, expire=cache.EXPIRE)
    return data
def _iter_match_batch(batch, authz):
    """Yield one spreadsheet row per cross-reference match in *batch*.

    Entities for both sides of every match are bulk-loaded first, then each
    match the caller is authorized to read is emitted as a tuple of
    (score %, entity caption/date/country, collection label,
    match caption/date/country, entity URL, match URL).

    :param batch: iterable of match records with ``entity_id``, ``match_id``,
        ``match_collection_id`` and ``score`` attributes.
    :param authz: authorization context used to filter readable collections.
    """
    # Only schemata flagged matchable participate in cross-referencing.
    matchable = [s.name for s in model if s.matchable]
    entities = set()
    # Collect ids from both sides so one bulk lookup covers the whole batch.
    for match in batch:
        entities.add(match.entity_id)
        entities.add(match.match_id)
    entities = entities_by_ids(list(entities), schemata=matchable)
    # Re-key by id; ids are compared as strings below — presumably the index
    # returns string ids (TODO confirm against entities_by_ids).
    entities = {e.get('id'): e for e in entities}
    for obj in batch:
        # Skip matches in collections the user cannot read.
        if not authz.can(obj.match_collection_id, authz.READ):
            continue
        entity = entities.get(str(obj.entity_id))
        match = entities.get(str(obj.match_id))
        collection = get_collection(obj.match_collection_id)
        # Drop rows where either entity or the collection failed to resolve.
        if entity is None or match is None or collection is None:
            continue
        eproxy = model.get_proxy(entity)
        mproxy = model.get_proxy(match)
        yield (
            # Score is a fraction in [0, 1]; render as integer percent.
            int(obj.score * 100),
            eproxy.caption,
            _format_date(eproxy),
            _format_country(eproxy),
            collection.get('label'),
            mproxy.caption,
            _format_date(mproxy),
            _format_country(mproxy),
            entity_url(eproxy.id),
            entity_url(mproxy.id),
        )
def _iter_match_batch(batch, authz):
    """Generate export rows for a batch of cross-reference matches.

    Both sides of every match are fetched in a single bulk lookup; rows in
    collections the caller cannot read, or whose entities cannot be
    resolved, are silently skipped.
    """
    matchable = [schema.name for schema in model if schema.matchable]

    # Gather the ids on both sides of each match for one bulk fetch.
    wanted_ids = set()
    for item in batch:
        wanted_ids.add(item.entity_id)
        wanted_ids.add(item.match_id)

    fetched = entities_by_ids(list(wanted_ids), schemata=matchable)
    by_id = {data.get('id'): data for data in fetched}

    for obj in batch:
        if not authz.can(obj.match_collection_id, authz.READ):
            continue
        entity = by_id.get(str(obj.entity_id))
        match = by_id.get(str(obj.match_id))
        collection = get_collection(obj.match_collection_id)
        if entity is None or match is None or collection is None:
            continue
        eproxy = model.get_proxy(entity)
        mproxy = model.get_proxy(match)
        yield (
            int(obj.score * 100),
            eproxy.caption,
            _format_date(eproxy),
            _format_country(eproxy),
            collection.get('label'),
            mproxy.caption,
            _format_date(mproxy),
            _format_country(mproxy),
            entity_url(eproxy.id),
            entity_url(mproxy.id),
        )
def export_triples(outfile, collection_id=None, entity_id=None):
    """Write RDF triples to *outfile* for a single entity, a single
    collection, or (when neither id is given) every collection.

    *entity_id* takes precedence over *collection_id*.
    """
    if entity_id is not None:
        entity = get_entity(entity_id)
        collection = collection_uri(entity['collection_id'])
        export_entity(outfile, entity, collection)
        return
    if collection_id is not None:
        export_collection(outfile, get_collection(collection_id))
        return
    # No scope given: dump everything.
    export_collections(outfile)
def export_entities(export_id):
    """Run a stored entity export: stream all matching entities into a ZIP
    archive containing their documents plus an Excel summary sheet, bounded
    by both a total-size and a result-count limit.

    When a limit is hit, an ``EXPORT_TOO_LARGE.txt`` note is added to the
    archive and the export is truncated (not failed). On error the export
    is flagged FAILED; the scratch directory is always removed.

    :param export_id: primary key of the ``Export`` row describing the job.
    """
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    # Scratch space for assembling the archive; removed in the finally block.
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    # Per-run memo so each collection's metadata is fetched only once.
    collections = {}
    try:
        # Fall back to a match-nothing query if the export stored none.
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    # Skip entities whose collection cannot be resolved.
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                # Size check runs after each entity is written, so the
                # archive may slightly exceed the configured maximum.
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                if idx >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
            # Finalize the spreadsheet and ship it inside the archive.
            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        complete_export(export_id, file_path, file_name)
    except Exception:
        # Boundary handler: record failure state rather than crash the worker.
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)