def reingest_collection(collection, job_id=None, index=False, flush=True):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    if flush:
        ingest_flush(collection)
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)

def reingest_collection(collection, job_id=None, index=False):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    # Drop previously ingested and analysed fragments from the aggregator
    # before the documents are queued for processing again.
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=OP_ANALYZE)
    aggregator.delete(origin=OP_INGEST)
    aggregator.close()
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)

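# Usage sketch, not part of the original module: kick off a re-ingest for one
# collection. The aleph.model import path and the Collection.by_foreign_id
# lookup are assumptions made for illustration only.
def _example_reingest(foreign_id):
    from aleph.model import Collection

    collection = Collection.by_foreign_id(foreign_id)
    if collection is not None:
        # A fresh random job id is generated; index=True is passed through
        # to ingest_entity above.
        reingest_collection(collection, index=True)
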
def generate_collection_docs(collection):
    """Yield indexable entries for all documents in the collection."""
    q = Document.by_collection(collection.id)
    q = q.order_by(Document.id.asc())
    for idx, document in enumerate(q.yield_per(BULK_PAGE)):
        try:
            log.info("Index [%s]: %s", document.id, document.name)
            yield from generate_document(document)
        except Exception:
            log.exception("Cannot index [%s]: %s", document.id, document.name)
        if idx % 1000 == 0:
            # Periodically detach loaded objects so the SQLAlchemy session
            # does not grow without bound on large collections.
            db.session.expunge_all()

def aggregate_model(collection, aggregator):
    """Sync up the aggregator from the Aleph domain model."""
    log.debug("[%s] Aggregating model...", collection)
    aggregator.delete(origin=MODEL_ORIGIN)
    writer = aggregator.bulk()
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        writer.put(proxy, fragment="db", origin=MODEL_ORIGIN)
    for entity in Entity.by_collection(collection.id):
        proxy = entity.to_proxy()
        aggregator.delete(entity_id=proxy.id)
        writer.put(proxy, fragment="db", origin=MODEL_ORIGIN)
    writer.flush()

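# Usage sketch, not part of the original module: rebuild a collection's
# aggregator from the database model. get_aggregator and aggregator.close()
# are assumed to behave as in reingest_collection above.
def _example_aggregate(collection):
    aggregator = get_aggregator(collection)
    try:
        aggregate_model(collection, aggregator)
    finally:
        aggregator.close()
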
def _export_balkhash_collection(collection, retries=0, backoff=30, offset=0):
    """Write the documents and records of a collection to a balkhash
    dataset, retrying with exponential backoff on database errors."""
    MAX_RETRIES = 5
    RETRY_BACKOFF_FACTOR = 2
    try:
        from followthemoney import model

        dataset = get_dataset(collection.foreign_id)
        writer = dataset.bulk()
        q = Document.by_collection(collection.id)
        q = q.order_by(Document.id.asc()).offset(offset)
        for doc in q.yield_per(5000):
            log.debug("Export [%s:%s]: %s", doc.id, doc.schema, doc.name)
            dproxy = doc.to_proxy()
            writer.put(dproxy)
            if doc.supports_records:
                # Emit each document record as its own proxy, plus a fragment
                # on the parent document carrying the record's text.
                rq = db.session.query(DocumentRecord)
                rq = rq.filter(DocumentRecord.document_id == doc.id)
                for record in rq.yield_per(100):
                    rproxy = record.to_proxy()
                    writer.put(rproxy)
                    dpart = model.make_entity(doc.schema)
                    dpart.id = dproxy.id
                    dpart.add('indexText', list(record.texts))
                    writer.put(dpart, fragment=str(record.id))
            offset += 1
        dataset.close()
    except DBAPIError as exc:
        if retries < MAX_RETRIES:
            log.debug("Error occurred: %s", exc)
            log.debug("Retrying in %s seconds", backoff)
            db.session.close()
            dataset.close()
            time.sleep(backoff)
            retries = retries + 1
            backoff = backoff * RETRY_BACKOFF_FACTOR
            # Resume from the current offset rather than starting over.
            return _export_balkhash_collection(collection, retries, backoff, offset)
        else:
            log.exception(exc)

def _proxies(collection):
    """Generate proxies for all entities and documents in the collection."""
    for entity in Entity.by_collection(collection.id).yield_per(5000):
        yield entity.to_proxy()
    for document in Document.by_collection(collection.id).yield_per(5000):
        yield document.to_proxy()

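# Usage sketch, not part of the original module: stream every proxy of a
# collection and tally them by schema. Only _proxies comes from the code
# above; the Counter logic and the proxy.schema.name attribute (from
# followthemoney's EntityProxy) are illustrative assumptions.
def _example_schema_counts(collection):
    from collections import Counter

    counts = Counter()
    for proxy in _proxies(collection):
        counts[proxy.schema.name] += 1
    return counts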