def full_etl(settings):
    """Rebuild the bug-dependency destination index from the source bug index.

    Ensures the destination index exists, scans all bugs with dependson/blocked
    links in 10k-id bands (newest bands first), runs each band to a fix point,
    then re-processes anything modified in the last 7 days.

    NOTE(review): relies on project helpers (FromES, Cluster, qb, Timer,
    to_fix_point, MAX, coalesce) whose exact semantics are not visible here.
    """
    # Round-trip SCHEMA through JSON so nested keys are expanded to leaves.
    schema = convert.json2value(convert.value2json(SCHEMA), leaves=True)
    Cluster(settings.destination).get_or_create_index(settings=settings.destination, schema=schema, limit_replicas=True)
    destq = FromES(settings.destination)

    if settings.incremental:
        # Incremental run: start a little below the highest bug_id already in
        # the destination (the query aggregates max(bug_id), despite the
        # variable name "min_bug_id" — it becomes the scan's lower bound).
        min_bug_id = destq.query({
            "from": coalesce(settings.destination.alias, settings.destination.index),
            "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
        })
        # Back off 1000 ids to re-cover recent writes; clamp at zero.
        # NOTE(review): assumes project MAX() accepts two scalar args — confirm.
        min_bug_id = int(MAX(min_bug_id-1000, 0))
    else:
        # Full run: scan everything from bug 0.
        min_bug_id = 0

    sourceq = FromES(settings.source)
    # Upper bound of the scan: highest bug_id in the source, exclusive (+1).
    max_bug_id = sourceq.query({
        "from": coalesce(settings.source.alias, settings.source.index),
        "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
    }) + 1
    # coalesce guards an empty source index (aggregate returns null).
    max_bug_id = int(coalesce(max_bug_id, 0))

    # FIRST, GET ALL MISSING BUGS
    # Walk the id space in 10k-wide bands, highest band first (qb.reverse).
    for s, e in qb.reverse(list(qb.intervals(min_bug_id, max_bug_id, 10000))):
        with Timer("pull {{start}}..{{end}} from ES", {"start": s, "end": e}):
            # Only bugs that participate in a dependency relation matter here.
            children = sourceq.query({
                "from": settings.source.alias,
                "select": ["bug_id", "dependson", "blocked", "modified_ts", "expires_on"],
                "where": {"and": [
                    {"range": {"bug_id": {"gte": s, "lt": e}}},
                    {"or": [
                        {"exists": "dependson"},
                        {"exists": "blocked"}
                    ]}
                ]},
                "limit": 10000
            })
        with Timer("fixpoint work"):
            # Propagate dependency closure for this band into the destination.
            to_fix_point(settings, destq, children.data)

    # PROCESS RECENT CHANGES
    # Second pass: anything touched in the last 7 days, regardless of id band.
    with Timer("pull recent dependancies from ES"):
        children = sourceq.query({
            "from": settings.source.alias,
            "select": ["bug_id", "dependson", "blocked"],
            "where": {"and": [
                {"range": {"modified_ts": {"gte": convert.datetime2milli(datetime.utcnow() - timedelta(days=7))}}},
                {"or": [
                    {"exists": "dependson"},
                    {"exists": "blocked"}
                ]}
            ]},
            "limit": 100000
        })
    to_fix_point(settings, destq, children.data)
def main():
    """Entry point: configure, probe the remaining bug range, and run the ETL
    fan-out across worker threads, swapping the index alias on --restart.
    """
    # --restart/--reset/--redo all force a full reprocessing into a new index.
    settings = startup.read_settings(defs={
        "name": ["--restart", "--reset", "--redo"],
        "help": "force a reprocessing of all data",
        "action": "store_true",
        "dest": "restart"
    })
    Log.start(settings.debug)
    try:
        # Guard against two copies of this ETL running at once.
        with startup.SingleInstance(flavor_id=settings.args.filename):
            if settings.args.restart:
                # Fresh index; alias is attached only after a successful run.
                reviews = Cluster(settings.destination).create_index(settings.destination)
            else:
                # Resume into the existing (prototype) index.
                reviews = Cluster(settings.destination).get_proto(settings.destination)
            bugs = Cluster(settings.source).get_index(settings.source)

            # Highest bug id available in the source.
            with FromES(bugs) as esq:
                es_max_bug = esq.query({
                    "from": "private_bugs",
                    "select": {"name": "max_bug", "value": "bug_id", "aggregate": "maximum"}
                })
            # PROBE WHAT RANGE OF BUGS IS LEFT TO DO (IN EVENT OF FAILURE)
            with FromES(reviews) as esq:
                es_min_bug = esq.query({
                    "from": "reviews",
                    "select": {"name": "min_bug", "value": "bug_id", "aggregate": "minimum"}
                })

            batch_size = coalesce(bugs.settings.batch_size, settings.size, 1000)
            threads = coalesce(settings.threads, 4)
            Log.note(str(settings.min_bug))
            # Explicit settings win; otherwise resume just past what is done,
            # capped by one batch per thread above the destination's minimum.
            min_bug = int(coalesce(settings.min_bug, 0))
            max_bug = int(coalesce(settings.max_bug, Math.min(es_min_bug + batch_size * threads, es_max_bug)))

            # sink batches writes to the destination index.
            with ThreadedQueue(reviews, batch_size=coalesce(reviews.settings.batch_size, 100)) as sink:
                # NOTE(review): partial binds (settings, sink); confirm the
                # full_etl being called here accepts that signature.
                func = functools.partial(full_etl, settings, sink)
                with Multithread(func, threads=threads) as m:
                    m.inbound.silent = True
                    Log.note("bugs from {{min}} to {{max}}, step {{step}}", {
                        "min": min_bug,
                        "max": max_bug,
                        "step": batch_size
                    })
                    # Highest ranges first; each work item is one bug-id range.
                    # NOTE(review): interval size is hard-coded 1000, not
                    # batch_size — confirm that is intentional.
                    m.execute(reversed([{"bugs": range(s, e)} for s, e in qb.intervals(min_bug, max_bug, size=1000)]))

            if settings.args.restart:
                # Promote the freshly built index and drop the old ones.
                reviews.add_alias()
                reviews.delete_all_but_self()
    finally:
        Log.stop()
def get_pending(es, since):
    """Count bug versions modified since *since*, per bug_id.

    Scans the whole bug_id space in 100k-wide bands, using a terms facet to
    count versions per bug in each band, and merges the per-band counts into
    one Multiset, which is returned.
    """
    # A statistical facet over bug_id tells us how far the id space extends.
    stats = es.search({
        "query": {"match_all": {}},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"statistical": {"field": "bug_id"}}}
    })
    highest_bug = int(stats.facets.default.max)

    pending = None
    for lo, hi in qb.intervals(0, highest_bug + 1, 100000):
        Log.note("Collect history for bugs from {{start}}..{{end}}", {"start": lo, "end": hi})
        # Versions-per-bug within this band, restricted to the cutoff date.
        band_result = es.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"range": {"modified_ts": {"gte": convert.datetime2milli(since)}}},
                    {"range": {"bug_id": {"gte": lo, "lte": hi}}}
                ]}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
        })
        band_counts = Multiset(
            band_result.facets.default.terms,
            key_field="term",
            count_field="count"
        )
        # Merge this band into the running total.
        pending = band_counts if pending is None else pending + band_counts

    Log.note("Source has {{num}} bug versions for updating", {
        "num": len(pending)
    })
    return pending