def main():
    """
    Command-line entry point: run full_etl over a range of bug ids.

    Reads the settings (accepting a --restart/--reset/--redo flag), starts
    logging and then, while holding a startup.SingleInstance guard:

    * picks the destination index: a newly created one when restarting,
      otherwise whatever Cluster.get_proto() returns;
    * queries the maximum bug_id in "private_bugs" and the minimum bug_id in
      "reviews" to bound the range of work;
    * feeds the range, in intervals of batch_size and in reverse order, to
      full_etl running on a Multithread pool that writes into a ThreadedQueue
      wrapped around the destination index;
    * when restarting, calls add_alias() and delete_all_but_self() on the
      destination index.

    Exceptions are not caught here; Log.stop() runs on the way out whether or
    not the run succeeded.
    """
    settings = startup.read_settings(defs={
        "name": ["--restart", "--reset", "--redo"],
        "help": "force a reprocessing of all data",
        "action": "store_true",
        "dest": "restart"
    })
    Log.start(settings.debug)

    try:
        with startup.SingleInstance(flavor_id=settings.args.filename):
            destination = Cluster(settings.destination)
            if settings.args.restart:
                reviews = destination.create_index(settings.destination)
            else:
                reviews = destination.get_proto(settings.destination)

            bugs = Cluster(settings.source).get_index(settings.source)

            with FromES(bugs) as esq:
                es_max_bug = esq.query({
                    "from": "private_bugs",
                    "select": {"name": "max_bug", "value": "bug_id", "aggregate": "maximum"}
                })

            #PROBE WHAT RANGE OF BUGS IS LEFT TO DO (IN EVENT OF FAILURE)
            with FromES(reviews) as esq:
                es_min_bug = esq.query({
                    "from": "reviews",
                    "select": {"name": "min_bug", "value": "bug_id", "aggregate": "minimum"}
                })

            batch_size = coalesce(bugs.settings.batch_size, settings.size, 1000)
            threads = coalesce(settings.threads, 4)
            Log.note(str(settings.min_bug))
            min_bug = int(coalesce(settings.min_bug, 0))
            # WITHOUT AN EXPLICIT max_bug, DO ONE BATCH PER THREAD, CAPPED AT THE LARGEST KNOWN BUG
            max_bug = int(coalesce(
                settings.max_bug,
                Math.min(es_min_bug + batch_size * threads, es_max_bug)
            ))

            with ThreadedQueue(reviews, batch_size=coalesce(reviews.settings.batch_size, 100)) as sink:
                func = functools.partial(full_etl, settings, sink)
                with Multithread(func, threads=threads) as m:
                    m.inbound.silent = True
                    Log.note("bugs from {{min}} to {{max}}, step {{step}}", {
                        "min": min_bug,
                        "max": max_bug,
                        "step": batch_size
                    })
                    # STEP BY batch_size (WAS A HARD-CODED 1000) SO THE WORK MATCHES
                    # THE STEP LOGGED ABOVE AND THE RANGE COMPUTED FOR max_bug
                    work = [
                        {"bugs": range(s, e)}
                        for s, e in qb.intervals(min_bug, max_bug, size=batch_size)
                    ]
                    m.execute(reversed(work))

            # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
            # this is assumed to run after the queue above has been closed - confirm
            if settings.args.restart:
                reviews.add_alias()
                reviews.delete_all_but_self()
    finally:
        Log.stop()
def main():
    """
    Command-line entry point: extract from datazilla into Elasticsearch.

    Reads the settings (with restart / no_restart / scan_file command-line
    flags), starts logging and then, while holding a startup.SingleInstance
    guard, fills in defaults for settings.production.threads and
    settings.param.output_file, picks the target index and hands it to
    extract_from_datazilla_using_id() together with a DZ_to_ES transformer.

    With --restart a new index is created, aliased, and all other indexes are
    removed (add_alias / delete_all_but_self) before extraction; otherwise
    get_or_create_index() supplies the index.

    Any exception is reported through Log.error("Problem with etl", ...).
    Logging, if it was started, is stopped on the way out.
    """
    log_started = False
    try:
        # NOTE(review): no_restart is parsed here but never consulted in this
        # function; the non-restart path below still calls get_or_create_index()
        settings = startup.read_settings(defs=[{
            "name": ["--no_restart", "--no_reset", "--no_redo", "--norestart", "--noreset", "--noredo"],
            "help": "do not allow creation of new index (for debugging rouge resets)",
            "action": "store_true",
            "dest": "no_restart"
        }, {
            "name": ["--restart", "--reset", "--redo"],
            "help": "force a reprocessing of all data",
            "action": "store_true",
            "dest": "restart"
        }, {
            "name": ["--file", "--scan_file", "--scanfile", "--use_file", "--usefile"],
            "help": "scan file for missing ids",
            "action": "store_true",
            "dest": "scan_file"
        }, {
            "name": ["--nofile", "--no_file", "--no-file"],
            "help": "do not scan file for missing ids",
            "action": "store_false",
            "dest": "scan_file"
        }])
        Log.start(settings.debug)
        log_started = True

        with startup.SingleInstance(flavor_id=settings.args.filename):
            settings.production.threads = nvl(settings.production.threads, 1)
            settings.param.output_file = nvl(settings.param.output_file, "./results/raw_json_blobs.tab")

            transformer = DZ_to_ES(settings.pushlog)

            #RESET ONLY IF NEW Transform IS USED
            cluster = Cluster(settings.elasticsearch)
            if settings.args.restart:
                es = cluster.create_index(settings.elasticsearch)
                es.add_alias()
                es.delete_all_but_self()
            else:
                es = cluster.get_or_create_index(settings.elasticsearch)

            # SAME EXTRACTION FOR BOTH PATHS; ONLY THE CHOICE OF INDEX DIFFERS
            extract_from_datazilla_using_id(es, settings, transformer)
    except Exception as e:
        Log.error("Problem with etl", e)
    finally:
        # PAIR Log.start() WITH Log.stop(), BUT ONLY IF LOGGING WAS ACTUALLY STARTED
        if log_started:
            Log.stop()