def extract_from_datazilla_using_id(es, settings, transformer):

    existing_ids = get_existing_ids(es, settings, transformer.pushlog.keys())
    max_existing_id = nvl(MAX(existing_ids), settings.production.min)
    holes = set(range(settings.production.min, max_existing_id)) - existing_ids
    missing_ids = set(range(settings.production.min, max_existing_id + nvl(settings.production.step, NUM_PER_BATCH))) - existing_ids

    Log.note("Number missing: {{num}}", {"num": len(missing_ids)})
    Log.note("Number in holes: {{num}}", {"num": len(holes)})

    #FASTER IF NO INDEXING IS ON
    es.set_refresh_interval(-1)

    #FILE IS FASTER THAN NETWORK
    if (len(holes) > 10000 or settings.args.scan_file or settings.args.restart) and File(settings.param.output_file).exists:
        #ASYNCH PUSH TO ES IN BLOCKS OF 1000
        with Timer("Scan file for missing ids"):
            with ThreadedQueue(es, size=nvl(es.settings.batch_size, 100)) as json_for_es:
                num = 0
                for line in File(settings.param.output_file):
                    try:
                        if len(line.strip()) == 0:
                            continue
                        col = line.split("\t")
                        id = int(col[0])
                        # if id==3003529:
                        #     Log.debug()
                        if id < settings.production.min:
                            continue
                        if id in existing_ids:
                            continue

                        if num > settings.production.step:
                            return
                        num += 1

                        with Profiler("decode and transform"):
                            data = CNV.JSON2object(col[-1])
                            if data.test_run_id:
                                with Profiler("transform"):
                                    data = transformer.transform(id, data)
                                json_for_es.extend({"value": d} for d in data)
                                Log.note("Added {{id}} from file", {"id": id})

                                existing_ids.add(id)
                            else:
                                Log.note("Skipped {{id}} from file (no test_run_id)", {"id": id})
                                num -= 1
                    except Exception, e:
                        Log.warning("Bad line id={{id}} ({{length}}bytes):\n\t{{prefix}}", {
                            "id": id,
                            "length": len(CNV.object2JSON(line)),
                            "prefix": CNV.object2JSON(line)[0:130]
                        }, e)
    missing_ids = missing_ids - existing_ids
def geo_mean(values):
    """
    GIVEN AN ARRAY OF dicts, CALC THE GEO-MEAN ON EACH ATTRIBUTE
    """
    agg = Struct()
    for d in values:
        for k, v in d.items():
            if v != 0:
                agg[k] = nvl(agg[k], ZeroMoment.new_instance()) + Math.log(Math.abs(v))
    return {k: Math.exp(v.stats.mean) for k, v in agg.items()}
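# Hypothetical usage sketch (not part of the original module): geo_mean() expects a
# list of dicts and returns, per key, the geometric mean of the absolute values,
# skipping zero entries so Math.log() is never handed a zero.
#
#     geo_mean([{"a": 2, "b": 8}, {"a": 8, "b": 2}])
#     # -> {"a": 4.0, "b": 4.0}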
def main():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--no_restart", "--no_reset", "--no_redo", "--norestart", "--noreset", "--noredo"],
            "help": "do not allow creation of new index (for debugging rogue resets)",
            "action": "store_true",
            "dest": "no_restart"
        }, {
            "name": ["--restart", "--reset", "--redo"],
            "help": "force a reprocessing of all data",
            "action": "store_true",
            "dest": "restart"
        }, {
            "name": ["--file", "--scan_file", "--scanfile", "--use_file", "--usefile"],
            "help": "scan file for missing ids",
            "action": "store_true",
            "dest": "scan_file"
        }, {
            "name": ["--nofile", "--no_file", "--no-file"],
            "help": "do not scan file for missing ids",
            "action": "store_false",
            "dest": "scan_file"
        }])
        Log.start(settings.debug)

        with startup.SingleInstance(flavor_id=settings.args.filename):
            settings.production.threads = nvl(settings.production.threads, 1)
            settings.param.output_file = nvl(settings.param.output_file, "./results/raw_json_blobs.tab")

            transformer = DZ_to_ES(settings.pushlog)

            #RESET ONLY IF NEW Transform IS USED
            if settings.args.restart:
                es = Cluster(settings.elasticsearch).create_index(settings.elasticsearch)
                es.add_alias()
                es.delete_all_but_self()
                extract_from_datazilla_using_id(es, settings, transformer)
            else:
                es = Cluster(settings.elasticsearch).get_or_create_index(settings.elasticsearch)
                extract_from_datazilla_using_id(es, settings, transformer)
    except Exception, e:
        Log.error("Problem with etl", e)
def etl(es_sink, file_sink, settings, transformer, max_id, id):
    """
    PULL FROM DZ AND PUSH TO es AND file_sink
    """
    # DEBUG GROWTH
    # with GC_LOCKER:
    #     try:
    #         if COUNTER.count % 100 == 0:
    #             # gc.collect()
    #             deltas, stats = objgraph.get_growth()
    #             Log.note("Deltas:\n{{deltas|indent}}", {"deltas": deltas})
    #     except Exception, e:
    #         Log.warning("objgraph problem", e)
    #
    #     COUNTER.count += 1

    url = settings.production.blob_url + "/" + str(id)
    try:
        with Timer("read {{id}} from DZ", {"id": id}):
            content = requests.get(url, timeout=nvl(settings.production.timeout, 30)).content
    except Exception, e:
        Log.warning("Failure to read from {{url}}", {"url": url}, e)
        return False
with ThreadedQueue(db_queue, 100) as records_for_db:
    added = set()
    for line in File(filename).iter():
        try:
            if len(line.strip()) == 0:
                continue
            col = line.split("\t")
            id = int(col[0])
            if id in added:
                continue
            added.add(id)
            data = CNV.JSON2object(col[1])
            records_for_db.add({
                "id": nvl(data.test_run_id, id),
                "branch": data.json_blob.test_build.branch,
                "name": data.json_blob.test_build.name,
                "version": data.json_blob.test_build.version,
                "suite": data.json_blob.testrun.suite,
                "revision": data.json_blob.test_build.revision,
                "date": data.json_blob.testrun.date
            })
            Log.note("Added {{id}} from file", {"id": data.test_run_id})
        except Exception, e:
            Log.warning("Bad line ({{length}}bytes):\n\t{{prefix}}", {
                "length": len(CNV.object2JSON(line)),
                "prefix": CNV.object2JSON(line)[0:130]
            }, e)
    #COPY MISSING DATA TO ES
    try:
        with ThreadedQueue(es, size=nvl(es.settings.batch_size, 100)) as es_sink:
            with ThreadedQueue(File(settings.param.output_file), size=50) as file_sink:
                simple_etl = functools.partial(etl, *[es_sink, file_sink, settings, transformer, max_existing_id])

                num_not_found = 0
                with Multithread(simple_etl, threads=settings.production.threads) as many:
                    results = many.execute([
                        {"id": id}
                        for id in Q.sort(missing_ids)[:nvl(settings.production.step, NUM_PER_BATCH):]
                    ])
                    for result in results:
                        if not result:
                            num_not_found += 1
                            if num_not_found > nvl(settings.production.max_tries, 10):
                                many.inbound.pop_all()  # CLEAR THE QUEUE OF OTHER WORK
                                many.stop()