def extract_from_datazilla_using_id(es, settings, transformer):

    existing_ids = get_existing_ids(es, settings, transformer.pushlog.keys())
    max_existing_id = nvl(MAX(existing_ids), settings.production.min)
    holes = set(range(settings.production.min, max_existing_id)) - existing_ids
    missing_ids = set(range(settings.production.min, max_existing_id + nvl(settings.production.step, NUM_PER_BATCH))) - existing_ids

    Log.note("Number missing: {{num}}", {"num": len(missing_ids)})
    Log.note("Number in holes: {{num}}", {"num": len(holes)})

    # FASTER IF NO INDEXING IS ON
    es.set_refresh_interval(-1)

    # FILE IS FASTER THAN NETWORK
    if (len(holes) > 10000 or settings.args.scan_file or settings.args.restart) and File(settings.param.output_file).exists:
        # ASYNCH PUSH TO ES IN BLOCKS OF 1000
        with Timer("Scan file for missing ids"):
            with ThreadedQueue(es, size=nvl(es.settings.batch_size, 100)) as json_for_es:
                num = 0
                for line in File(settings.param.output_file):
                    try:
                        if len(line.strip()) == 0:
                            continue
                        col = line.split("\t")
                        id = int(col[0])
                        # if id == 3003529:
                        #     Log.debug()
                        if id < settings.production.min:
                            continue
                        if id in existing_ids:
                            continue
                        if num > settings.production.step:
                            return
                        num += 1

                        with Profiler("decode and transform"):
                            data = CNV.JSON2object(col[-1])
                            if data.test_run_id:
                                with Profiler("transform"):
                                    data = transformer.transform(id, data)
                                json_for_es.extend({"value": d} for d in data)
                                Log.note("Added {{id}} from file", {"id": id})
                                existing_ids.add(id)
                            else:
                                Log.note("Skipped {{id}} from file (no test_run_id)", {"id": id})
                                num -= 1
                    except Exception, e:
                        Log.warning("Bad line id={{id}} ({{length}}bytes):\n\t{{prefix}}", {
                            "id": id,
                            "length": len(CNV.object2JSON(line)),
                            "prefix": CNV.object2JSON(line)[0:130]
                        }, e)
        missing_ids = missing_ids - existing_ids
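
# NOTE: illustrative sketch, not part of the original module.  The scan above
# assumes each line of settings.param.output_file is a tab-separated record
# with the Datazilla id in the first column and the raw JSON blob in the last.
# The helper name parse_output_line is hypothetical; it only restates that
# parsing step.
def parse_output_line(line):
    # SPLIT "id <tab> ... <tab> json" AND DECODE THE JSON COLUMN
    col = line.split("\t")
    return int(col[0]), CNV.JSON2object(col[-1])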
def arrays_add(id, path, r):
    try:
        if isinstance(r, dict):
            for k, v in [(k, v) for k, v in r.items()]:
                new_path = path + "[" + k + "]"
                arrays_add(id, new_path, v)
        elif isinstance(r, list):
            try:
                # RELIES ON THE LIST TYPE EXPOSING map(); IF IT DOES NOT
                # (E.G. A PLAIN list), THE except BRANCH WALKS THE ELEMENTS
                values = r.map(float)
                arrays.append([id, path, len(values), 1])
            except Exception, e:
                for i, v in enumerate(r):
                    r[i] = arrays_add(id, path + "[" + str(i) + "]", v)
        # return r
    except Exception, e:
        Log.warning("Can not summarize: {{json}}", {"json": CNV.object2JSON(r)})
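
# NOTE: usage sketch, not part of the original source.  arrays_add() walks a
# nested dict/list structure and, for every list whose elements can be coerced
# to float, appends one [id, path, length, 1] row to the module-level `arrays`
# accumulator (assuming the decoded lists expose map(), as the call above
# relies on).  For a decoded record shaped like
#
#     {"results": {"tp5": [3.0, 2.9, 3.1]}}
#
# a call such as arrays_add(some_id, "", record) would record the path
# "[results][tp5]" with length 3.  (some_id and record are placeholder names.)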
            added.add(id)
            data = CNV.JSON2object(col[1])
            records_for_db.add({
                "id": nvl(data.test_run_id, id),
                "branch": data.json_blob.test_build.branch,
                "name": data.json_blob.test_build.name,
                "version": data.json_blob.test_build.version,
                "suite": data.json_blob.testrun.suite,
                "revision": data.json_blob.test_build.revision,
                "date": data.json_blob.testrun.date
            })
            Log.note("Added {{id}} from file", {"id": data.test_run_id})
        except Exception, e:
            Log.warning("Bad line ({{length}}bytes):\n\t{{prefix}}", {
                "length": len(CNV.object2JSON(line)),
                "prefix": CNV.object2JSON(line)[0:130]
            }, e)


def main():
    try:
        settings = startup.read_settings(filename="file2db_settings.json")
        Log.start(settings.debug)

        with DB(settings.db) as db:
            db.execute("""
                DROP TABLE IF EXISTS b2g_tests
            """)
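            # NOTE: hypothetical sketch, not from the original file.  A
            # CREATE TABLE consistent with the fields collected into
            # records_for_db above might look like the following; the column
            # names come from the record, the types are assumptions.
            #
            #     db.execute("""
            #         CREATE TABLE b2g_tests (
            #             id       INTEGER PRIMARY KEY,
            #             branch   VARCHAR(100),
            #             name     VARCHAR(100),
            #             version  VARCHAR(100),
            #             suite    VARCHAR(100),
            #             revision VARCHAR(100),
            #             date     DOUBLE
            #         )
            #     """)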
with Timer("read {{id}} from DZ", {"id": id}): content = requests.get(url, timeout=nvl(settings.production.timeout, 30)).content except Exception, e: Log.warning("Failure to read from {{url}}", {"url": url}, e) return False try: if content.startswith("Id not found"): Log.note("{{id}} not found {{url}}", {"id": id, "url": url}) if id < max_id: return True else: return False data = CNV.JSON2object(content.decode('utf-8')) content = CNV.object2JSON(data) #ENSURE content HAS NO crlf if data.test_run_id: Log.println("Add {{id}} for revision {{revision}} ({{bytes}} bytes)", { "id": id, "revision": data.json_blob.test_build.revision, "bytes": len(content) }) with Profiler("transform"): result = transformer.transform(id, data) if result: Log.println("{{num}} records to add", { "num": len(result) }) es_sink.extend({"value": d} for d in result)