def extend(d):
    try:
        db.insert_list(table_name, d)
        db.flush()
        Log.note("added {{num}} records", {"num": len(d)})
    except Exception, e:
        Log.warning("Can not insert into database", e)
def __init__(self, pushlog_settings):
    with Timer("get pushlog"):
        if pushlog_settings.disable:
            all_pushlogs = []
        else:
            with DB(pushlog_settings) as db:
                all_pushlogs = db.query("""
                    SELECT
                        pl.`date`,
                        left(ch.node, 12) revision,
                        coalesce(bm.alt_name, br.name) branch
                    FROM
                        changesets ch
                    LEFT JOIN
                        pushlogs pl ON pl.id = ch.pushlog_id
                    LEFT JOIN
                        branches br ON br.id = pl.branch_id
                    LEFT JOIN
                        branch_map bm ON br.id = bm.id
                    WHERE
                        pl.date > {{oldest_date}}
                """, {"oldest_date": TOO_OLD})
        Log.note("Got pushlog, now indexing...")
        self.pushlog = wrap(Q.index(all_pushlogs, ["branch", "revision"])._data)
        self.locker = Lock()
        self.unknown_branches = set()
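# A minimal sketch of how the two-level index built above is expected to be
# used elsewhere in this class (the branch and revision values here are
# hypothetical, and Q.index is assumed to build a branch -> revision lookup):
#
#     pushes = self.pushlog["mozilla-inbound"]["b8a9c7d6e5f4"]
#     if pushes:
#         push_date = pushes[0].date  # seconds since epoch, per the query above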
def main():
    try:
        settings = startup.read_settings(filename="file2db_settings.json")
        Log.start(settings.debug)
        with DB(settings.db) as db:
            db.execute("""
                DROP TABLE IF EXISTS b2g_tests
            """)
            db.execute("""
                CREATE TABLE b2g_tests (
                    id INTEGER PRIMARY KEY NOT NULL,
                    branch VARCHAR(100),
                    name VARCHAR(100),
                    version VARCHAR(100),
                    suite VARCHAR(200),
                    revision VARCHAR(100),
                    `date` LONG
                )
            """)
            file2db(db, "b2g_tests", settings.source_file)
    except Exception, e:
        Log.error("can not seem to startup", e)
def check_for_errors(self, logs, path):
    try:
        errors = [l for l in logs if l.type == "ERROR"]
        if errors:
            Log.error("Problem found in {{page}}:\n{{error|indent}}", {
                "page": path,
                "error": errors[0]
            })
    finally:
        self.close()
def extract_from_datazilla_using_id(es, settings, transformer):
    existing_ids = get_existing_ids(es, settings, transformer.pushlog.keys())
    max_existing_id = nvl(MAX(existing_ids), settings.production.min)
    holes = set(range(settings.production.min, max_existing_id)) - existing_ids
    missing_ids = set(range(settings.production.min, max_existing_id + nvl(settings.production.step, NUM_PER_BATCH))) - existing_ids

    Log.note("Number missing: {{num}}", {"num": len(missing_ids)})
    Log.note("Number in holes: {{num}}", {"num": len(holes)})

    # FASTER IF NO INDEXING IS ON
    es.set_refresh_interval(-1)

    # FILE IS FASTER THAN NETWORK
    if (len(holes) > 10000 or settings.args.scan_file or settings.args.restart) and File(settings.param.output_file).exists:
        # ASYNCH PUSH TO ES IN BLOCKS OF 1000
        with Timer("Scan file for missing ids"):
            with ThreadedQueue(es, size=nvl(es.settings.batch_size, 100)) as json_for_es:
                num = 0
                for line in File(settings.param.output_file):
                    try:
                        if len(line.strip()) == 0:
                            continue
                        col = line.split("\t")
                        id = int(col[0])
                        # if id == 3003529:
                        #     Log.debug()
                        if id < settings.production.min:
                            continue
                        if id in existing_ids:
                            continue
                        if num > settings.production.step:
                            return
                        num += 1

                        with Profiler("decode and transform"):
                            data = CNV.JSON2object(col[-1])
                            if data.test_run_id:
                                with Profiler("transform"):
                                    data = transformer.transform(id, data)
                                json_for_es.extend({"value": d} for d in data)
                                Log.note("Added {{id}} from file", {"id": id})
                                existing_ids.add(id)
                            else:
                                Log.note("Skipped {{id}} from file (no test_run_id)", {"id": id})
                                num -= 1
                    except Exception, e:
                        Log.warning("Bad line id={{id}} ({{length}}bytes):\n\t{{prefix}}", {
                            "id": id,
                            "length": len(CNV.object2JSON(line)),
                            "prefix": CNV.object2JSON(line)[0:130]
                        }, e)
        missing_ids = missing_ids - existing_ids
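# A worked example of the two sets computed at the top of the function above
# (all numbers hypothetical): with settings.production.min == 100,
# existing_ids == {100, 101, 103, 105}, and step == 10:
#
#     max_existing_id = 105
#     holes       = {102, 104}                  # gaps below the high-water mark
#     missing_ids = {102, 104, 106, ..., 114}   # holes plus the next block to fetch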
def get_existing_ids(es, settings, branches):
    # FIND WHAT'S IN ES
    bad_ids = []
    int_ids = set()

    demand_pushlog = {"match_all": {}}
    if branches:
        demand_pushlog = {"or": [
            {"not": {"missing": {"field": "test_build.push_date"}}},
            {"not": {"missing": {"field": "test_build.no_pushlog"}}}
        ]}

    if settings.elasticsearch.debug and settings.production.step < 10:
        # SIMPLY RELOAD THIS SMALL NUMBER
        return set([])

    with ESQuery(es) as esq:
        max_id = esq.query({
            "from": es.settings.alias,
            "select": {"value": "datazilla.id", "aggregate": "max"}
        })

        interval_size = 200000
        for mini, maxi in Q.intervals(settings.production.min, max_id + interval_size, interval_size):
            existing_ids = es.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"range": {"datazilla.id": {"gte": mini, "lt": maxi}}},
                        demand_pushlog
                    ]}
                }},
                "from": 0,
                "size": 0,
                "sort": [],
                "facets": {
                    "ids": {"terms": {"field": "datazilla.id", "size": interval_size}}
                }
            })

            for t in existing_ids.facets.ids.terms:
                try:
                    int_ids.add(int(t.term))
                except Exception, e:
                    bad_ids.append(t.term)

    existing_ids = int_ids
    Log.println("Number of ids in ES: " + str(len(existing_ids)))
    Log.println("BAD ids in ES: " + str(bad_ids))
    return existing_ids
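# A small sketch of the batching above, under the assumption that pyLibrary's
# Q.intervals(min, max, size) yields half-open (mini, maxi) pairs covering the
# range, so the terms facet never has to return more than interval_size ids
# per query:
#
#     for mini, maxi in Q.intervals(0, 500000, 200000):
#         print mini, maxi
#     # expected output (assumed): (0, 200000), (200000, 400000), (400000, 500000)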
def arrays_add(id, path, r):
    try:
        if isinstance(r, dict):
            for k, v in [(k, v) for k, v in r.items()]:
                new_path = path + "[" + k + "]"
                arrays_add(id, new_path, v)
        elif isinstance(r, list):
            try:
                # IF EVERY ELEMENT IS NUMERIC, RECORD THE SERIES;
                # OTHERWISE FALL THROUGH AND RECURSE INTO EACH ELEMENT
                values = map(float, r)
                arrays.append([id, path, len(values), 1])
            except Exception, e:
                for i, v in enumerate(r):
                    r[i] = arrays_add(id, path + "[" + str(i) + "]", v)
        # return r
    except Exception, e:
        Log.warning("Can not summarize: {{json}}", {"json": CNV.object2JSON(r)})
def main():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--no_restart", "--no_reset", "--no_redo", "--norestart", "--noreset", "--noredo"],
            "help": "do not allow creation of new index (for debugging rogue resets)",
            "action": "store_true",
            "dest": "no_restart"
        }, {
            "name": ["--restart", "--reset", "--redo"],
            "help": "force a reprocessing of all data",
            "action": "store_true",
            "dest": "restart"
        }, {
            "name": ["--file", "--scan_file", "--scanfile", "--use_file", "--usefile"],
            "help": "scan file for missing ids",
            "action": "store_true",
            "dest": "scan_file"
        }, {
            "name": ["--nofile", "--no_file", "--no-file"],
            "help": "do not scan file for missing ids",
            "action": "store_false",
            "dest": "scan_file"
        }])
        Log.start(settings.debug)

        with startup.SingleInstance(flavor_id=settings.args.filename):
            settings.production.threads = nvl(settings.production.threads, 1)
            settings.param.output_file = nvl(settings.param.output_file, "./results/raw_json_blobs.tab")

            transformer = DZ_to_ES(settings.pushlog)

            # RESET ONLY IF NEW Transform IS USED
            if settings.args.restart:
                es = Cluster(settings.elasticsearch).create_index(settings.elasticsearch)
                es.add_alias()
                es.delete_all_but_self()
                extract_from_datazilla_using_id(es, settings, transformer)
            else:
                es = Cluster(settings.elasticsearch).get_or_create_index(settings.elasticsearch)
                extract_from_datazilla_using_id(es, settings, transformer)
    except Exception, e:
        Log.error("Problem with etl", e)
def _wait_for_stable(self, detect_function, timeout):
    """
    WAIT FOR RESULTS OF detect_function TO BE STABLE
    """
    if not isinstance(timeout, timedelta):
        Log.error("Expecting a timeout as a timedelta")

    detectTime = Date.now()
    newValue = detect_function()
    while True:
        now = Date.now()
        potentialValue = detect_function()
        if potentialValue != newValue:
            newValue = potentialValue
            detectTime = now
        if now - detectTime > timeout:
            return
        Thread.sleep(seconds=0.5)
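# A minimal usage sketch (the page object and selector are hypothetical): the
# method polls detect_function every half second and returns once the observed
# value has not changed for a full timeout window.
#
#     page._wait_for_stable(
#         lambda: len(page.find("#log p")),   # value being watched
#         timedelta(seconds=10)               # how long it must hold still
#     )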
def wait_for_logs(self, timeout=None):
    if not timeout:
        timeout = timedelta(seconds=10)

    def logs():
        return self.find("#" + LOG_DIV + " p")

    def status():
        s = self.find("#status")
        if not s:
            return None
        return s[0].text

    # IF THE MESSAGE KEEPS CHANGING OR THE LOGS KEEP INCREASING WE CAN BE
    # CONFIDENT SOMETHING IMPORTANT IS STILL HAPPENING
    self._wait_for_stable(lambda: (status(), len(logs())), timeout)

    output = [
        CNV.JSON2object(CNV.html2unicode(e.get_attribute('innerHTML')))
        for e in logs()
    ]
    Log.note("Logs:\n{{logs|indent}}", {"logs": output})
    return output
def etl(es_sink, file_sink, settings, transformer, max_id, id):
    """
    PULL FROM DZ AND PUSH TO es AND file_sink
    """
    # DEBUG GROWTH
    # with GC_LOCKER:
    #     try:
    #         if COUNTER.count % 100 == 0:
    #             # gc.collect()
    #             deltas, stats = objgraph.get_growth()
    #             Log.note("Deltas:\n{{deltas|indent}}", {"deltas": deltas})
    #     except Exception, e:
    #         Log.warning("objgraph problem", e)
    #
    #     COUNTER.count += 1

    url = settings.production.blob_url + "/" + str(id)
    try:
        with Timer("read {{id}} from DZ", {"id": id}):
            content = requests.get(url, timeout=nvl(settings.production.timeout, 30)).content
    except Exception, e:
        Log.warning("Failure to read from {{url}}", {"url": url}, e)
        return False

    try:
        if content.startswith("Id not found"):
            Log.note("{{id}} not found {{url}}", {"id": id, "url": url})
            if id < max_id:
                return True
            else:
                return False

        data = CNV.JSON2object(content.decode('utf-8'))
        content = CNV.object2JSON(data)  # ENSURE content HAS NO crlf

        if data.test_run_id:
            Log.println("Add {{id}} for revision {{revision}} ({{bytes}} bytes)", {
                "id": id,
                "revision": data.json_blob.test_build.revision,
                "bytes": len(content)
            })
            with Profiler("transform"):
settings = startup.read_settings()
Log.start(settings.debug)

all = set()
with open(settings.output_file, "r") as input_file:
    with open("good_talos.tab", "w") as output_file:
        for line in input_file:
            try:
                if len(line.strip()) == 0:
                    continue
                col = line.split("\t")
                id = int(col[0])
                if id < MINIMUM_ID:
                    continue
                json = col[1]
def check_if_still_loading(self, path):
    # IF SPINNER STILL SHOWS, THEN WE GOT LOADING ISSUES
    isLoading = OR([e.is_displayed() for e in self.find(".loading")])
    if isLoading:
        Log.error("page still loading: {{page}}", {"page": path})
        id = int(col[0])
        if id in added:
            continue
        added.add(id)

        data = CNV.JSON2object(col[1])
        records_for_db.add({
            "id": nvl(data.test_run_id, id),
            "branch": data.json_blob.test_build.branch,
            "name": data.json_blob.test_build.name,
            "version": data.json_blob.test_build.version,
            "suite": data.json_blob.testrun.suite,
            "revision": data.json_blob.test_build.revision,
            "date": data.json_blob.testrun.date
        })
        Log.note("Added {{id}} from file", {"id": data.test_run_id})
    except Exception, e:
        Log.warning("Bad line ({{length}}bytes):\n\t{{prefix}}", {
            "length": len(CNV.object2JSON(line)),
            "prefix": CNV.object2JSON(line)[0:130]
        }, e)
            test_machine=r.test_machine,
            datazilla=r.datazilla,
            testrun=r.testrun,
            test_build=r.test_build,
            result={
                "test_name": test_name,
                "ordering": i,
                "samples": replicates
            }
        )
        try:
            s = stats(replicates)
            new_record.result.stats = s
            total.append(s)
        except Exception, e:
            Log.warning("can not reduce series to moments", e)
        new_records.append(new_record)

    if len(total) > 1:
        # ADD RECORD FOR GEOMETRIC MEAN SUMMARY
        new_record = Struct(
            test_machine=r.test_machine,
            datazilla=r.datazilla,
            testrun=r.testrun,
            test_build=r.test_build,
            result={
                "test_name": "SUMMARY",
                "ordering": -1,
                "stats": geo_mean(total)
            }
def transform(self, id, datazilla):
    try:
        r = datazilla.json_blob

        # ADD DATAZILLA MARKUP
        r.datazilla = {
            "id": id,
            "date_loaded": datazilla.date_loaded * 1000,
            "error_flag": datazilla.error_flag,
            "test_run_id": datazilla.test_run_id,
            "processed_flag": datazilla.processed_flag,
            "error_msg": datazilla.error_msg
        }

        # CONVERT UNIX TIMESTAMP TO MILLISECOND TIMESTAMP
        r.testrun.date *= 1000

        def mainthread_transform(r):
            if r == None:
                return None

            output = Struct()

            for i in r.mainthread_readbytes:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].readbytes = i[0]
            r.mainthread_readbytes = None

            for i in r.mainthread_writebytes:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].writebytes = i[0]
            r.mainthread_writebytes = None

            for i in r.mainthread_readcount:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].readcount = i[0]
            r.mainthread_readcount = None

            for i in r.mainthread_writecount:
                output[literal_field(i[1])].name = i[1]
                output[literal_field(i[1])].writecount = i[0]
            r.mainthread_writecount = None

            r.mainthread = output.values()

        mainthread_transform(r.results_aux)
        mainthread_transform(r.results_xperf)

        # ADD PUSH LOG INFO
        try:
            branch = r.test_build.branch
            if branch.endswith("-Non-PGO"):
                r.test_build.branch = branch
                r.test_build.pgo = False
                branch = branch[0:-8]
            else:
                r.test_build.pgo = True

            with Profiler("get from pushlog"):
                if not self.pushlog:
                    # NO PUSHLOG MEANS WE DO NOTHING TO MARKUP TEST RESULTS
                    pass
                elif self.pushlog[branch]:
                    possible_dates = self.pushlog[branch][r.test_build.revision]
                    if possible_dates:
                        r.test_build.push_date = int(Math.round(possible_dates[0].date * 1000))
                    else:
                        if r.test_build.revision == 'NULL':
                            r.test_build.no_pushlog = True  # OOPS! SOMETHING BROKE
                        elif CNV.milli2datetime(Math.min(r.testrun.date, r.datazilla.date_loaded)) < PUSHLOG_TOO_OLD:
                            Log.note("{{branch}} @ {{revision}} has no pushlog, transforming anyway", r.test_build)
                            r.test_build.no_pushlog = True
                        else:
                            Log.note("{{branch}} @ {{revision}} has no pushlog, try again later", r.test_build)
                            return []  # TRY AGAIN LATER
                else:
                    with self.locker:
                        if branch not in self.unknown_branches:
                            Log.note("Whole branch {{branch}} has no pushlog", {"branch": branch})
                            self.unknown_branches.add(branch)
                    if CNV.milli2datetime(Math.min(r.testrun.date, r.datazilla.date_loaded)) < PUSHLOG_TOO_OLD:
                        r.test_build.no_pushlog = True
                    else:
                        r.test_build.no_pushlog = True
                        # return [r]  # TODO: DO THIS IF WE FIGURE OUT HOW TO HANDLE THE VERY LARGE NUMBER OF RESULTS WITH NO PUSHLOG
        except Exception, e:
            Log.warning("{{branch}} @ {{revision}} has no pushlog", r.test_build, e)

        new_records = []

        # RECORD THE UNKNOWN PART OF THE TEST RESULTS
        remainder = r.copy()
        remainder.results = None
        if len(remainder.keys()) > 4:
            new_records.append(remainder)

        # RECORD TEST RESULTS
        total = StructList()
        if r.testrun.suite in ["dromaeo_css", "dromaeo_dom"]:
            # dromaeo IS SPECIAL, REPLICATES ARE IN SETS OF FIVE
            # RECORD ALL RESULTS
            for i, (test_name, replicates) in enumerate(r.results.items()):
                for g, sub_results in Q.groupby(replicates, size=5):
                    new_record = Struct(
                        test_machine=r.test_machine,
                        datazilla=r.datazilla,
                        testrun=r.testrun,
                        test_build=r.test_build,
                        result={
                            "test_name": unicode(test_name) + "." + unicode(g),
                            "ordering": i,
                            "samples": sub_results
                        }
                    )
                    try:
                        s = stats(sub_results)
                        new_record.result.stats = s
                        total.append(s)
                    except Exception, e:
                        Log.warning("can not reduce series to moments", e)
                    new_records.append(new_record)
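# A minimal sketch of the dromaeo grouping above (values hypothetical):
# pyLibrary's Q.groupby with size=5 is assumed to yield (group_index, chunk)
# pairs, so twelve replicates become chunks of at most five:
#
#     for g, sub_results in Q.groupby(range(12), size=5):
#         print g, sub_results
#     # 0 [0, 1, 2, 3, 4]
#     # 1 [5, 6, 7, 8, 9]
#     # 2 [10, 11]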
def __del__(self):
    try:
        Log.println("Branches missing from pushlog:\n{{list}}", {"list": self.unknown_branches})
    except Exception, e:
        pass