def _execute_backlog(self):
    if not self.backlog:
        return

    (backlog, self.backlog) = (self.backlog, [])
    if self.db.__module__.startswith("pymysql"):
        # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
        # https://github.com/PyMySQL/PyMySQL/issues/157
        for b in backlog:
            sql = self.preamble + b
            try:
                if self.debug:
                    Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(b)
            except Exception as e:
                Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)
            self.cursor.close()
            self.cursor = self.db.cursor()
    else:
        for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
            sql = self.preamble + ";\n".join(g)
            try:
                if self.debug:
                    Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(sql)
                self.cursor.close()
                self.cursor = self.db.cursor()
            except Exception as e:
                Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def decrypt(data, _key):
    """
    ACCEPT JSON OF ENCRYPTED DATA {"salt":s, "length":l, "data":d}
    """
    from pyLibrary.queries import jx

    # Key and iv have not been generated or provided, bail out
    if _key is None:
        Log.error("Expecting a key")

    _input = convert.json2value(data)

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(convert.base642bytearray(_input.salt))

    raw = convert.base642bytearray(_input.data)
    out_data = bytearray()
    for _, e in jx.groupby(raw, size=16):
        out_data.extend(aes_cbc_256.decrypt_block(e))

    return str(out_data[:_input.length]).decode("utf8")
def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            Till(seconds=1).wait()
            messages = wrap(self.queue.pop_all())
            if not messages:
                continue

            for g, mm in jx.groupby(messages, size=self.batch_size):
                scrubbed = []
                try:
                    for i, message in enumerate(mm):
                        if message is THREAD_STOP:
                            please_stop.go()
                            return
                        scrubbed.append(_deep_json_to_string(message, depth=3))
                finally:
                    self.es.extend(scrubbed)
            bad_count = 0
        except Exception as e:
            Log.warning("Problem inserting logs into ES", cause=e)
            bad_count += 1
            if bad_count > MAX_BAD_COUNT:
                Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index)
            Till(seconds=30).wait()

    # CONTINUE TO DRAIN THIS QUEUE
    while not please_stop:
        try:
            Till(seconds=1).wait()
            self.queue.pop_all()
        except Exception as e:
            Log.warning("Should not happen", cause=e)
def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            Thread.sleep(seconds=1)
            messages = wrap(self.queue.pop_all())
            if messages:
                # for m in messages:
                #     m.value.params = leafer(m.value.params)
                #     m.value.error = leafer(m.value.error)
                for g, mm in jx.groupby(messages, size=self.batch_size):
                    self.es.extend(mm)
                bad_count = 0
        except Exception as e:
            Log.warning("Problem inserting logs into ES", cause=e)
            bad_count += 1
            if bad_count > 5:
                break
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA {"salt":s, "length":l, "data":d}
    """
    from pyLibrary.queries import jx

    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Dict()
    output.type = "AES256"
    output.salt = convert.bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in jx.groupby(data, size=16):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = convert.bytes2base64(encrypted)
    json = convert.value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA {"salt":s, "length":l, "data":d}
    """
    from pyLibrary.queries import jx

    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = convert.bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in jx.groupby(data, size=16):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = convert.bytes2base64(encrypted)
    json = convert.value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
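# USAGE SKETCH (illustration only, not part of the library): round-trip a
# unicode string through encrypt()/decrypt() above. Assumes a 256-bit
# (32-byte) key; the key bytes below are made up for the example.
key = bytearray(b"0123456789abcdef0123456789abcdef")  # 32 bytes -> AES-256
ciphertext_json = encrypt(u"hello world", key)  # {"type": "AES256", "salt": ..., "length": 11, "data": ...}
assert decrypt(ciphertext_json, key) == u"hello world"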
def agg(today, destination, debug_filter=None, please_stop=None):
    """
    :param today: The day we are performing the calculation for
    :param destination: The ES index where we put the results
    :param debug_filter: Some extra limitation to go faster, and focus, for testing
    :param please_stop: Signal for stopping early
    :return: nothing
    """
    # GET LIST OF ALL TESTS, BY PLATFORM, TYPE, SUITE
    for suite in SUITES:
        domain = {"and": [
            {"prefix": {"run.suite": suite}},
            {"gt": {"build.date": (today - 3 * DAY).unix}},
            {"lt": {"build.date": (today + 4 * DAY).unix}},
            {"exists": "build.platform"},
            {"not": {"in": {"build.platform": EXCLUDE_PLATFORMS}}},
            {"not": {"in": {"build.branch": EXCLUDE_BRANCHES}}}
        ]}
        if debug_filter:
            domain['and'].append(debug_filter)

        _ = convert.value2json("\"\"")

        # WE CAN NOT PULL ALL TESTS, THERE ARE TOO MANY, SO DO ONE SUITE AT A TIME
        Log.note("Get summary of failures in {{suite}} for date {{date}}", suite=suite, date=today)
        suite_summary = http.post_json(config.source.url, json={
            "from": "unittest",
            "groupby": [
                {"name": "test", "value": "result.test"}
            ],
            "where": {"and": [
                domain,
                {"eq": {"result.ok": False}}
            ]},
            "format": "list",
            "limit": 100000
        })

        often_fail = jx.filter(suite_summary.data, {"gt": {"count": 1}})

        for g, tests in jx.groupby(often_fail, size=100):
            tests = wrap(tests)
            if please_stop:
                return

            Log.note("Collect stats on {{num}} tests", num=len(tests))
            tests_summary = http.post_json(config.source.url, json={
                "from": "unittest",
                "groupby": [
                    "run.suite",
                    {"name": "test", "value": "result.test"},
                    "build.platform",
                    "build.product",
                    "build.type",
                    "run.type"
                ],
                "select": [
                    {
                        "name": "date_fails",
                        "value": {
                            "mult": [
                                {"div": [{"sub": {"build.date": today + 0.5 * DAY}}, DAY.seconds]},
                                {"when": "result.ok", "then": 0, "else": 1}
                            ]
                        },
                        "aggregate": "stats"
                    },
                    {
                        "name": "date",
                        "value": {"div": [{"sub": {"build.date": today + 0.5 * DAY}}, DAY.seconds]},
                        "aggregate": "stats"
                    },
                    {
                        "name": "fails",
                        "value": {"when": "result.ok", "then": 0, "else": 1},
                        "aggregate": "stats"
                    }
                ],
                "where": {"and": [
                    domain,
                    {"in": {"result.test": tests}}
                ]},
                "format": "list",
                "limit": 100000
            })

            # FOR EACH TEST, CALCULATE THE "RECENTLY BAD" STATISTIC (linear regression slope)
            # THIS IS ONLY A ROUGH CALC FOR TESTING THE UI
            for t in tests_summary.data:
                try:
                    t._id = "-".join([
                        coalesce(t.build.product, ""),
                        t.build.platform,
                        coalesce(t.build.type, ""),
                        coalesce(t.run.type, ""),
                        t.run.suite,
                        t.test,
                        unicode(today.unix)
                    ])
                except Exception as e:
                    Log.error("text join problem", cause=e)
                t.timestamp = today
                t.average = t.fails.avg
                if t.date.var == 0:
                    t.slope = 0
                else:
                    # slope = cov(date, fails) / var(date)
                    t.slope = (t.date_fails.avg - t.date.avg * t.fails.avg) / t.date.var
                t.etl.timestamp = Date.now()

            # PUSH STATS TO ES
            docs = [{"id": t._id, "value": t} for t in tests_summary.data if t.fails.sum > 0]
            Log.note("Adding {{num}} test summaries", num=len(docs))
            destination.extend(docs)
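# WORKED EXAMPLE (illustration only): the slope above is the least-squares
# slope of fails against date, cov(date, fails) / var(date), computed from
# the aggregate stats. The numbers below are made up.
dates = [0.0, 1.0, 2.0, 3.0]  # build dates, in days relative to `today`
fails = [0, 0, 1, 1]          # 1 = a failed run on that date

n = float(len(dates))
date_avg = sum(dates) / n
fails_avg = sum(fails) / n
date_fails_avg = sum(d * f for d, f in zip(dates, fails)) / n
date_var = sum((d - date_avg) ** 2 for d in dates) / n

slope = (date_fails_avg - date_avg * fails_avg) / date_var
assert abs(slope - 0.4) < 1e-9  # positive slope: failures are trending up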
def groupby(self, keys):
    return jx.groupby(self.__iter__(), keys)
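# BEHAVIOR SKETCH (assumed semantics, not pyLibrary's implementation):
# jx.groupby is used two ways in this codebase. With `keys` it yields
# (group, rows) pairs; with `size=N` it yields (batch_index, batch) pairs
# of at most N items, as in the batching loops above. A rough pure-Python
# analogue of the size form:
def groupby_size(values, size):
    batch_index = 0
    batch = []
    for v in values:
        batch.append(v)
        if len(batch) == size:
            yield batch_index, batch
            batch_index += 1
            batch = []
    if batch:
        yield batch_index, batch

assert list(groupby_size("abcde", 2)) == [(0, ["a", "b"]), (1, ["c", "d"]), (2, ["e"])]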
def process_batch(todo, coverage_index, coverage_summary_index, settings, please_stop):
    for not_summarized in todo:
        if please_stop:
            return True

        # IS THERE MORE THAN ONE COVERAGE FILE FOR THIS REVISION?
        Log.note("Find dups for file {{file}}", file=not_summarized.source.file.name)
        dups = http.post_json(settings.url, json={
            "from": "coverage",
            "select": [
                {"name": "max_id", "value": "etl.source.id", "aggregate": "max"},
                {"name": "min_id", "value": "etl.source.id", "aggregate": "min"}
            ],
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url"
            ],
            "limit": 100000,
            "format": "list"
        })

        dups_found = False
        for d in dups.data:
            if d.max_id != d.min_id:
                dups_found = True

                Log.note(
                    "removing dups {{details|json}}\n{{dups|json|indent}}",
                    details={
                        "id": int(d.max_id),
                        "test": d.test.url,
                        "source": not_summarized.source.file.name,
                        "revision": not_summarized.build.revision12
                    }
                )

                # FIND ALL INDEXES
                all_indexes = [
                    p.index
                    for p in coverage_index.cluster.get_aliases()
                    if p.alias == coverage_index.settings.alias
                ]
                for index_name in all_indexes:
                    elasticsearch.Index(index=index_name, read_only=False, cluster=coverage_index.cluster).delete_record({"and": [
                        {"not": {"term": {"etl.source.id": int(d.max_id)}}},
                        {"term": {"test.url": d.test.url}},
                        {"term": {"source.file.name": not_summarized.source.file.name}},
                        {"term": {"build.revision12": not_summarized.build.revision12}}
                    ]})
        if dups_found:
            continue

        # LIST ALL TESTS THAT COVER THIS FILE, AND THE LINES COVERED
        test_count = http.post_json(settings.url, json={
            "from": "coverage.source.file.covered",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url",
                "line"
            ],
            "limit": 100000,
            "format": "list"
        })

        all_tests_covering_file = UNION(test_count.data.get("test.url"))
        num_tests = len(all_tests_covering_file)
        max_siblings = num_tests - 1
        Log.note(
            "{{filename}} rev {{revision}} is covered by {{num}} tests",
            filename=not_summarized.source.file.name,
            num=num_tests,
            revision=not_summarized.build.revision12
        )
        line_summary = list(
            (k, unwrap(wrap(list(v)).get("test.url")))
            for k, v in jx.groupby(test_count.data, keys="line")
        )

        # PULL THE RAW RECORD FOR MODIFICATION
        file_level_coverage_records = http.post_json(settings.url, json={
            "from": "coverage",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"in": {"test.url": all_tests_covering_file}},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }}
            ]},
            "limit": 100000,
            "format": "list"
        })

        for test_name in all_tests_covering_file:
            siblings = [len(test_names) - 1 for g, test_names in line_summary if test_name in test_names]
            min_siblings = MIN(siblings)
            coverage_candidates = jx.filter(file_level_coverage_records.data, lambda row, rownum, rows: row.test.url == test_name)
            if coverage_candidates:
                if len(coverage_candidates) > 1 and any(coverage_candidates[0]._id != c._id for c in coverage_candidates):
                    Log.warning(
                        "Duplicate coverage\n{{cov|json|indent}}",
                        cov=[{"_id": c._id, "run": c.run, "test": c.test} for c in coverage_candidates]
                    )

                # MORE THAN ONE COVERAGE CANDIDATE CAN HAPPEN WHEN THE SAME TEST IS IN TWO DIFFERENT CHUNKS OF THE SAME SUITE
                for coverage_record in coverage_candidates:
                    coverage_record.source.file.max_test_siblings = max_siblings
                    coverage_record.source.file.min_line_siblings = min_siblings
                    coverage_record.source.file.score = (max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
            else:
                example = http.post_json(settings.url, json={
                    "from": "coverage",
                    "where": {"eq": {
                        "test.url": test_name,
                        "source.file.name": not_summarized.source.file.name,
                        "build.revision12": not_summarized.build.revision12
                    }},
                    "limit": 1,
                    "format": "list"
                })

                Log.warning(
                    "{{test|quote}} rev {{revision}} appears to have no coverage for {{file|quote}}!\n{{example|json|indent}}",
                    test=test_name,
                    file=not_summarized.source.file.name,
                    revision=not_summarized.build.revision12,
                    example=example.data[0]
                )

        bad_example = [d for d in file_level_coverage_records.data if d["source.file.min_line_siblings"] == None]
        if bad_example:
            Log.warning("expecting all records to have summary. Example:\n{{example}}", example=bad_example[0])

        rows = [{"id": d._id, "value": d} for d in file_level_coverage_records.data]
        coverage_summary_index.extend(rows)
        coverage_index.extend(rows)

        all_test_summary = []
        for g, records in jx.groupby(file_level_coverage_records.data, "source.file.name"):
            cov = UNION(records.source.file.covered)
            uncov = UNION(records.source.file.uncovered)
            coverage = {
                "_id": "|".join([records[0].build.revision12, g["source.file.name"]]),  # SOMETHING UNIQUE, IN CASE WE RECALCULATE
                "source": {
                    "file": {
                        "name": g["source.file.name"],
                        "is_file": True,
                        "covered": jx.sort(cov, "line"),
                        "uncovered": jx.sort(uncov),
                        "total_covered": len(cov),
                        "total_uncovered": len(uncov),
                        "min_line_siblings": 0  # PLACEHOLDER TO INDICATE DONE
                    }
                },
                "build": records[0].build,
                "repo": records[0].repo,
                "run": records[0].run,
                "etl": {"timestamp": Date.now()}
            }
            all_test_summary.append(coverage)

        sum_rows = [{"id": d["_id"], "value": d} for d in all_test_summary]
        coverage_summary_index.extend(sum_rows)

        if DEBUG:
            coverage_index.refresh()
            todo = http.post_json(settings.url, json={
                "from": "coverage",
                "where": {"and": [
                    {"missing": "source.method.name"},
                    {"missing": "source.file.min_line_siblings"},
                    {"eq": {"source.file.name": not_summarized.source.file.name}},
                    {"eq": {"build.revision12": not_summarized.build.revision12}}
                ]},
                "format": "list",
                "limit": 10
            })
            if todo.data:
                Log.error("Failure to update")
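# WORKED EXAMPLE (made-up counts): the score above rewards tests that cover
# rare lines. It is near 0 when this test's least-shared line is shared by
# every other test, and approaches 1 when it covers a line almost no one
# else does.
max_siblings = 9   # 10 tests cover the file
min_siblings = 1   # some line is covered by this test and only one other
score = float(max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
assert abs(score - 8.0 / 11.0) < 1e-9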
            locals()[k] = v
        for k, f in fixes.items():
            try:
                _source[k] = eval(f)
            except Exception as e:
                if "Problem pulling pushlog" in e:
                    pass
                elif "can not find branch" in e:
                    pass
                else:
                    Log.warning("not evaluated {{expression}}", expression=f, cause=e)
        return _source

    for g, docs in jx.groupby(pending_ids, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_docs}} documents", {"num_docs": len(docs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"terms": {"_id": set(docs)}}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })

            destination.extend([{"id": h._id, "value": fixer(h._source)} for h in data.hits.hits])

        if please_stop:
            break
if self.db.__module__.startswith("pymysql"):
    # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
    # https://github.com/PyMySQL/PyMySQL/issues/157
    for b in backlog:
        sql = self.preamble + b
        try:
            if self.debug:
                Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
            self.cursor.execute(b)
        except Exception as e:
            Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)
        self.cursor.close()
        self.cursor = self.db.cursor()
else:
    for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
        sql = self.preamble + ";\n".join(g)
        try:
            if self.debug:
                Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
            self.cursor.execute(sql)
            self.cursor.close()
            self.cursor = self.db.cursor()
        except Exception as e:
            Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

## Insert dictionary of values into table
def insert(self, table_name, record):
    keys = record.keys()
def _get_job_results_from_th(self, branch, revision):
    output = []

    with self.locker:
        waiting_threads = self.pending.get((branch, revision))
        if waiting_threads is None:
            sig = None
            waiting_threads = self.pending[(branch, revision)] = [output]
        else:
            sig = Signal()
            waiting_threads.append(sig)  # REGISTER THE SIGNAL WE WILL WAIT ON (was append(Signal()), which deadlocked)

    if sig is not None:
        Log.note("Holding thread for {{branch}}/{{revision}}", branch=branch, revision=revision)
        sig.wait_for_go()
        return waiting_threads[0]

    try:
        results = DictList()
        while True:
            response = self._rate_limited_get_json(expand_template(RESULT_SET_URL, {"branch": branch, "revision": revision[0:12]}))
            results.extend(response.results)
            if len(response.results) != 1000:
                break

        for g, repo_ids in jx.groupby(results.id, size=10):
            jobs = DictList()
            with Timer("Get {{num}} jobs", {"num": len(repo_ids)}, debug=DEBUG):
                while True:
                    response = self._rate_limited_get_json(expand_template(JOBS_URL, {"branch": branch, "offset": len(jobs), "result_set_id": ",".join(map(unicode, repo_ids))}))
                    jobs.extend(response.results)
                    if len(response.results) != 2000:
                        break

            with Timer("Get (up to {{num}}) details from TH", {"num": len(jobs)}, debug=DEBUG):
                details = []
                for _, ids in jx.groupby(jobs.id, size=40):
                    details.extend(self._rate_limited_get_json(
                        url=expand_template(DETAILS_URL, {"branch": branch, "job_id": ",".join(map(unicode, ids))}),
                        retry={"times": 3}
                    ).results)
                details = {k.job_guid: list(v) for k, v in jx.groupby(details, "job_guid")}

            with Timer("Get (up to {{num}}) stars from TH", {"num": len(jobs)}, debug=DEBUG):
                stars = []
                for _, ids in jx.groupby(jobs.id, size=40):
                    response = self._rate_limited_get_json(expand_template(JOB_BUG_MAP, {"branch": branch, "job_id": "&job_id=".join(map(unicode, ids))}))
                    stars.extend(response)
                stars = {k.job_id: list(v) for k, v in jx.groupby(stars, "job_id")}

            with Timer("Get notes from TH", debug=DEBUG):
                notes = []
                for jid in set([j.id for j in jobs if j.failure_classification_id != 1] + stars.keys()):
                    response = self._rate_limited_get_json(expand_template(NOTES_URL, {"branch": branch, "job_id": unicode(jid)}))
                    notes.extend(response)
                notes = {k.job_id: list(v) for k, v in jx.groupby(notes, "job_id")}

            for j in jobs:
                output.append(self._normalize_job_result(branch, revision, j, details, notes, stars))

        if output:
            with Timer("Write to ES cache", debug=DEBUG):
                self.cache.extend({"id": "-".join([c.repo.branch, unicode(c.job.id)]), "value": c} for c in output)
                try:
                    self.cache.flush()
                except Exception as e:
                    Log.warning("problem flushing. nevermind.", cause=e)
    finally:
        with self.locker:
            for p in waiting_threads[1:]:
                if DEBUG:
                    Log.note("releasing thread for {{branch}}/{{revision}}", branch=branch, revision=revision)
                p.go()
            self.pending[(branch, revision)] = None

    return output
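# PATTERN SKETCH (standard-library analogue, not the code above): the locker
# dance in _get_job_results_from_th coalesces concurrent requests for the
# same (branch, revision): the first caller fetches, later callers park on a
# signal and share the first caller's output. threading.Event stands in for
# Signal; fetch is any callable the caller supplies.
import threading

pending = {}  # key -> [shared_output, event, event, ...]
locker = threading.Lock()

def get_coalesced(key, fetch):
    with locker:
        waiters = pending.get(key)
        if waiters is None:
            sig = None
            waiters = pending[key] = [[]]  # first entry is the shared output
        else:
            sig = threading.Event()
            waiters.append(sig)

    if sig is not None:
        sig.wait()  # some other thread is already fetching this key
        return waiters[0]

    try:
        waiters[0].extend(fetch(key))  # only the first caller does the work
    finally:
        with locker:
            for w in waiters[1:]:
                w.set()  # release the waiting threads
            pending[key] = None

    return waiters[0]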
def _load_all_in_push(self, revision, locale=None):
    # http://hg.mozilla.org/mozilla-central/json-pushes?full=1&changeset=57c461500a0c
    found_revision = copy(revision)
    if isinstance(found_revision.branch, basestring):
        lower_name = found_revision.branch.lower()
    else:
        lower_name = found_revision.branch.name.lower()

    if not lower_name:
        Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

    b = found_revision.branch = self.branches[(lower_name, locale)]
    if not b:
        b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
        if not b:
            Log.error("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)

    if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
        self.branches = _hg_branches.get_branches(use_cache=True, settings=self.settings)

    url = found_revision.branch.url.rstrip("/") + "/json-pushes?full=1&changeset=" + found_revision.changeset.id
    Log.note(
        "Reading pushlog for revision ({{branch}}, {{locale}}, {{changeset}}): {{url}}",
        branch=found_revision.branch.name,
        locale=locale,
        changeset=found_revision.changeset.id,
        url=url,
    )

    try:
        data = self._get_and_retry(url, found_revision.branch)

        revs = []
        output = None
        for index, _push in data.items():
            push = Push(id=int(index), date=_push.date, user=_push.user)

            for _, ids in jx.groupby(_push.changesets.node, size=200):
                url_param = "&".join("node=" + c[0:12] for c in ids)

                url = found_revision.branch.url.rstrip("/") + "/json-info?" + url_param
                Log.note("Reading details from {{url}}", {"url": url})

                raw_revs = self._get_and_retry(url, found_revision.branch)
                for r in raw_revs.values():
                    rev = Revision(
                        branch=found_revision.branch,
                        index=r.rev,
                        changeset=Changeset(
                            id=r.node,
                            id12=r.node[0:12],
                            author=r.user,
                            description=r.description,
                            date=Date(r.date),
                            files=r.files,
                        ),
                        parents=unwraplist(r.parents),
                        children=unwraplist(r.children),
                        push=push,
                        etl={"timestamp": Date.now().unix},
                    )
                    if r.node == found_revision.changeset.id:
                        output = rev
                    if r.node[0:12] == found_revision.changeset.id[0:12]:
                        output = rev
                    _id = (
                        coalesce(rev.changeset.id12, "")
                        + "-" + rev.branch.name
                        + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
                    )
                    revs.append({"id": _id, "value": rev})
        self.es.extend(revs)
        return output
    except Exception as e:
        Log.error("Problem pulling pushlog from {{url}}", url=url, cause=e)
def _update_cardinality(self, c):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if c.type in STRUCT:
        Log.error("not supported")
    try:
        if c.table == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "es_column": c.es_column}}
                })
            return
        if c.table == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return

        es_index = c.table.split(".")[0]
        result = self.default_es.post("/" + es_index + "/_search", data={
            "aggs": {c.name: _counting_query(c)},
            "size": 0
        })
        r = result.aggregations.values()[0]
        count = result.hits.total
        cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None)
        if cardinality == None:
            Log.error("logic error")

        query = Dict(size=0)
        if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            Log.note("{{table}}.{{field}} has {{num}} parts", table=c.table, field=c.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.nested_path:
            query.aggs[literal_field(c.name)] = {
                "nested": {"path": listwrap(c.nested_path)[0]},
                "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
            }
        else:
            query.aggs[literal_field(c.name)] = {"terms": {"field": c.es_column, "size": 0}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations.values()[0]
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
    except Exception as e:
        if "IndexMissingException" in e and c.table.startswith(TEST_TABLE_PREFIX):
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": 0,
                        "cardinality": 0,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"table": c.table, "es_column": c.es_column}}
            })
        Log.warning("Could not get {{col.table}}.{{col.es_column}} info", col=c, cause=e)
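# HEURISTIC, ISOLATED (thresholds copied from the code above): decide when a
# column has too many distinct values to be worth materializing as
# partitions. float() guards against Python 2 integer division, which the
# inline expression above avoids only when its operands are already floats.
def too_many_parts(count, cardinality):
    if cardinality > 1000:
        return True  # plainly high-cardinality
    if count >= 30 and cardinality == count:
        return True  # every value distinct: an id-like column
    if count >= 1000 and float(cardinality) / count > 0.99:
        return True  # nearly every value distinct
    return False

assert too_many_parts(50, 7) is False      # few parts: worth listing them
assert too_many_parts(5000, 4996) is True  # effectively an identifier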