def test_extract_job(complex_job, extract_job_settings):
    """
    If you find this test failing, then copy the JSON in the test failure into the
    test_extract_job.json file, then you may use the diff to review the changes.
    """
    with MySQL(extract_job_settings.source.database) as source:
        with MySqlSnowflakeExtractor(extract_job_settings.source) as extractor:
            sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

            acc = []
            with source.transaction():
                cursor = list(source.query(sql, stream=True, row_tuples=True))
                extractor.construct_docs(cursor, acc.append, False)

    doc = first(acc)
    doc.guid = first(JOB).guid  # NEW EACH TIME

    job_guid = first(jx.drill(JOB, "job_log.failure_line.job_guid"))
    for fl in jx.drill(doc, "job_log.failure_line"):
        fl.job_guid = job_guid

    assertAlmostEqual(
        acc,
        JOB,
        places=4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )

def test_extract_alert(extract_alert_settings, test_perf_alert_summary, test_perf_alert):
    """
    If you find this test failing, then copy the JSON in the test failure into the
    test_extract_alerts.json file, then you may use the diff to review the changes.
    """
    now = datetime.datetime.now()
    source = MySQL(extract_alert_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_alert_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(test_perf_alert_summary.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    # TESTS ARE RUN WITH CURRENT TIMESTAMPS
    doc.created = now
    doc.last_updated = now
    for d in doc.details:
        d.created = now
        d.last_updated = now
        d.series_signature.last_updated = now

    assertAlmostEqual(
        acc, ALERT, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603

def add(self, aggs, acc, query, decoders, selects):
    self.result = format_table_from_groupby(aggs, acc, query, decoders, selects)
    # CONFIRM HEADER MATCH
    if self.header:
        assertAlmostEqual(self.header, self.result.header)
    else:
        self.header = self.result.header

def parse_sql(sql):
    query = wrap(moz_sql_parser.parse(sql))

    # PULL OUT THE AGGREGATES
    for s in listwrap(query.select):
        val = s if s == "*" else s.value

        # LOOK FOR GROUPBY COLUMN IN SELECT CLAUSE, REMOVE DUPLICATION
        for g in listwrap(query.groupby):
            try:
                assertAlmostEqual(g.value, val, "")
                g.name = s.name
                s.value = None  # MARK FOR REMOVAL
                break
            except Exception:
                pass

        if isinstance(val, Mapping):
            for a in KNOWN_SQL_AGGREGATES:
                if val[a]:
                    s.aggregate = a
                    s.value = val[a]

    query.select = [s for s in listwrap(query.select) if s == "*" or s.value != None]

    query.format = "table"
    return query

def compare_to_expected(query, result, expect, places):
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        try:
            assertAlmostEqual(set(result.header), set(expect.header))
        except Exception as e:
            Log.error("format=table headers do not match", cause=e)

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = transpose(*transpose(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        ))[1])[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = transpose(*unwrap(result.data))
            result.data = transpose(*(columns[m] for m in mapping))

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(
                    set(jx.get_columns(result.data, leaves=True))
                    | set(jx.get_columns(expect.data, leaves=True)),
                    "name"
                )
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if is_list(expect.data):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception:
                    pass

            if is_list(result.data):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception:
                    pass
    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        result_header = map(literal_field, result_header)
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_header = map(literal_field, expect_header)
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=places)

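# An illustrative sketch (assumed toy data, not from the original source) of the
# header-mapping trick above: pair every expected column with the result column
# of the same name, keeping the result-side indices, which yields the permutation
# needed to reorder result columns into the expected order.
import itertools

expect_header = ["a", "b"]
result_header = ["b", "a"]
pairs = [
    (e, r)
    for e, r in itertools.product(enumerate(expect_header), enumerate(result_header))
    if e[1] == r[1]
]  # [((0, 'a'), (1, 'a')), ((1, 'b'), (0, 'b'))]
mapping = [r[0] for _, r in pairs]  # [1, 0]
assert [result_header[m] for m in mapping] == expect_header
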
def compare_to_expected(query, result, expect):
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        assertAlmostEqual(set(result.header), set(expect.header))

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = list(zip(*list(zip(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(result.header))
        )))[1]))[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = list(zip(*unwrap(result.data)))
            result.data = zip(*[columns[m] for m in mapping])

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if query["from"].startswith("meta."):
            pass
        else:
            query = QueryOp.wrap(query, query.frum, query.schema)

        if not query.sort:
            try:
                # result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(
                    set(jx.get_columns(result.data, leaves=True))
                    | set(jx.get_columns(expect.data, leaves=True)),
                    "name"
                )
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges, query.groupby)) + data_columns

            if isinstance(expect.data, list):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception:
                    pass
    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=6)

def allclose(a, b):
    try:
        from mo_testing.fuzzytestcase import assertAlmostEqual
        assertAlmostEqual(a, b)
        return True
    except Exception:
        return False

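# A minimal usage sketch (assumed inputs, not from the original source):
# allclose() turns the raising assertAlmostEqual() into a boolean predicate,
# so nested structures compare recursively and mismatches return False rather
# than raising. The exact float tolerance is assertAlmostEqual's default,
# which is assumed here.
assert allclose({"x": [1, 2, 3]}, {"x": [1, 2, 3]})
assert not allclose({"x": 1}, {"x": 2})
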
def parse_sql(sql):
    # TODO: CONVERT tuple OF LITERALS INTO LITERAL LIST
    # # IF ALL MEMBERS OF A LIST ARE LITERALS, THEN MAKE THE LIST LITERAL
    # if all(isinstance(r, number_types) for r in output):
    #     pass
    # elif all(isinstance(r, number_types) or (is_data(r) and "literal" in r.keys()) for r in output):
    #     output = {"literal": [r['literal'] if is_data(r) else r for r in output]}
    query = wrap(moz_sql_parser.parse(sql))
    redundant_select = []

    # PULL OUT THE AGGREGATES
    for s in listwrap(query.select):
        val = s if s == '*' else s.value

        # EXTRACT KNOWN AGGREGATE FUNCTIONS
        if is_data(val):
            for a in KNOWN_SQL_AGGREGATES:
                value = val[a]
                if value != None:
                    if is_list(value):
                        # AGGREGATE WITH PARAMETERS EG percentile(value, 0.90)
                        s.aggregate = a
                        s[a] = unwraplist(value[1::])
                        s.value = value[0]
                    else:
                        # SIMPLE AGGREGATE
                        s.aggregate = a
                        s.value = value
                    break

        # LOOK FOR GROUPBY COLUMN IN SELECT CLAUSE, REMOVE DUPLICATION
        for g in listwrap(query.groupby):
            try:
                assertAlmostEqual(g.value, val, "")
                g.name = s.name
                redundant_select.append(s)
                break
            except Exception:
                pass

    # REMOVE THE REDUNDANT select
    if is_list(query.select):
        for r in redundant_select:
            query.select.remove(r)
    elif query.select and redundant_select:
        query.select = None

    # RENAME orderby TO sort
    query.sort, query.orderby = query.orderby, None

    query.format = "table"
    return query

def parse_sql(sql):
    query = wrap(moz_sql_parser.parse(sql))
    query.select = listwrap(query.select)
    redundant_select = []

    # PULL OUT THE AGGREGATES
    for s in query.select:
        val = s if s == '*' else s.value

        # EXTRACT KNOWN AGGREGATE FUNCTIONS
        if isinstance(val, Mapping):
            for a in KNOWN_SQL_AGGREGATES:
                value = val[a]
                if value != None:
                    s.aggregate = a
                    if isinstance(value, list):
                        # AGGREGATE WITH PARAMETERS EG percentile(value, 0.90)
                        s[a] = unwraplist(value[1::])
                        s.value = value[0]
                    elif isinstance(value, Mapping):
                        # EXPRESSION
                        if len(value.keys()) == 0:
                            s.value = None
                        else:
                            s.value = value
                    else:
                        # SIMPLE VALUE
                        s.value = value
                    break

        # LOOK FOR GROUPBY COLUMN IN SELECT CLAUSE, REMOVE DUPLICATION
        for g in listwrap(query.groupby):
            try:
                assertAlmostEqual(g.value, val, "")
                g.name = s.name
                redundant_select.append(s)
                break
            except Exception:
                pass

    # REMOVE THE REDUNDANT select
    for r in redundant_select:
        query.select.remove(r)

    # RENAME orderby TO sort
    query.sort, query.orderby = query.orderby, None

    query.format = "table"
    return query

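# A hedged usage sketch (assumed table/column names; the exact output shape
# depends on the moz_sql_parser version): parse_sql() normalizes the raw parse
# tree into a jx-style query with explicit aggregates, a deduplicated select
# clause, and ORDER BY renamed to sort.
q = parse_sql("SELECT name, count(id) AS n FROM task GROUP BY name ORDER BY name")
# EXPECTED, APPROXIMATELY:
#   q.select  -> [{"name": "n", "value": "id", "aggregate": "count"}]
#   q.groupby -> [{"value": "name"}]
#   q.sort    -> {"value": "name"}
#   q.format  -> "table"
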
def fix(rownum, line, source, sample_only_filter, sample_size):
    value = json2value(line)

    if value._id.startswith(("tc.97", "96", "bb.27")):
        # AUG 24, 25 2017 - included full diff with repo; too big to index
        try:
            data = json2value(line)
            repo = data.repo
            repo.etl = None
            repo.branch.last_used = None
            repo.branch.description = None
            repo.branch.etl = None
            repo.branch.parent_name = None
            repo.children = None
            repo.parents = None
            if repo.changeset.diff or data.build.repo.changeset.diff:
                Log.error("no diff allowed")
            else:
                assertAlmostEqual(minimize_repo(repo), repo)
        except Exception as e:
            if CAN_NOT_DECODE_JSON in e:
                raise e

            data.repo = minimize_repo(repo)
            data.build.repo = minimize_repo(data.build.repo)
            line = value2json(data)

    if rownum == 0:
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        value = _fix(value)
        if sample_only_filter and Random.int(int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            row = {"value": value}
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        _shorten(value, source)
        value = _fix(value)
    elif line.find('"resource_usage":') != -1:
        value = _fix(value)

    row = {"value": value}
    return row, False

def test_extract_job(complex_job, extract_job_settings, now):
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid
    doc.last_modified = complex_job.last_modified

    assertAlmostEqual(
        acc, JOB, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603

def __eq__(self, other):
    if other == None:
        if self._json == "null":
            return True
        else:
            return False
    elif self._json == "null":
        return False

    Log.warning("expensive")
    from mo_testing.fuzzytestcase import assertAlmostEqual
    try:
        assertAlmostEqual(json2value(self._json), other)
        return True
    except Exception:
        return False

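# An illustrative sketch (hypothetical wrapper name, not from the original
# source): the __eq__ above gives a lazily-serialized JSON value structural
# equality against plain Python data, deferring the json2value() parse until
# a comparison is actually requested.
#
#   LazyJson('{"a": 1}') == {"a": 1}   # True: parses, then fuzzy-compares
#   LazyJson("null") == None           # True: handled without parsing
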
def test_extract_alert(extract_alert_settings, test_perf_alert_summary, test_perf_alert):
    """
    If you find this test failing, then copy the JSON in the test failure into the
    test_extract_alerts.json file, then you may use the diff to review the changes.
    """
    with MySQL(extract_alert_settings.source.database) as source:
        with MySqlSnowflakeExtractor(extract_alert_settings.source) as extractor:
            sql = extractor.get_sql(SQL("SELECT " + text(test_perf_alert_summary.id) + " as id"))

            acc = []
            with source.transaction():
                cursor = list(source.query(sql, stream=True, row_tuples=True))
                extractor.construct_docs(cursor, acc.append, False)

    assertAlmostEqual(
        acc, ALERT, places=3
    )  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603

def test_extract_job(complex_job, extract_job_settings, now):
    """
    If you find this test failing, then copy the JSON in the test failure into the
    test_extract_job.json file, then you may use the diff to review the changes.
    """
    source = MySQL(extract_job_settings.source.database)
    extractor = MySqlSnowflakeExtractor(extract_job_settings.source)
    sql = extractor.get_sql(SQL("SELECT " + text(complex_job.id) + " as id"))

    acc = []
    with source.transaction():
        cursor = list(source.query(sql, stream=True, row_tuples=True))
        extractor.construct_docs(cursor, acc.append, False)

    doc = acc[0]
    doc.guid = complex_job.guid

    assertAlmostEqual(
        acc,
        JOB,
        places=4,  # TH MIXES LOCAL TIMEZONE WITH GMT: https://bugzilla.mozilla.org/show_bug.cgi?id=1612603
    )

def write_lines(self, key, lines):
    self._verify_key_format(key)
    storage = self.bucket.new_key(str(key + ".json.gz"))

    if VERIFY_UPLOAD:
        lines = list(lines)

    with mo_files.TempFile() as tempfile:
        with open(tempfile.abspath, "wb") as buff:
            DEBUG and Log.note("Temp file {{filename}}", filename=tempfile.abspath)
            archive = gzip.GzipFile(filename=str(key + ".json"), fileobj=buff, mode="w")
            count = 0
            for l in lines:
                if is_many(l):
                    for ll in l:
                        archive.write(ll.encode("utf8"))
                        archive.write(b"\n")
                        count += 1
                else:
                    archive.write(l.encode("utf8"))
                    archive.write(b"\n")
                    count += 1
            archive.close()

        retry = 3
        while retry:
            try:
                with Timer(
                    "Sending {{count}} lines in {{file_length|comma}} bytes for {{key}}",
                    {"key": key, "file_length": tempfile.length, "count": count},
                    verbose=self.settings.debug,
                ):
                    storage.set_contents_from_filename(
                        tempfile.abspath, headers={"Content-Type": mimetype.GZIP}
                    )
                break
            except Exception as e:
                e = Except.wrap(e)
                retry -= 1
                if retry == 0 or "Access Denied" in e or "No space left on device" in e:
                    Log.error("could not push data to s3", cause=e)
                else:
                    Log.warning("could not push data to s3, will retry", cause=e)

        if self.settings.public:
            storage.set_acl("public-read")

        if VERIFY_UPLOAD:
            try:
                with open(tempfile.abspath, mode="rb") as source:
                    result = list(ibytes2ilines(scompressed2ibytes(source)))
                    assertAlmostEqual(result, lines, msg="file is different")

                # full_url = "https://"+self.name+".s3-us-west-2.amazonaws.com/"+storage.key.replace(":", "%3A")
                # https://active-data-test-result.s3-us-west-2.amazonaws.com/tc.1524896%3A152488763.0.json.gz
                # dest_bucket = s3.MultiBucket(bucket="self.name", kwargs=self.settings.aws)
                result = list(self.read_lines(strip_extension(key)))
                assertAlmostEqual(result, lines, result, msg="S3 is different")
            except Exception as e:
                from activedata_etl.transforms import TRY_AGAIN_LATER
                Log.error(TRY_AGAIN_LATER, reason="did not pass verification", cause=e)
    return

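# A minimal stdlib sketch of the same write-then-verify pattern (assumed helper
# name; not the original mo_files/boto machinery): write newline-delimited text
# through gzip, then decompress the same file and compare, so corruption is
# caught before anything is uploaded.
import gzip

def gzip_lines_and_verify(path, lines):
    lines = list(lines)
    with gzip.open(path, "wb") as archive:
        for line in lines:
            archive.write(line.encode("utf8"))
            archive.write(b"\n")
    with gzip.open(path, "rb") as source:
        readback = [l.decode("utf8") for l in source.read().splitlines()]
    if readback != lines:
        raise ValueError("file is different")
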
            try:
                result.data = jx.sort(result.data, sort_order.name)
            except Exception:
                pass
    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=places)


def cube2list(cube):
    """
    RETURNS header SO THAT THE ORIGINAL CUBE CAN BE RECREATED
    :param cube: A dict WITH VALUES BEING A MULTIDIMENSIONAL ARRAY OF UNIFORM VALUES
    :return: (rows, header) TUPLE
    """
    header = list(unwrap(cube).keys())
    rows = []
    for r in zip(*[[(k, v) for v in a] for k, a in cube.items()]):
        row = Data()
        for k, v in r:
            row[k] = v
        rows.append(unwrap(row))
    return rows, header

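# A small worked example (assumed data, not from the original source):
# cube2list() pivots a column-oriented cube into row objects plus the header
# needed to rebuild the cube with list2cube().
rows, header = cube2list({"a": [1, 2], "b": ["x", "y"]})
assert rows == [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]
assert header == ["a", "b"]
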