def _get_queue(self, row):
    row = wrap(row)
    if row.json:
        row.value, row.json = convert.json2value(row.json), None
    timestamp = Date(self.rollover_field(row.value))
    if timestamp == None or timestamp < Date.today() - self.rollover_max:
        # EXPIRED, OR NO TIMESTAMP; DO NOT QUEUE
        return Null

    rounded_timestamp = timestamp.floor(self.rollover_interval)
    queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        # FIND AN EXISTING INDEX FOR THIS TIME PERIOD, OR MAKE A NEW ONE
        candidates = jx.run({
            "from": self.cluster.get_aliases(),
            "where": {"regex": {"index": self.settings.index + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
            "sort": "index"
        })
        best = None
        for c in candidates:
            c = wrap(c)
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            if rounded_timestamp < wrap(candidates[-1]).date:
                es = elasticsearch.Index(read_only=False, alias=best.alias, index=best.index, settings=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, settings=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = elasticsearch.Index(read_only=False, alias=best.alias, index=best.index, settings=self.settings)

        with suppress_exception:
            es.set_refresh_interval(seconds=60 * 10, timeout=5)

        self._delete_old_indexes(candidates)
        queue = self.known_queues[rounded_timestamp.unix] = es.threaded_queue(
            max_size=self.settings.queue_size,
            batch_size=self.settings.batch_size,
            silent=True
        )
    return queue
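# Hedged usage sketch, not from the original source: how a caller might push a
# document through _get_queue. The method name `add` is an assumption; only the
# behavior of _get_queue above is relied on (it returns Null for rows that are
# too old, otherwise a threaded queue keyed by rollover period).
def add(self, doc):
    queue = self._get_queue(doc)
    if queue == None:
        return  # TOO OLD, OR NO MATCHING ROLLOVER INDEX; SILENTLY DROP
    queue.add(doc)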
def get_all_in_es(es):
    in_es = set()

    all_indexes = es.es.cluster.get_metadata().indices
    for name, index in all_indexes.items():
        if "unittest" not in index.aliases:
            continue

        result = elasticsearch.Index(index=name, alias="unittest", settings=es.es.settings).search({
            "aggs": {
                "_match": {
                    "terms": {
                        "field": "etl.source.source.id",
                        "size": 200000
                    }
                }
            }
        })

        good_es = []
        for k in result.aggregations._match.buckets.key:
            try:
                good_es.append(int(k))
            except Exception:
                pass  # IGNORE NON-NUMERIC KEYS

        Log.note("got {{num}} from {{index}}", num=len(good_es), index=name)
        in_es |= set(good_es)

    return in_es
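# A minimal sketch (assumption, not in the original) showing how get_all_in_es
# might be used to find records that never made it into the "unittest" indexes.
# The helper name and the `expected_ids` parameter are hypothetical.
def report_missing_from_es(es, expected_ids):
    in_es = get_all_in_es(es)
    missing = set(expected_ids) - in_es
    Log.note("{{num}} expected ids are missing from ES", num=len(missing))
    return missing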
def test_save_then_load(self):
    test = {
        "data": [{"a": "b"}],
        "query": {
            "meta": {"save": True},
            "from": TEST_TABLE,
            "select": "a"
        },
        "expecting_list": {
            "meta": {"format": "list"},
            "data": ["b"]
        }
    }

    settings = self.utils.fill_container(test)

    bytes = unicode2utf8(value2json({
        "from": settings.index,
        "select": "a",
        "format": "list"
    }))
    expected_hash = convert.bytes2base64(hashlib.sha1(bytes).digest()[0:6]).replace("/", "_")
    wrap(test).expecting_list.meta.saved_as = expected_hash

    self.utils.send_queries(test)

    # ENSURE THE QUERY HAS BEEN INDEXED
    Log.note("Flush saved query (with hash {{hash}})", hash=expected_hash)
    container = elasticsearch.Index(index="saved_queries", type=save_query.DATA_TYPE, kwargs=settings)
    container.flush(forced=True)
    with Timer("wait for 5 seconds"):
        Till(seconds=5).wait()

    url = URL(self.utils.testing.query)
    response = self.utils.try_till_response(
        url.scheme + "://" + url.host + ":" + text_type(url.port) + "/find/" + expected_hash,
        data=b''
    )

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.all_content, bytes)
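# A sketch of the saved-query hash the test above expects: the first 6 bytes of
# the SHA1 of the serialized query, base64-encoded, with "/" replaced by "_" so
# the result is URL-safe. The helper name `query_hash` is an assumption; the
# test itself builds the hash inline with convert.bytes2base64.
import base64
import hashlib

def query_hash(query_bytes):
    return base64.b64encode(hashlib.sha1(query_bytes).digest()[0:6]).decode("ascii").replace("/", "_")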
def test_save_then_load(self):
    test = {
        "data": [{"a": "b"}],
        "query": {
            "meta": {"save": True},
            "from": TEST_TABLE,
            "select": "a"
        },
        "expecting_list": {
            "meta": {"format": "list"},
            "data": ["b"]
        }
    }

    settings = self.utils.fill_container(test)

    bytes = convert.unicode2utf8(convert.value2json({
        "from": settings.index,
        "select": "a",
        "format": "list"
    }))
    expected_hash = convert.bytes2base64(hashlib.sha1(bytes).digest()[0:6]).replace("/", "_")
    wrap(test).expecting_list.meta.saved_as = expected_hash

    self.utils.send_queries(test)

    # ENSURE THE QUERY HAS BEEN INDEXED
    container = elasticsearch.Index(index="saved_queries", settings=settings)
    container.flush()
    Thread.sleep(seconds=5)

    url = URL(self.utils.service_url)
    response = self.utils.try_till_response(
        url.scheme + "://" + url.host + ":" + unicode(url.port) + "/find/" + expected_hash,
        data=b''
    )

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.all_content, bytes)
def main():
    try:
        config = startup.read_settings(defs=[{
            "name": ["--file"],
            "help": "file to save backup",
            "type": str,
            "dest": "file",
            "required": True
        }])
        constants.set(config.constants)
        Log.start(config.debug)

        sq = elasticsearch.Index(settings=config.saved_queries)
        result = sq.search({"query": {"match_all": {}}, "size": 200000})

        # SERIALIZE EACH HIT TO A JSON STRING BEFORE WRITING
        File(config.args.file).write("".join(map(convert.value2json, result.hits.hits)))
    except Exception as e:
        Log.error("Problem with etl", cause=e)
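# Hedged sketch of the expected invocation; the __main__ guard and the flag
# values shown are assumptions (startup.read_settings also accepts the usual
# --settings flag for the configuration file):
#
#     python backup_saved_queries.py --settings=settings.json --file=saved_queries_backup.json
#
if __name__ == "__main__":
    main()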
def process_batch(todo, coverage_index, coverage_summary_index, settings, please_stop):
    for not_summarized in todo:
        if please_stop:
            return True

        # IS THERE MORE THAN ONE COVERAGE FILE FOR THIS REVISION?
        Log.note("Find dups for file {{file}}", file=not_summarized.source.file.name)
        dups = http.post_json(settings.url, json={
            "from": "coverage",
            "select": [
                {"name": "max_id", "value": "etl.source.id", "aggregate": "max"},
                {"name": "min_id", "value": "etl.source.id", "aggregate": "min"}
            ],
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url"
            ],
            "limit": 100000,
            "format": "list"
        })

        dups_found = False
        for d in dups.data:
            if d.max_id != d.min_id:
                dups_found = True
                Log.note(
                    "removing dups {{details|json}}",
                    details={
                        "id": int(d.max_id),
                        "test": d.test.url,
                        "source": not_summarized.source.file.name,
                        "revision": not_summarized.build.revision12
                    }
                )

                # FIND ALL INDEXES
                all_indexes = [
                    p.index
                    for p in coverage_index.cluster.get_aliases()
                    if p.alias == coverage_index.settings.alias
                ]
                for index_name in all_indexes:
                    elasticsearch.Index(index=index_name, read_only=False, cluster=coverage_index.cluster).delete_record({"and": [
                        {"not": {"term": {"etl.source.id": int(d.max_id)}}},
                        {"term": {"test.url": d.test.url}},
                        {"term": {"source.file.name": not_summarized.source.file.name}},
                        {"term": {"build.revision12": not_summarized.build.revision12}}
                    ]})
        if dups_found:
            continue

        # LIST ALL TESTS THAT COVER THIS FILE, AND THE LINES COVERED
        test_count = http.post_json(settings.url, json={
            "from": "coverage.source.file.covered",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url",
                "line"
            ],
            "limit": 100000,
            "format": "list"
        })

        all_tests_covering_file = UNION(test_count.data.get("test.url"))
        num_tests = len(all_tests_covering_file)
        max_siblings = num_tests - 1
        Log.note(
            "{{filename}} rev {{revision}} is covered by {{num}} tests",
            filename=not_summarized.source.file.name,
            num=num_tests,
            revision=not_summarized.build.revision12
        )
        line_summary = list(
            (k, unwrap(wrap(list(v)).get("test.url")))
            for k, v in jx.groupby(test_count.data, keys="line")
        )

        # PULL THE RAW RECORD FOR MODIFICATION
        file_level_coverage_records = http.post_json(settings.url, json={
            "from": "coverage",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"in": {"test.url": all_tests_covering_file}},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }}
            ]},
            "limit": 100000,
            "format": "list"
        })

        for test_name in all_tests_covering_file:
            siblings = [len(test_names) - 1 for g, test_names in line_summary if test_name in test_names]
            min_siblings = MIN(siblings)
            coverage_candidates = jx.filter(file_level_coverage_records.data, lambda row, rownum, rows: row.test.url == test_name)

            if coverage_candidates:
                if len(coverage_candidates) > 1 and any(coverage_candidates[0]._id != c._id for c in coverage_candidates):
                    Log.warning(
                        "Duplicate coverage\n{{cov|json|indent}}",
                        cov=[{"_id": c._id, "run": c.run, "test": c.test} for c in coverage_candidates]
                    )

                # MORE THAN ONE COVERAGE CANDIDATE CAN HAPPEN WHEN THE SAME TEST IS IN TWO DIFFERENT CHUNKS OF THE SAME SUITE
                for coverage_record in coverage_candidates:
                    coverage_record.source.file.max_test_siblings = max_siblings
                    coverage_record.source.file.min_line_siblings = min_siblings
                    coverage_record.source.file.score = (max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
            else:
                example = http.post_json(settings.url, json={
                    "from": "coverage",
                    "where": {"eq": {
                        "test.url": test_name,
                        "source.file.name": not_summarized.source.file.name,
                        "build.revision12": not_summarized.build.revision12
                    }},
                    "limit": 1,
                    "format": "list"
                })

                Log.warning(
                    "{{test|quote}} rev {{revision}} appears to have no coverage for {{file|quote}}!\n{{example|json|indent}}",
                    test=test_name,
                    file=not_summarized.source.file.name,
                    revision=not_summarized.build.revision12,
                    example=example.data[0]
                )

        bad_example = [d for d in file_level_coverage_records.data if d["source.file.min_line_siblings"] == None]
        if bad_example:
            Log.warning("expecting all records to have summary. Example:\n{{example}}", example=bad_example[0])

        rows = [{"id": d._id, "value": d} for d in file_level_coverage_records.data]
        coverage_summary_index.extend(rows)
        coverage_index.extend(rows)

        all_test_summary = []
        for g, records in jx.groupby(file_level_coverage_records.data, "source.file.name"):
            cov = UNION(records.source.file.covered)
            uncov = UNION(records.source.file.uncovered)
            coverage = {
                "_id": "|".join([records[0].build.revision12, g["source.file.name"]]),  # SOMETHING UNIQUE, IN CASE WE RECALCULATE
                "source": {
                    "file": {
                        "name": g["source.file.name"],
                        "is_file": True,
                        "covered": jx.sort(cov, "line"),
                        "uncovered": jx.sort(uncov),
                        "total_covered": len(cov),
                        "total_uncovered": len(uncov),
                        "min_line_siblings": 0  # PLACEHOLDER TO INDICATE DONE
                    }
                },
                "build": records[0].build,
                "repo": records[0].repo,
                "run": records[0].run,
                "etl": {"timestamp": Date.now()}
            }
            all_test_summary.append(coverage)

        sum_rows = [{"id": d["_id"], "value": d} for d in all_test_summary]
        coverage_summary_index.extend(sum_rows)

        if DEBUG:
            coverage_index.refresh()
            todo = http.post_json(settings.url, json={
                "from": "coverage",
                "where": {"and": [
                    {"missing": "source.method.name"},
                    {"missing": "source.file.min_line_siblings"},
                    {"eq": {"source.file.name": not_summarized.source.file.name}},
                    {"eq": {"build.revision12": not_summarized.build.revision12}}
                ]},
                "format": "list",
                "limit": 10
            })
            if todo.data:
                Log.error("Failure to update")
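# Worked example (a sketch, not from the source) of the sibling score assigned
# above: with max_siblings=9 (the file is covered by 10 tests) and
# min_siblings=0 (some line is covered by this test alone),
# score = (9 - 0) / (9 + 0 + 1) = 0.9; scores near 1.0 mean the test reaches
# lines few other tests touch. float() guards against integer division when
# running under Python 2 without `from __future__ import division`.
def siblings_score(max_siblings, min_siblings):
    return (max_siblings - min_siblings) / float(max_siblings + min_siblings + 1)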
def loop(source, coverage_summary_index, settings, please_stop):
    try:
        cluster = elasticsearch.Cluster(source)
        aliases = cluster.get_aliases()
        candidates = []
        for pairs in aliases:
            if pairs.alias == source.index:
                candidates.append(pairs.index)
        candidates = jx.sort(candidates, {".": "desc"})

        for index_name in candidates:
            coverage_index = elasticsearch.Index(index=index_name, read_only=False, settings=source)
            push_date_filter = unicode2Date(coverage_index.settings.index[-15:], elasticsearch.INDEX_DATE_FORMAT)

            while not please_stop:
                # IDENTIFY NEW WORK
                Log.note("Working on index {{index}}", index=index_name)
                coverage_index.refresh()
                todo = http.post_json(settings.url, json={
                    "from": "coverage",
                    "groupby": ["source.file.name", "build.revision12"],
                    "where": {"and": [
                        {"missing": "source.method.name"},
                        {"missing": "source.file.min_line_siblings"},
                        {"gte": {"repo.push.date": push_date_filter}}
                    ]},
                    "format": "list",
                    "limit": coalesce(settings.batch_size, 100)
                })
                if not todo.data:
                    break

                queue = Queue("pending source files to review")
                queue.extend(todo.data[0:coalesce(settings.batch_size, 100)])
                threads = [
                    Thread.run(
                        "processor" + unicode(i),
                        process_batch,
                        queue,
                        coverage_index,
                        coverage_summary_index,
                        settings,
                        please_stop=please_stop
                    )
                    for i in range(NUM_THREAD)
                ]

                # ADD STOP MESSAGE
                queue.add(Thread.STOP)

                # WAIT FOR THEM TO COMPLETE
                for t in threads:
                    t.join()

        please_stop.go()
        return
    except Exception as e:
        Log.warning("Problem processing", cause=e)
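# Hedged sketch of how loop() might be started. The helper name `start`, the
# settings layout (settings.source, settings.summary), and the use of Signal
# from the same threading library used above are assumptions, not taken from
# the original module.
def start(settings):
    please_stop = Signal()
    coverage_summary_index = elasticsearch.Index(read_only=False, settings=settings.summary)
    Thread.run("coverage summarizer", loop, settings.source, coverage_summary_index, settings, please_stop=please_stop)
    return please_stop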