def get_pending(es, since):
    """
    FACET es ON bug_id FOR ALL RECORDS WITH modified_ts AT, OR AFTER, since
    AND RETURN THE FACET TERMS AS A Multiset (KEY IS term, AMOUNT IS count).
    Log.error IS CALLED WHEN THE FACET COMES BACK WITH 200K, OR MORE, TERMS
    """
    changed_since = {"range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}}
    query = {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": changed_since
        }},
        "from": 0,
        "size": 0,
        "sort": [],
        "facets": {"default": {"terms": {"field": "bug_id", "size": 200000}}}
    }

    #ONLY THE FACET TERMS ARE READ FROM THE RESPONSE
    terms = es.search(query).facets.default.terms
    if len(terms) >= 200000:
        Log.error("Can not handle more than 200K bugs changed")

    pending_bugs = Multiset(terms, key_field="term", count_field="count")
    Log.note("Source has {{num}} bug versions for updating", {"num": len(pending_bugs)})
    return pending_bugs
def random_sample_of_bugs(self):
    """
    DEBUGGING AID: KEEP RUNNING THE ETL OVER A RANDOM SAMPLE OF NON-PRIVATE
    bug_ids AND COMPARE THE candidate INDEX TO THE reference INDEX.  THE LOOP
    ENDS WHEN compare_both() REPORTS ERRORS; A FAILED ROUND IS LOGGED AS A
    WARNING AND ANOTHER SAMPLE IS TRIED.  ONLY USEFUL WHEN A REFERENCE EXISTS
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    settings = self.settings
    with DB(settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", settings.candidate)
        reference = ElasticSearch(settings.private_bugs_reference)

        #THE LIST OF PRIVATE BUGS IS KEPT IN A LOCAL FILE TO SAVE TIME
        cache_file = File(settings.param.temp_dir + "/private_bugs.json")
        if cache_file.exists:
            private_bugs = set(CNV.JSON2object(cache_file.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                cache_file.write(CNV.object2JSON(private_bugs))

        while True:
            drawn = [Random.int(MAX_BUG_ID) for _ in range(NUM_TO_TEST)]
            some_bugs = [bug_id for bug_id in drawn if bug_id not in private_bugs]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            #RUN PARAMETERS: EVERYTHING FROM TIME ZERO UP TO NOW
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                #GIVE ES TIME TO INDEX BEFORE COMPARING
                Thread.sleep(2)
                if compare_both(candidate, reference, settings, some_bugs):
                    Log.note("Errors found")
                    break
            except Exception as e:
                Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def incremental_etl(settings, param, db, es, es_comments, output_queue):
    """
    DELETE THE RECORDS THAT MUST NOT BE (OR ARE NO LONGER CORRECT) IN THE
    INDEXES, THEN RE-RUN etl() AND etl_comments() FROM TIME ZERO FOR THE BUGS
    THAT NEED REBUILDING.

    ES TAKES TIME TO DELETE RECORDS, SO THE DELETES ARE DONE FIRST WITH THE
    HOPE THE INDEX GETS A REWRITE DURING THE ADD OF NEW RECORDS
    """
    def bug_filter(bug_ids):
        #A FRESH FILTER FOR EVERY CALL, NOTHING IS SHARED BETWEEN REQUESTS
        return {"terms": {"bug_id": bug_ids}}

    #REMOVE PRIVATE BUGS, IN BATCHES OF 1000
    private_bugs = get_private_bugs_for_delete(db, param)
    Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": private_bugs})
    for _, batch in Q.groupby(private_bugs, size=1000):
        still_existing = get_bug_ids(es, bug_filter(batch))
        if still_existing:
            Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": still_existing})
        es.delete_record(bug_filter(batch))
        es_comments.delete_record(bug_filter(batch))

    #RECENT PUBLIC BUGS
    possible_public_bugs = get_recent_private_bugs(db, param)
    if param.allow_private_bugs:
        #PRIVATE BUGS
        #    A CHANGE IN PRIVACY INDICATOR MEANS THE WHITEBOARD IS AFFECTED, REDO
        es.delete_record(bug_filter(possible_public_bugs))
    #OTHERWISE (PUBLIC BUGS) THERE IS NOTHING TO DELETE HERE
    #    IF ADDING GROUP THEN private_bugs ALREADY DID THIS
    #    IF REMOVING GROUP THEN NO RECORDS TO DELETE

    #REMOVE **RECENT** PRIVATE ATTACHMENTS
    private_attachments = get_recent_private_attachments(db, param)
    bugs_to_refresh = set(Q.select(private_attachments, "bug_id"))
    es.delete_record(bug_filter(bugs_to_refresh))

    #REBUILD BUGS THAT GOT REMOVED, EXCEPT THE PRIVATE ONES
    bug_list = (possible_public_bugs | bugs_to_refresh) - private_bugs
    if not bug_list:
        return

    refresh_param = param.copy()
    refresh_param.bug_list = bug_list
    refresh_param.start_time = 0
    refresh_param.start_time_str = extract_bugzilla.milli2string(db, 0)

    try:
        #EACH CALL GETS ITS OWN COPY OF THE PARAMETERS
        etl(db, output_queue, refresh_param.copy(), please_stop=None)
        etl_comments(db, es_comments, refresh_param.copy(), please_stop=None)
    except Exception as e:
        Log.error("Problem with etl using parameters {{parameters}}", {"parameters": refresh_param}, e)
def extend(self, records):
    """
    ACCEPT {"id":..., "value":...} RECORDS, JUST LIKE A Queue WOULD: MERGE
    THEM INTO self.data (KEYED BY id) AND REWRITE THE WHOLE FILE AS JSON
    """
    additions = {}
    for record in records:
        additions[record["id"]] = record["value"]

    struct.unwrap(self.data).update(additions)
    File(self.filename).write(CNV.object2JSON(self.data, pretty=True))
    Log.note("{{num}} items added", {"num": len(additions)})
def open_test_instance(name, settings):
    """
    RETURN A Fake_ES WHEN settings NAMES A FILE, OTHERWISE AN ElasticSearch
    CONNECTION.  name IS ONLY USED FOR THE LOG MESSAGE
    """
    if not settings.filename:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })
        return ElasticSearch(settings)

    Log.note("Using {{filename}} as {{type}}", {
        "filename": settings.filename,
        "type": name
    })
    return Fake_ES(settings)
def loadAliases(settings):
    """
    FILL THE MODULE-LEVEL aliases FROM THE JSON FILE NAMED BY
    settings.param.alias_file.  EACH VALUE IS WRAPPED WITH struct.wrap().

    A FILE THAT CAN NOT BE READ IS NOT FATAL: A WARNING (WITH THE CAUSE) IS
    LOGGED AND LOADING CONTINUES WITH AN EMPTY ALIAS SET.  ANY OTHER PROBLEM
    IS REPORTED THROUGH Log.error
    """
    try:
        alias_file = settings.param.alias_file
        #alias_file MAY BE AN OBJECT WITH A path, OR THE PATH ITSELF
        filename = nvl(alias_file.path, alias_file)

        try:
            with Timer("load alias file at {{filename}}", {"filename": filename}):
                alias_json = File(alias_file).read()
        except Exception as e:
            #KEEP THE CAUSE, OTHERWISE THE REASON FOR THE FAILED READ IS LOST
            Log.warning("No alias file found (looking at {{filename}})", {"filename": filename}, e)
            alias_json = "{}"

        #aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
    except Exception as e:
        #THE OUTER try HAD NO HANDLER; REPORT THE SAME WAY THE REST OF THE FILE DOES
        Log.error("Can not load aliases", e)
def analysis(settings, last_run, please_stop):
    """
    REPEATEDLY PICK THE EMAILS WITH A NEGATIVE TOTAL COUNT OVER ALL bugs
    ("PROBLEMS"), AND FOR EACH ONE LOOK FOR A SINGLE EMAIL THAT STANDS OUT IN
    THE BUGS SHOWING THAT PROBLEM.  WHEN ONE IS FOUND, add_alias() FOLDS THE
    PROBLEM EMAIL INTO IT AND ANOTHER PASS IS MADE.  THE ALIASES ARE SAVED AT
    THE END.

    settings - ONLY PASSED ON TO saveAliases()
    last_run - TRUTHY LOWERS DIFF, ACCEPTS ALL PROBLEMS, AND ALLOWS THE
               "EXACT MATCH" CASE
    please_stop - CHECKED FOR TRUTH TO ABANDON THE LOOPS EARLY

    NOTE(review): NESTING WAS RECONSTRUCTED FROM A COLLAPSED LINE; CONFIRM
    THAT saveAliases() BELONGS AFTER THE while LOOP
    """
    DIFF = 7
    if last_run:
        DIFF = 4  #ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING
    try_again = True

    #ANOTHER PASS IS MADE ONLY WHEN THE PREVIOUS PASS ADDED AN ALIAS
    while try_again and not please_stop:
        #FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for bug_id, agg in bugs.iteritems():
            #ONLY COUNT NEGATIVE EMAILS
            for email, count in agg.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        #SKIP EMAILS MARKED ignore; UNLESS last_run, THE COUNT MUST BE AT OR BELOW -(DIFF / 2)
        problems = Q.sort([
            {"email": e, "count": c}
            for e, c in problem_agg.dic.iteritems()
            if not aliases.get(e, Null).ignore and (c <= -(DIFF / 2) or last_run)
        ], ["count", "email"])

        try_again = False
        for problem in problems:
            if please_stop:
                break

            #FIND MOST LIKELY MATCH
            solution_agg = Multiset(allow_negative=True)
            for bug_id, agg in bugs.iteritems():
                if agg.dic.get(problem.email, 0) < 0:  #ONLY BUGS THAT ARE EXPERIENCING THIS problem
                    solution_agg += agg
            #HIGHEST COUNT FIRST, email BREAKS TIES
            solutions = Q.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])

            if last_run and len(solutions) == 2 and solutions[0].count == -solutions[1].count:
                #exact match
                pass
            elif len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                #not distinctive enough
                continue

            best_solution = solutions[0]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(Q.select(solutions, "count")[:10:])
            })
            #AN ALIAS WAS ADDED, SO THE COUNTS CHANGED: GO AROUND AGAIN
            try_again = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)
def etl_comments(db, es, param, please_stop):
    """
    READ THE COMMENTS SELECTED BY param AND SEND THEM TO es IN BATCHES OF 500.
    THE DATABASE CONNECTION IS MADE ONCE, KEPT IN comment_db_cache, AND ONLY
    TOUCHED WHILE HOLDING comment_db_cache_lock.  please_stop IS NOT USED
    """
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with comment_db_cache_lock:
        if not comment_db_cache:
            comment_db_cache.append(DB(db))

    with comment_db_cache_lock:
        Log.note("Read comments from database")
        comments = get_comments(comment_db_cache[0], param)

    for _, batch in Q.groupby(comments, size=500):
        timer_params = {"num": len(batch)}
        with Timer("Write {{num}} comments to ElasticSearch", timer_params):
            es.extend({"id": comment.comment_id, "value": comment} for comment in batch)
def incremental_etl(settings, param, db, es, es_comments, output_queue):
    """
    DELETE THE RECORDS THAT MUST NOT BE (OR ARE NO LONGER CORRECT) IN THE
    INDEXES, THEN RE-RUN etl() AND etl_comments() FROM TIME ZERO FOR THE BUGS
    THAT NEED REBUILDING.

    ES TAKES TIME TO DELETE RECORDS, SO THE DELETES ARE DONE FIRST WITH THE
    HOPE THE INDEX GETS A REWRITE DURING THE ADD OF NEW RECORDS
    """
    def bug_filter(bug_ids):
        #A FRESH FILTER FOR EVERY CALL, NOTHING IS SHARED BETWEEN REQUESTS
        return {"terms": {"bug_id": bug_ids}}

    #REMOVE PRIVATE BUGS, IN BATCHES OF 1000
    private_bugs = get_private_bugs_for_delete(db, param)
    Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": private_bugs})
    for _, batch in Q.groupby(private_bugs, size=1000):
        still_existing = get_bug_ids(es, bug_filter(batch))
        if still_existing:
            Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": still_existing})
        es.delete_record(bug_filter(batch))
        es_comments.delete_record(bug_filter(batch))

    #RECENT PUBLIC BUGS
    possible_public_bugs = get_recent_private_bugs(db, param)
    if param.allow_private_bugs:
        #PRIVATE BUGS
        #    A CHANGE IN PRIVACY INDICATOR MEANS THE WHITEBOARD IS AFFECTED, REDO
        es.delete_record(bug_filter(possible_public_bugs))
    #OTHERWISE (PUBLIC BUGS) THERE IS NOTHING TO DELETE HERE
    #    IF ADDING GROUP THEN private_bugs ALREADY DID THIS
    #    IF REMOVING GROUP THEN NO RECORDS TO DELETE

    #REMOVE **RECENT** PRIVATE ATTACHMENTS
    private_attachments = get_recent_private_attachments(db, param)
    bugs_to_refresh = set(Q.select(private_attachments, "bug_id"))
    es.delete_record(bug_filter(bugs_to_refresh))

    #REBUILD BUGS THAT GOT REMOVED, EXCEPT THE PRIVATE ONES
    bug_list = (possible_public_bugs | bugs_to_refresh) - private_bugs
    if not bug_list:
        return

    refresh_param = param.copy()
    refresh_param.bug_list = bug_list
    refresh_param.start_time = 0
    refresh_param.start_time_str = extract_bugzilla.milli2string(db, 0)

    try:
        #EACH CALL GETS ITS OWN COPY OF THE PARAMETERS
        etl(db, output_queue, refresh_param.copy(), please_stop=None)
        etl_comments(db, es_comments, refresh_param.copy(), please_stop=None)
    except Exception as e:
        Log.error("Problem with etl using parameters {{parameters}}", {"parameters": refresh_param}, e)
def random_sample_of_bugs(self):
    """
    DEBUGGING AID: KEEP RUNNING THE ETL OVER A RANDOM SAMPLE OF NON-PRIVATE
    bug_ids AND COMPARE THE candidate INDEX TO THE reference INDEX.  THE LOOP
    ENDS WHEN compare_both() REPORTS ERRORS; A FAILED ROUND IS LOGGED AS A
    WARNING AND ANOTHER SAMPLE IS TRIED.  ONLY USEFUL WHEN A REFERENCE EXISTS
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    settings = self.settings
    with DB(settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", settings.candidate)
        reference = ElasticSearch(settings.private_bugs_reference)

        #THE LIST OF PRIVATE BUGS IS KEPT IN A LOCAL FILE TO SAVE TIME
        cache_file = File(settings.param.temp_dir + "/private_bugs.json")
        if cache_file.exists:
            private_bugs = set(CNV.JSON2object(cache_file.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                cache_file.write(CNV.object2JSON(private_bugs))

        while True:
            drawn = [Random.int(MAX_BUG_ID) for _ in range(NUM_TO_TEST)]
            some_bugs = [bug_id for bug_id in drawn if bug_id not in private_bugs]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs":some_bugs})

            #RUN PARAMETERS: EVERYTHING FROM TIME ZERO UP TO NOW
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                #GIVE ES TIME TO INDEX BEFORE COMPARING
                Thread.sleep(2)
                if compare_both(candidate, reference, settings, some_bugs):
                    Log.note("Errors found")
                    break
            except Exception as e:
                Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
def open_test_instance(name, settings):
    """
    RETURN A Fake_ES WHEN settings NAMES A FILE.  OTHERWISE DELETE THE INDEX
    DESCRIBED BY settings, CREATE IT AGAIN FROM settings.schema_file, AND
    RETURN THE RESULT.  name IS ONLY USED FOR THE LOG MESSAGE
    """
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)

    Log.note("Using ES cluster at {{host}} as {{type}}", {
        "host": settings.host,
        "type": name
    })

    #DROP ANY EXISTING INDEX BEFORE MAKING THE NEW ONE
    ElasticSearch.delete_index(settings)

    schema_json = File(settings.schema_file).read()
    schema = CNV.JSON2object(schema_json, flexible=True, paths=True)
    return ElasticSearch.create_index(settings, schema, limit_replicas=True)
def add_alias(lost, found):
    """
    RECORD found AS THE CANONICAL EMAIL OF lost, THEN
      * MOVE THE COUNTS OF lost (AND OF ITS PREVIOUS CANONICAL) ONTO found
        IN EVERY BUG, DROPPING BUGS WHOSE AGGREGATE BECOMES EMPTY
      * RE-POINT EVERY ALIAS THAT USED THE PREVIOUS CANONICAL TO found
    """
    found_record = aliases.get(found, None)  #LOOKED UP, BUT NOT USED BELOW
    lost_record = aliases.get(lost, None)

    new_canonical = found
    old_canonical = nvl(lost_record.canonical, lost)
    lost_record.canonical = new_canonical

    emptied = []

    def fold(source, target):
        #MOVE THE COUNT OF source ONTO target, IN EVERY BUG
        for bug_id, agg in bugs.iteritems():
            amount = agg.dic.get(source, 0)
            if amount != 0:
                agg.add(source, -amount)
                agg.add(target, amount)
                if not agg:
                    emptied.append(bug_id)

    #FOLD bugs ON lost=found
    fold(lost, found)

    #FOLD bugs ON old_canonical=new_canonical
    if old_canonical != lost:
        fold(old_canonical, new_canonical)

    #REMOVED ONLY NOW, NOT WHILE bugs IS BEING ITERATED
    for bug_id in emptied:
        del bugs[bug_id]

    #FOLD ALIASES
    for name, record in aliases.iteritems():
        if record.canonical == old_canonical:
            Log.note("ALIAS REMAPPED: {{alias}}->{{old}} to {{alias}}->{{new}}", {
                "alias": name,
                "old": old_canonical,
                "new": found
            })
            record.canonical = found
def add_alias(lost, found):
    """
    RECORD found AS THE CANONICAL EMAIL OF lost, THEN
      * MOVE THE COUNTS OF lost (AND OF ITS PREVIOUS CANONICAL) ONTO found
        IN EVERY BUG, DROPPING BUGS WHOSE AGGREGATE BECOMES EMPTY
      * RE-POINT EVERY ALIAS THAT USED THE PREVIOUS CANONICAL TO found
    """
    found_record = aliases.get(found, None)  #LOOKED UP, BUT NOT USED BELOW
    lost_record = aliases.get(lost, None)

    new_canonical = found
    old_canonical = nvl(lost_record.canonical, lost)
    lost_record.canonical = new_canonical

    emptied = []

    def fold(source, target):
        #MOVE THE COUNT OF source ONTO target, IN EVERY BUG
        for bug_id, agg in bugs.iteritems():
            amount = agg.dic.get(source, 0)
            if amount != 0:
                agg.add(source, -amount)
                agg.add(target, amount)
                if not agg:
                    emptied.append(bug_id)

    #FOLD bugs ON lost=found
    fold(lost, found)

    #FOLD bugs ON old_canonical=new_canonical
    if old_canonical != lost:
        fold(old_canonical, new_canonical)

    #REMOVED ONLY NOW, NOT WHILE bugs IS BEING ITERATED
    for bug_id in emptied:
        del bugs[bug_id]

    #FOLD ALIASES
    for name, record in aliases.iteritems():
        if record.canonical == old_canonical:
            Log.note("ALIAS REMAPPED: {{alias}}->{{old}} to {{alias}}->{{new}}", {
                "alias": name,
                "old": old_canonical,
                "new": found
            })
            record.canonical = found
def test_private_bugs_do_not_show(self):
    """
    WITH allow_private_bugs OFF: PUT THREE RANDOM BUGS IN THE TESTING BUG
    GROUP, RUN THE ETL INTO FRESH TEST INDEXES, AND VERIFY THOSE BUGS ARE
    NOT IN THE BUG INDEX
    """
    settings = self.settings
    params = settings.param

    params.allow_private_bugs = False
    for marker in (params.first_run_time, params.last_run_time):
        File(marker).delete()

    private_bugs = set(Random.sample(params.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

    database.make_test_instance(settings.bugzilla)

    #MARK SOME BUGS PRIVATE
    with DB(settings.bugzilla) as db:
        for bug_id in private_bugs:
            database.add_bug_group(db, bug_id, BUG_GROUP_FOR_TESTING)

    bugs_index = elasticsearch.make_test_instance("candidate", settings.real.bugs)
    comments_index = elasticsearch.make_test_instance("candidate_comments", settings.real.comments)
    bz_etl.main(settings, bugs_index, comments_index)

    #ES NEEDS TIME TO INDEX BEFORE IT CAN BE QUERIED
    Thread.sleep(2)
    verify_no_private_bugs(bugs_index, private_bugs)
def main(settings):
    """
    REPORT ON THE ALIAS FILE: LOG EVERY lost -> found PAIR (SORTED BY found),
    THEN THE INVERSE MAP (CANONICAL -> ALIASES), THEN EACH CANONICAL THAT
    HAS MORE THAN 3 ALIASES
    """
    alias_file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(alias_file.read())

    for record in aliases.values():
        record.candidates = CNV.dict2Multiset(record.candidates)

    #ONLY ALIASES THAT POINT AT SOME OTHER EMAIL
    pairs = []
    for name, record in aliases.items():
        if record.canonical != None and name != record.canonical:
            pairs.append({"lost": name, "found": record.canonical})

    for pair in Q.sort(pairs, "found"):
        Log.note("{{found}} == {{lost}}", pair)

    #SAME AS ABOVE, BUT WITHOUT THE EMPTY NAME
    clean = {}
    for name, record in aliases.items():
        if record.canonical != None and name != record.canonical and name != "":
            clean[name] = record.canonical

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for canonical, names in rev_clean.items():
        if len(names) > 3:
            Log.note(CNV.object2JSON({canonical: names}, pretty=True))
def main(settings):
    """
    REPORT ON THE ALIAS FILE: LOG EVERY lost -> found PAIR (SORTED BY found),
    THEN THE INVERSE MAP (CANONICAL -> ALIASES), THEN EACH CANONICAL THAT
    HAS MORE THAN 3 ALIASES
    """
    alias_file = File(settings.param.alias_file)
    aliases = CNV.JSON2object(alias_file.read())

    for record in aliases.values():
        record.candidates = CNV.dict2Multiset(record.candidates)

    #ONLY ALIASES THAT POINT AT SOME OTHER EMAIL
    pairs = []
    for name, record in aliases.items():
        if record.canonical != None and name != record.canonical:
            pairs.append({"lost": name, "found": record.canonical})

    for pair in Q.sort(pairs, "found"):
        Log.note("{{found}} == {{lost}}", pair)

    #SAME AS ABOVE, BUT WITHOUT THE EMPTY NAME
    clean = {}
    for name, record in aliases.items():
        if record.canonical != None and name != record.canonical and name != "":
            clean[name] = record.canonical

    rev_clean = struct.inverse(clean)
    Log.note(CNV.object2JSON(rev_clean, pretty=True))

    for canonical, names in rev_clean.items():
        if len(names) > 3:
            Log.note(CNV.object2JSON({canonical: names}, pretty=True))
def make_test_instance(db_settings):
    """
    REBUILD THE TEST SCHEMA: DROP AND RE-CREATE db_settings.schema, FILL IT BY
    EXECUTING THE FILE db_settings.filename, THEN ADD THE longdescs_tags
    TABLE.  WHEN NO filename IS GIVEN THE DATABASE IS LEFT ALONE.
    ANY FAILURE IS REPORTED THROUGH Log.error
    """
    LONGDESCS_TAGS_DDL = """ CREATE TABLE `longdescs_tags` ( `id` mediumint(9) NOT NULL AUTO_INCREMENT, `comment_id` int(11) DEFAULT NULL, `tag` varchar(24) NOT NULL, PRIMARY KEY (`id`), UNIQUE KEY `longdescs_tags_idx` (`comment_id`,`tag`), CONSTRAINT `fk_longdescs_tags_comment_id_longdescs_comment_id` FOREIGN KEY (`comment_id`) REFERENCES `longdescs` (`comment_id`) ON DELETE CASCADE ON UPDATE CASCADE ) DEFAULT CHARSET=utf8"""

    if not db_settings.filename:
        Log.note("Database schema will not be touched")
        return

    with Timer("Make database instance"):
        try:
            #CLEAR SCHEMA, USING A CONNECTION THAT NAMES NO SCHEMA
            Log.note("Make empty {{schema}} schema", {"schema":db_settings.schema})
            schemaless = db_settings.copy()
            schemaless.schema = None
            with DB(schemaless) as server:
                server.execute("DROP DATABASE IF EXISTS {{schema}}", {"schema":server.quote_column(db_settings.schema)})
                server.execute("CREATE DATABASE {{schema}}", {"schema":server.quote_column(db_settings.schema)})

            #FILL SCHEMA
            Log.note("Fill {{schema}} schema with data", {"schema":db_settings.schema})
            DB.execute_file(db_settings, db_settings.filename)

            #ADD MISSING TABLES
            with DB(db_settings) as test_db:
                test_db.execute(LONGDESCS_TAGS_DDL)
        except Exception as e:
            Log.error("Can not setup test database", e)
def main(settings, bug_list=None, please_stop=None, restart=False):
    """
    THE CC LISTS (AND REVIEWS) HOLD EMAIL ADDRESSES THAT BELONG TO PEOPLE, AND
    THE EMAIL ADDRESS OF A PERSON CAN CHANGE OVER TIME.  THIS CODE ASSOCIATES
    EACH PERSON WITH THE EMAIL ADDRESSES USED OVER THE LIFETIME OF THE
    BUGZILLA DATA.  A 'PERSON' IS ABSTRACT: SIMPLY A CANONICAL EMAIL ADDRESS
    ASSIGNED TO FACILITATE IDENTIFICATION

    bug_list - WHEN GIVEN, ONLY THESE BUGS ARE ANALYZED (AS A LAST RUN)
    please_stop - CHECKED FOR TRUTH TO STOP BETWEEN BLOCKS OF BUGS
    restart - TRUTHY SKIPS LOADING THE EXISTING ALIASES
    """
    if settings.args.quick:
        Log.note("Alias analysis skipped (--quick was used)")
        return

    if not restart:
        loadAliases(settings)

    with DB(settings.bugzilla, readonly=True) as db:
        if bug_list:
            #ONLY THE REQUESTED BUGS, DONE IN ONE PASS
            aggregator(get_all_cc_changes(db, bug_list))
            analysis(settings, True, please_stop)
            return

        first = nvl(settings.param.start, 0)
        max_row = db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0]
        last = nvl(settings.param.end, max_row.bug_id)

        #Perform analysis on blocks of bugs, in case we crash partway through
        for lo, hi in Q.intervals(first, last, settings.param.alias_increment):
            Log.note("Load range {{start}}-{{end}}", {
                "start": lo,
                "end": hi
            })
            changes = get_all_cc_changes(db, range(lo, hi))
            if please_stop:
                break
            aggregator(changes)
            #THE BLOCK THAT REACHES last IS TREATED AS THE LAST RUN
            analysis(settings, hi >= last, please_stop)
def loadAliases(settings):
    """
    FILL THE MODULE-LEVEL aliases FROM THE JSON FILE NAMED BY
    settings.param.alias_file.  EACH VALUE IS WRAPPED WITH struct.wrap().

    A FILE THAT CAN NOT BE READ IS NOT FATAL: A WARNING (WITH THE CAUSE) IS
    LOGGED AND LOADING CONTINUES WITH AN EMPTY ALIAS SET.  ANY OTHER PROBLEM
    IS REPORTED THROUGH Log.error
    """
    try:
        alias_file = settings.param.alias_file
        #alias_file MAY BE AN OBJECT WITH A path, OR THE PATH ITSELF
        filename = nvl(alias_file.path, alias_file)

        try:
            with Timer("load alias file at {{filename}}", {"filename": filename}):
                alias_json = File(alias_file).read()
        except Exception as e:
            #KEEP THE CAUSE, OTHERWISE THE REASON FOR THE FAILED READ IS LOST
            Log.warning("No alias file found (looking at {{filename}})", {"filename": filename}, e)
            alias_json = "{}"

        #aliases IS A dict POINTING TO structs
        for k, v in CNV.JSON2object(alias_json).iteritems():
            aliases[k] = struct.wrap(v)

        Log.note("{{num}} aliases loaded", {"num": len(aliases.keys())})
    except Exception as e:
        #THE OUTER try HAD NO HANDLER; REPORT THE SAME WAY THE REST OF THE FILE DOES
        Log.error("Can not load aliases", e)
def main(settings, bug_list=None, please_stop=None, restart=False):
    """
    THE CC LISTS (AND REVIEWS) HOLD EMAIL ADDRESSES THAT BELONG TO PEOPLE, AND
    THE EMAIL ADDRESS OF A PERSON CAN CHANGE OVER TIME.  THIS CODE ASSOCIATES
    EACH PERSON WITH THE EMAIL ADDRESSES USED OVER THE LIFETIME OF THE
    BUGZILLA DATA.  A 'PERSON' IS ABSTRACT: SIMPLY A CANONICAL EMAIL ADDRESS
    ASSIGNED TO FACILITATE IDENTIFICATION

    bug_list - WHEN GIVEN, ONLY THESE BUGS ARE ANALYZED (AS A LAST RUN)
    please_stop - CHECKED FOR TRUTH TO STOP BETWEEN BLOCKS OF BUGS
    restart - TRUTHY SKIPS LOADING THE EXISTING ALIASES
    """
    if settings.args.quick:
        Log.note("Alias analysis skipped (--quick was used)")
        return

    if not restart:
        loadAliases(settings)

    with DB(settings.bugzilla, readonly=True) as db:
        if bug_list:
            #ONLY THE REQUESTED BUGS, DONE IN ONE PASS
            aggregator(get_all_cc_changes(db, bug_list))
            analysis(settings, True, please_stop)
            return

        first = nvl(settings.param.start, 0)
        max_row = db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0]
        last = nvl(settings.param.end, max_row.bug_id)

        #Perform analysis on blocks of bugs, in case we crash partway through
        for lo, hi in Q.intervals(first, last, settings.param.alias_increment):
            Log.note("Load range {{start}}-{{end}}", {"start": lo, "end": hi})
            changes = get_all_cc_changes(db, range(lo, hi))
            if please_stop:
                break
            aggregator(changes)
            #THE BLOCK THAT REACHES last IS TREATED AS THE LAST RUN
            analysis(settings, hi >= last, please_stop)
def main():
    """
    MEANT TO BE RUN JUST ONCE, IN DEVELOPMENT, TO TURN A BIG PUBLIC DATABASE
    (8G+) INTO A TINY TESTING DB (FOR ADDING TO THE REPOSITORY).  NOTHING IS
    CHANGED UNLESS THE OPERATOR TYPES "YES" AT THE PROMPT
    """
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)

        bugzilla = settings.bugzilla
        prompt = (
            "We are going to totally wipe out the " + bugzilla.schema.upper() +
            " schema at " + bugzilla.host.upper() +
            "! Type \"YES\" to continue: "
        )
        answer = raw_input(prompt)
        if answer != "YES":
            Log.note("Aborted. No Changes made.")
            return

        Log.note("Scrubbing db of those pesky records.")
        Log.note("This is going to take hours ...")

        scrub_params = {
            "schema": bugzilla.schema,
            "bug_list": SQL(settings.param.bugs)
        }
        DB.execute_file(bugzilla, "./tests/resources/sql/scrub_db.sql", scrub_params)

        Log.note("... Done!")
    finally:
        #ALWAYS SHUT THE LOG DOWN, EVEN WHEN ABORTED
        Log.stop()
def make_test_instance(db_settings):
    """
    REBUILD THE TEST SCHEMA: DROP AND RE-CREATE db_settings.schema, FILL IT BY
    EXECUTING THE FILE db_settings.filename, THEN ADD THE longdescs_tags
    TABLE.  WHEN NO filename IS GIVEN THE DATABASE IS LEFT ALONE.
    ANY FAILURE IS REPORTED THROUGH Log.error
    """
    LONGDESCS_TAGS_DDL = """ CREATE TABLE `longdescs_tags` ( `id` mediumint(9) NOT NULL AUTO_INCREMENT, `comment_id` int(11) DEFAULT NULL, `tag` varchar(24) NOT NULL, PRIMARY KEY (`id`), UNIQUE KEY `longdescs_tags_idx` (`comment_id`,`tag`), CONSTRAINT `fk_longdescs_tags_comment_id_longdescs_comment_id` FOREIGN KEY (`comment_id`) REFERENCES `longdescs` (`comment_id`) ON DELETE CASCADE ON UPDATE CASCADE ) DEFAULT CHARSET=utf8"""

    if not db_settings.filename:
        Log.note("Database schema will not be touched")
        return

    with Timer("Make database instance"):
        try:
            #CLEAR SCHEMA, USING A CONNECTION THAT NAMES NO SCHEMA
            Log.note("Make empty {{schema}} schema", {"schema": db_settings.schema})
            schemaless = db_settings.copy()
            schemaless.schema = None
            with DB(schemaless) as server:
                server.execute("DROP DATABASE IF EXISTS {{schema}}", {"schema": server.quote_column(db_settings.schema)})
                server.execute("CREATE DATABASE {{schema}}", {"schema": server.quote_column(db_settings.schema)})

            #FILL SCHEMA
            Log.note("Fill {{schema}} schema with data", {"schema": db_settings.schema})
            DB.execute_file(db_settings, db_settings.filename)

            #ADD MISSING TABLES
            with DB(db_settings) as test_db:
                test_db.execute(LONGDESCS_TAGS_DDL)
        except Exception as e:
            Log.error("Can not setup test database", e)
# NOTE(review): THE STATEMENTS DOWN TO start() ARE THE TAIL OF A DEFINITION
# WHOSE HEADER IS OUTSIDE THIS VIEW; added, compressed, old_aliases AND
# settings ARE BOUND THERE.  THE NESTING SHOWN WAS RECONSTRUCTED FROM A
# COLLAPSED LINE - CONFIRM AGAINST THE FULL FILE
removed = set(old_aliases.keys()) - set(compressed.keys())
common = set(compressed.keys()) & set(old_aliases.keys())

changed = set()
for c in common:
    #ENTRIES ARE COMPARED BY THEIR PRETTY-PRINTED JSON TEXT
    if CNV.object2JSON(compressed[c], pretty=True) != CNV.object2JSON(
            old_aliases[c], pretty=True):
        changed.add(c)

#THE ALIAS FILE IS ONLY REWRITTEN WHEN SOMETHING DIFFERS
if added or removed or changed:
    alias_json = CNV.object2JSON(compressed, pretty=True)
    file = File(settings.param.alias_file)
    file.write(alias_json)

    # NOTE(review): PRESUMABLY LOGGED ONLY WHEN THE FILE IS WRITTEN - CONFIRM
    Log.note("{{num}} of {{total}} aliases saved", {
        "num": len(compressed.keys()),
        "total": len(aliases.keys())
    })


def start():
    """
    COMMAND LINE ENTRY POINT: READ THE SETTINGS, START THE LOG, AND RUN main()
    WITH restart=True.  FAILURES ARE REPORTED WITH Log.error, AND THE LOG IS
    ALWAYS STOPPED
    """
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings, restart=True)
    except Exception, e:
        Log.error("Can not start", e)
    finally:
        Log.stop()


#THE BODY OF THIS if IS OUTSIDE THE VISIBLE SOURCE
if __name__ == "__main__":
def test_recent_private_stuff_does_not_show(self):
    """
    RUN THE ETL ON A FRESH TEST DATABASE, THEN MARK RANDOM BUGS, COMMENTS AND
    ATTACHMENTS PRIVATE, RUN THE ETL AGAIN AND VERIFY NONE OF THEM ARE IN THE
    INDEXES.  FINALLY REMOVE THE BUG GROUP AGAIN, RUN THE ETL A THIRD TIME,
    AND VERIFY THE BUGS ARE PUBLIC WHILE THE MARKED COMMENTS AND ATTACHMENTS
    ARE STILL NOT SHOWN

    NOTE(review): NESTING WAS RECONSTRUCTED FROM A COLLAPSED LINE - CONFIRM
    """
    self.settings.param.allow_private_bugs = False
    #DELETE THE RUN TIME FILES BEFORE THE FIRST RUN
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()

    database.make_test_instance(self.settings.bugzilla)

    es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
    es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
    #FIRST RUN, NOTHING HAS BEEN MARKED PRIVATE YET
    bz_etl.main(self.settings, es, es_c)

    #MARK SOME STUFF PRIVATE
    with DB(self.settings.bugzilla) as db:
        #BUGS
        private_bugs = set(Random.sample(self.settings.param.bugs, 3))
        Log.note("The private bugs are {{bugs}}", {"bugs": private_bugs})
        for b in private_bugs:
            database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

        #COMMENTS
        comments = db.query("SELECT comment_id FROM longdescs").comment_id
        marked_private_comments = Random.sample(comments, 5)
        for c in marked_private_comments:
            database.mark_comment_private(db, c, isprivate=1)

        #INCLUDE COMMENTS OF THE PRIVATE BUGS
        implied_private_comments = db.query(
            """ SELECT comment_id FROM longdescs WHERE {{where}} """, {
                "where": esfilter2sqlwhere(db, {"terms": {
                    "bug_id": private_bugs
                }})
            }).comment_id
        private_comments = marked_private_comments + implied_private_comments
        Log.note("The private comments are {{comments}}", {"comments": private_comments})

        #ATTACHMENTS
        attachments = db.query("SELECT bug_id, attach_id FROM attachments")
        private_attachments = Random.sample(attachments, 5)
        Log.note("The private attachments are {{attachments}}", {"attachments": private_attachments})
        for a in private_attachments:
            database.mark_attachment_private(db, a.attach_id, isprivate=1)

    #THE FIRST RUN MUST HAVE LEFT A last_run_time FILE BEHIND
    if not File(self.settings.param.last_run_time).exists:
        Log.error("last_run_time should exist")
    #SECOND RUN, AFTER THE PRIVACY CHANGES
    bz_etl.main(self.settings, es, es_c)

    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    verify_no_private_bugs(es, private_bugs)
    verify_no_private_attachments(es, private_attachments)
    verify_no_private_comments(es_c, private_comments)

    #MARK SOME STUFF PUBLIC
    with DB(self.settings.bugzilla) as db:
        for b in private_bugs:
            database.remove_bug_group(db, b, BUG_GROUP_FOR_TESTING)

    #THIRD RUN, AFTER THE BUG GROUP WAS REMOVED
    bz_etl.main(self.settings, es, es_c)

    #VERIFY BUG IS PUBLIC, BUT PRIVATE ATTACHMENTS AND COMMENTS STILL NOT
    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    verify_public_bugs(es, private_bugs)
    verify_no_private_attachments(es, private_attachments)
    #ONLY THE EXPLICITLY MARKED COMMENTS ARE CHECKED THIS TIME
    verify_no_private_comments(es_c, marked_private_comments)
def test_recent_private_stuff_does_not_show(self):
    """
    RUN THE ETL ON A FRESH TEST DATABASE, THEN MARK RANDOM BUGS, COMMENTS AND
    ATTACHMENTS PRIVATE, RUN THE ETL AGAIN AND VERIFY NONE OF THEM ARE IN THE
    INDEXES.  FINALLY REMOVE THE BUG GROUP AGAIN, RUN THE ETL A THIRD TIME,
    AND VERIFY THE BUGS ARE PUBLIC WHILE THE MARKED COMMENTS AND ATTACHMENTS
    ARE STILL NOT SHOWN

    NOTE(review): NESTING WAS RECONSTRUCTED FROM A COLLAPSED LINE - CONFIRM
    """
    self.settings.param.allow_private_bugs = False
    #DELETE THE RUN TIME FILES BEFORE THE FIRST RUN
    File(self.settings.param.first_run_time).delete()
    File(self.settings.param.last_run_time).delete()

    database.make_test_instance(self.settings.bugzilla)

    es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
    es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
    #FIRST RUN, NOTHING HAS BEEN MARKED PRIVATE YET
    bz_etl.main(self.settings, es, es_c)

    #MARK SOME STUFF PRIVATE
    with DB(self.settings.bugzilla) as db:
        #BUGS
        private_bugs = set(Random.sample(self.settings.param.bugs, 3))
        Log.note("The private bugs are {{bugs}}", {"bugs": private_bugs})
        for b in private_bugs:
            database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

        #COMMENTS
        comments = db.query("SELECT comment_id FROM longdescs").comment_id
        marked_private_comments = Random.sample(comments, 5)
        for c in marked_private_comments:
            database.mark_comment_private(db, c, isprivate=1)

        #INCLUDE COMMENTS OF THE PRIVATE BUGS
        implied_private_comments = db.query(""" SELECT comment_id FROM longdescs WHERE {{where}} """, {
            "where": esfilter2sqlwhere(db, {"terms":{"bug_id":private_bugs}})
        }).comment_id
        private_comments = marked_private_comments + implied_private_comments
        Log.note("The private comments are {{comments}}", {"comments": private_comments})

        #ATTACHMENTS
        attachments = db.query("SELECT bug_id, attach_id FROM attachments")
        private_attachments = Random.sample(attachments, 5)
        Log.note("The private attachments are {{attachments}}", {"attachments": private_attachments})
        for a in private_attachments:
            database.mark_attachment_private(db, a.attach_id, isprivate=1)

    #THE FIRST RUN MUST HAVE LEFT A last_run_time FILE BEHIND
    if not File(self.settings.param.last_run_time).exists:
        Log.error("last_run_time should exist")
    #SECOND RUN, AFTER THE PRIVACY CHANGES
    bz_etl.main(self.settings, es, es_c)

    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    verify_no_private_bugs(es, private_bugs)
    verify_no_private_attachments(es, private_attachments)
    verify_no_private_comments(es_c, private_comments)

    #MARK SOME STUFF PUBLIC
    with DB(self.settings.bugzilla) as db:
        for b in private_bugs:
            database.remove_bug_group(db, b, BUG_GROUP_FOR_TESTING)

    #THIRD RUN, AFTER THE BUG GROUP WAS REMOVED
    bz_etl.main(self.settings, es, es_c)

    #VERIFY BUG IS PUBLIC, BUT PRIVATE ATTACHMENTS AND COMMENTS STILL NOT
    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    verify_public_bugs(es, private_bugs)
    verify_no_private_attachments(es, private_attachments)
    #ONLY THE EXPLICITLY MARKED COMMENTS ARE CHECKED THIS TIME
    verify_no_private_comments(es_c, marked_private_comments)
def test_changes_to_private_bugs_still_have_bug_group(self):
    """
    WITH allow_private_bugs ON: MAKE THREE RANDOM BUGS PRIVATE, RUN THE ETL,
    CHANGE THE bug_status OF THOSE BUGS, RUN THE ETL AGAIN, AND CHECK THE
    CURRENT RECORD OF EACH BUG STILL CARRIES THE TESTING BUG GROUP
    """
    settings = self.settings
    params = settings.param

    params.allow_private_bugs = True
    for marker in (params.first_run_time, params.last_run_time):
        File(marker).delete()

    private_bugs = set(Random.sample(params.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

    database.make_test_instance(settings.bugzilla)

    #MARK SOME BUGS PRIVATE
    with DB(settings.bugzilla) as db:
        for bug_id in private_bugs:
            database.add_bug_group(db, bug_id, BUG_GROUP_FOR_TESTING)

    es = elasticsearch.make_test_instance("candidate", settings.real.bugs)
    es_c = elasticsearch.make_test_instance("candidate_comments", settings.real.comments)
    bz_etl.main(settings, es, es_c)

    # MAKE A CHANGE TO THE PRIVATE BUGS
    with DB(settings.bugzilla) as db:
        for bug_id in private_bugs:
            before = db.query("SELECT * FROM bugs WHERE bug_id={{bug_id}}", {"bug_id": bug_id})[0]
            after = before.copy()
            after.bug_status = "NEW STATUS"
            diff(db, "bugs", before, after)

    #RUN INCREMENTAL
    bz_etl.main(settings, es, es_c)

    #VERIFY BUG GROUP STILL EXISTS
    Thread.sleep(2)  # ES NEEDS TIME TO INDEX
    now = datetime.utcnow()
    #THE PRIVATE BUGS, LIMITED TO RECORDS THAT HAVE NOT EXPIRED
    current_private = {"and": [
        {"terms": {"bug_id": private_bugs}},
        {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
    ]}
    results = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": current_private
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })
    latest_bugs = Q.select(results.hits.hits, "_source")
    # IF NOT UNIQUE, THEN ETL IS WRONG
    latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")

    for bug_id in private_bugs:
        latest = latest_bugs_index[bug_id]
        if latest == None:
            Log.error("Expecting to find the private bug {{bug_id}}", {"bug_id": bug_id})

        bug_group = latest.bug_group
        if not bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a bug group", {"bug_id": bug_id})
        if BUG_GROUP_FOR_TESTING not in bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group", {
                "bug_id": bug_id,
                "bug_group": BUG_GROUP_FOR_TESTING
            })
def test_private_bugs_not_leaking(self): bad_news = False # FOR ALL BUG BLOCKS for min_id, max_id in self.blocks_of_bugs(): results = get( self.private, {"and": [ {"match_all": {}}, {"and": [ {"range": {"bug_id": {"gte": min_id, "lt": max_id}}}, {"exists": {"field": "bug_group"}}, {"range": {"expires_on": {"gte": NOW}}}, #CURRENT RECORDS {"range": {"modified_ts": {"lt": A_WHILE_AGO}}}, #OF A MINIMUM AGE ]} ]}, ["bug_id", "bug_group", "modified_ts"] ) private_ids = {b.bug_id: b.bug_group for b in results} Log.note("Ensure {{num}} bugs did not leak", { "num": len(private_ids.keys()) }) # VERIFY NONE IN PUBLIC leaked_bugs = get( self.public, {"and": [ {"terms": {"bug_id": private_ids.keys()}}, {"range": {"expires_on": {"gte": NOW}}} # SOME BUGS WILL LEAK FOR A LITTLE WHILE ]} ) if leaked_bugs: bad_news = True if self.settings.param.delete: self.public.delete_record( {"terms":{"bug_id":leaked_bugs.bug_id}} ) Log.note("{{num}} leaks!! {{bugs}}", { "num": len(leaked_bugs), "bugs": Q.run({ "from":leaked_bugs, "select":["bug_id", "bug_version_num", {"name":"modified_ts", "value":lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))}], "sort":"bug_id" }) }) for b in leaked_bugs: Log.note("{{bug_id}} has bug groups {{bug_group}}\n{{version|indent}}", { "bug_id": b.bug_id, "bug_group": private_ids[b.bug_id], "version": milli2datetime(b) }) #CHECK FOR LEAKED COMMENTS, BEYOND THE ONES LEAKED BY BUG leaked_comments = get( self.public_comments, {"terms": {"bug_id": private_ids.keys()}}, limit=20 ) if leaked_comments: bad_news = True if self.settings.param.delete: self.public_comments.delete_record( {"terms":{"bug_id":leaked_comments.bug_id}} ) Log.warning("{{num}} comments marked private have leaked!\n{{comments|indent}}", { "num": len(leaked_comments), "comments": leaked_comments }) if bad_news: Log.error("Bugs have leaked!")
# NOTE(review): THE STATEMENTS DOWN TO start() ARE THE TAIL OF A DEFINITION
# WHOSE HEADER IS OUTSIDE THIS VIEW; compressed, old_aliases AND settings ARE
# BOUND THERE.  THE NESTING SHOWN WAS RECONSTRUCTED FROM A COLLAPSED LINE -
# CONFIRM AGAINST THE FULL FILE
added = set(compressed.keys()) - set(old_aliases.keys())
removed = set(old_aliases.keys()) - set(compressed.keys())
common = set(compressed.keys()) & set(old_aliases.keys())

changed = set()
for c in common:
    #ENTRIES ARE COMPARED BY THEIR PRETTY-PRINTED JSON TEXT
    if CNV.object2JSON(compressed[c], pretty=True) != CNV.object2JSON(old_aliases[c], pretty=True):
        changed.add(c)

#THE ALIAS FILE IS ONLY REWRITTEN WHEN SOMETHING DIFFERS
if added or removed or changed:
    alias_json = CNV.object2JSON(compressed, pretty=True)
    file = File(settings.param.alias_file)
    file.write(alias_json)

    # NOTE(review): PRESUMABLY LOGGED ONLY WHEN THE FILE IS WRITTEN - CONFIRM
    Log.note("{{num}} of {{total}} aliases saved", {
        "num": len(compressed.keys()),
        "total": len(aliases.keys())
    })


def start():
    """
    COMMAND LINE ENTRY POINT: READ THE SETTINGS, START THE LOG, AND RUN main()
    WITH restart=True.  FAILURES ARE REPORTED WITH Log.error, AND THE LOG IS
    ALWAYS STOPPED
    """
    try:
        settings = startup.read_settings()
        Log.start(settings.debug)
        main(settings, restart=True)
    except Exception, e:
        Log.error("Can not start", e)
    finally:
        Log.stop()


#THE BODY OF THIS if IS OUTSIDE THE VISIBLE SOURCE
if __name__ == "__main__":
def test_private_attachments_not_leaking(self):
    """
    FOR EACH (min_id, max_id) FROM self.blocks_of_bugs(): COLLECT THE
    attach_id OF ATTACHMENTS THAT ARE PRIVATE (OR BELONG TO A BUG WITH A
    bug_group) FROM self.private, THEN LOOK FOR BUGS IN self.public THAT
    CARRY ANY OF THOSE attach_id.

    LEAKED BUGS ARE LOGGED, AND DELETED WHEN settings.param.delete IS SET.
    Log.error() IS CALLED FOR THE FIRST BLOCK FOUND WITH A LEAK.
    """
    for min_id, max_id in self.blocks_of_bugs():
        # FIND ALL PRIVATE ATTACHMENTS
        bugs_w_private_attachments = get(
            self.private,
            {"and": [
                {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                {"range": {"expires_on": {"gte": NOW}}},  # CURRENT RECORDS
                {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  # OF A MINIMUM AGE
                {"nested": {  # HAS ATTACHMENT.
                    "path": "attachments",
                    "query": {"filtered": {
                        "query": {"match_all": {}},
                        "filter": {"exists": {"field": "attachments.attach_id"}}
                    }}
                }},
                {"or": [
                    {"nested": {  # PRIVATE ATTACHMENT, OR...
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"term": {"attachments.isprivate": 1}}
                        }}
                    }},
                    {"exists": {"field": "bug_group"}}  # ...PRIVATE BUG
                ]}
            ]},
            fields=["bug_id", "bug_group", "attachments", "modified_ts"]
        )

        raw_attach_ids = Q.run({
            "from": bugs_w_private_attachments,
            "select": "attachments.attach_id",
            "where": {"or": [
                {"exists": "bug_group"},
                {"terms": {"attachments.isprivate": ['1', True, 1]}}
            ]}
        })
        try:
            private_attachments = [int(v) for v in raw_attach_ids]
        except Exception:
            # BEST EFFORT: WHEN SOME VALUE DOES NOT CONVERT, USE THE IDS EXACTLY
            # AS THE QUERY GAVE THEM.  THE RESULT IS STILL IN HAND (len() IS
            # TAKEN OF IT BELOW) SO THE QUERY DOES NOT NEED TO RUN A SECOND TIME
            private_attachments = raw_attach_ids

        Log.note("Ensure {{num}} attachments did not leak", {
            "num": len(private_attachments)
        })

        # VERIFY NONE IN PUBLIC
        # NOTE(review): THIS FILTER NAMES THE FIELD "attach_id" WHILE THE PRIVATE
        # QUERY ABOVE USES "attachments.attach_id" - CONFIRM BOTH RESOLVE THE SAME
        leaked_bugs = get(
            self.public,
            {"and": [
                {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                {"range": {"expires_on": {"gte": NOW}}},  # CURRENT BUGS
                {"nested": {
                    "path": "attachments",
                    "query": {"filtered": {
                        "query": {"match_all": {}},
                        "filter": {"terms": {"attach_id": private_attachments}}
                    }}
                }}
            ]}
            # fields=["bug_id", "attachments"]
        )

        if leaked_bugs:
            if self.settings.param.delete:
                self.public.delete_record(
                    {"terms": {"bug_id": leaked_bugs.bug_id}}
                )

            Log.note("{{num}} bugs with private attachments have leaked!", {"num": len(leaked_bugs)})
            for b in leaked_bugs:
                Log.note("{{bug_id}} has private_attachment\n{{version|indent}}", {
                    "bug_id": b.bug_id,
                    "version": b
                })
            Log.error("Attachments have leaked!")
def test_private_bugs_not_leaking(self):
    """
    WALK THE bug_id BLOCKS GIVEN BY self.blocks_of_bugs().  FOR EACH BLOCK,
    THE bug_id OF CURRENT, SUFFICIENTLY OLD RECORDS WITH A bug_group ARE READ
    FROM self.private, AND THEN SEARCHED FOR IN self.public (BUGS) AND
    self.public_comments (COMMENTS, AT MOST 20 REQUESTED).

    FINDINGS ARE LOGGED; THEY ARE ALSO DELETED WHEN settings.param.delete IS
    SET.  A SINGLE Log.error() IS ISSUED AT THE END IF ANYTHING WAS FOUND.
    """

    def modified_as_text(d):
        # HUMAN READABLE FORM OF THE RECORD'S modified_ts
        return CNV.datetime2string(CNV.milli2datetime(d.modified_ts))

    bad_news = False

    # FOR ALL BUG BLOCKS
    for min_id, max_id in self.blocks_of_bugs():
        conditions = [
            {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
            {"exists": {"field": "bug_group"}},
            {"range": {"expires_on": {"gte": NOW}}},  # CURRENT RECORDS
            {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  # OF A MINIMUM AGE
        ]
        results = get(
            self.private,
            {"and": [{"match_all": {}}, {"and": conditions}]},
            ["bug_id", "bug_group", "modified_ts"]
        )

        # MAP EACH PRIVATE bug_id TO ITS bug_group
        private_ids = dict((r.bug_id, r.bug_group) for r in results)
        Log.note("Ensure {{num}} bugs did not leak", {"num": len(private_ids)})

        # VERIFY NONE IN PUBLIC
        leaked_bugs = get(
            self.public,
            {"and": [
                {"terms": {"bug_id": private_ids.keys()}},
                # SOME BUGS WILL LEAK FOR A LITTLE WHILE
                {"range": {"expires_on": {"gte": NOW}}}
            ]}
        )

        if leaked_bugs:
            bad_news = True
            if self.settings.param.delete:
                doomed = {"terms": {"bug_id": leaked_bugs.bug_id}}
                self.public.delete_record(doomed)

            report = {
                "num": len(leaked_bugs),
                "bugs": Q.run({
                    "from": leaked_bugs,
                    "select": [
                        "bug_id",
                        "bug_version_num",
                        {"name": "modified_ts", "value": modified_as_text}
                    ],
                    "sort": "bug_id"
                })
            }
            Log.note("{{num}} leaks!! {{bugs}}", report)

            for b in leaked_bugs:
                details = {
                    "bug_id": b.bug_id,
                    "bug_group": private_ids[b.bug_id],
                    "version": milli2datetime(b)
                }
                Log.note("{{bug_id}} has bug groups {{bug_group}}\n{{version|indent}}", details)

        # CHECK FOR LEAKED COMMENTS, BEYOND THE ONES LEAKED BY BUG
        leaked_comments = get(
            self.public_comments,
            {"terms": {"bug_id": private_ids.keys()}},
            limit=20
        )
        if leaked_comments:
            bad_news = True
            if self.settings.param.delete:
                doomed = {"terms": {"bug_id": leaked_comments.bug_id}}
                self.public_comments.delete_record(doomed)

            Log.warning(
                "{{num}} comments marked private have leaked!\n{{comments|indent}}",
                {"num": len(leaked_comments), "comments": leaked_comments}
            )

    if bad_news:
        Log.error("Bugs have leaked!")
LEFT JOIN bug_group_map m ON m.bug_id=b.bug_id WHERE delta_ts >= {{start_time_str}} AND m.bug_id IS NULL """, {"start_time_str": param.start_time_str}), u"bug_id") if not bug_list: return with Thread.run("alias analysis", alias_analysis.main, settings=settings, bug_list=bug_list): Log.note("Updating {{num}} bugs:\n{{bug_list|indent}}", { "num": len(bug_list), "bug_list": bug_list }) param.bug_list = bug_list run_both_etl( **{ "db": db, "output_queue": output_queue, "es_comments": es_comments, "param": param.copy() }) def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue): with Thread.run("alias_analysis", alias_analysis.main, settings=settings): end = nvl(settings.param.end,
def analysis(settings, last_run, please_stop):
    """
    REPEATEDLY LOOK FOR EMAILS WHOSE COUNTS ARE NEGATIVE ACROSS bugs AND, WHEN
    ONE OTHER EMAIL CLEARLY STANDS OUT IN THE SAME BUGS, RECORD THE PAIR WITH
    add_alias().  PASSES REPEAT WHILE THE PREVIOUS PASS ADDED AN ALIAS AND
    please_stop IS NOT SET.  saveAliases(settings) IS CALLED WHEN DONE.

    settings    - HANDED TO saveAliases()
    last_run    - WHEN TRUTHY: THE REQUIRED LEAD IS 4 (NOT 7), EVERY NEGATIVE
                  EMAIL IS CONSIDERED, AND AN EXACT TWO-WAY MATCH IS ACCEPTED
    please_stop - TESTED BEFORE EACH PASS AND BEFORE EACH PROBLEM EMAIL
    """
    # ONCE WE HAVE ALL THE DATA IN WE CAN BE LESS DISCRIMINATING
    DIFF = 4 if last_run else 7
    # SAME EXPRESSION AS BEFORE, EVALUATED ONCE (DIFF DOES NOT CHANGE AFTER THIS POINT)
    cutoff = -(DIFF / 2)

    try_again = True
    while try_again and not please_stop:
        # FIND EMAIL MOST NEEDING REPLACEMENT
        problem_agg = Multiset(allow_negative=True)
        for _, per_bug in bugs.iteritems():
            # ONLY COUNT NEGATIVE EMAILS
            for email, count in per_bug.dic.iteritems():
                if count < 0:
                    problem_agg.add(alias(email), amount=count)

        candidates = []
        for email, count in problem_agg.dic.iteritems():
            if aliases.get(email, Null).ignore:
                continue
            if count <= cutoff or last_run:
                candidates.append({"email": email, "count": count})
        problems = Q.sort(candidates, ["count", "email"])

        try_again = False
        for problem in problems:
            if please_stop:
                break

            # FIND MOST LIKELY MATCH
            solution_agg = Multiset(allow_negative=True)
            for _, per_bug in bugs.iteritems():
                # ONLY BUGS THAT ARE EXPERIENCING THIS problem
                if per_bug.dic.get(problem.email, 0) < 0:
                    solution_agg += per_bug

            tally = [
                {"email": email, "count": count}
                for email, count in solution_agg.dic.iteritems()
            ]
            solutions = Q.sort(tally, [{"field": "count", "sort": -1}, "email"])

            exact_match = (
                last_run
                and len(solutions) == 2
                and solutions[0].count == -solutions[1].count
            )
            if not exact_match:
                if len(solutions) <= 1 or (solutions[1].count + DIFF >= solutions[0].count):
                    # NOT DISTINCTIVE ENOUGH
                    continue

            best_solution = solutions[0]
            top_counts = Q.select(solutions, "count")[:10:]
            Log.note("{{problem}} ({{score}}) -> {{solution}} {{matches}}", {
                "problem": problem.email,
                "score": problem.count,
                "solution": best_solution.email,
                "matches": CNV.object2JSON(top_counts)
            })

            try_again = True
            add_alias(problem.email, best_solution.email)

    saveAliases(settings)
bugs b LEFT JOIN bug_group_map m ON m.bug_id=b.bug_id WHERE delta_ts >= {{start_time_str}} AND m.bug_id IS NULL """, { "start_time_str": param.start_time_str }), u"bug_id") if not bug_list: return with Thread.run("alias analysis", alias_analysis.main, settings=settings, bug_list=bug_list): Log.note("Updating {{num}} bugs:\n{{bug_list|indent}}", { "num": len(bug_list), "bug_list": bug_list }) param.bug_list = bug_list run_both_etl(**{ "db": db, "output_queue": output_queue, "es_comments": es_comments, "param": param.copy() }) def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue): with Thread.run("alias_analysis", alias_analysis.main, settings=settings): end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id) start = nvl(settings.param.start, 0) if resume_from_last_run:
def test_changes_to_private_bugs_still_have_bug_group(self):
    """
    MARK THREE SAMPLED BUGS WITH BUG_GROUP_FOR_TESTING, RUN bz_etl.main(),
    CHANGE bug_status ON THOSE BUGS, RUN bz_etl.main() AGAIN, THEN CHECK THAT
    THE CURRENT RECORD OF EACH BUG IN THE "candidate" INSTANCE STILL LISTS
    BUG_GROUP_FOR_TESTING.  Log.error() IS CALLED FOR EACH FAILED EXPECTATION.
    """
    params = self.settings.param
    params.allow_private_bugs = True
    File(params.first_run_time).delete()
    File(params.last_run_time).delete()

    private_bugs = set(Random.sample(params.bugs, 3))
    Log.note("The private bugs for this test are {{bugs}}", {"bugs": private_bugs})

    database.make_test_instance(self.settings.bugzilla)

    # MARK SOME BUGS PRIVATE
    with DB(self.settings.bugzilla) as bugzilla:
        for bug_id in private_bugs:
            database.add_bug_group(bugzilla, bug_id, BUG_GROUP_FOR_TESTING)

    es = elasticsearch.make_test_instance("candidate", self.settings.real.bugs)
    es_c = elasticsearch.make_test_instance("candidate_comments", self.settings.real.comments)
    bz_etl.main(self.settings, es, es_c)

    # MAKE A CHANGE TO THE PRIVATE BUGS
    with DB(self.settings.bugzilla) as bugzilla:
        for bug_id in private_bugs:
            rows = bugzilla.query(
                "SELECT * FROM bugs WHERE bug_id={{bug_id}}",
                {"bug_id": bug_id}
            )
            before = rows[0]
            after = before.copy()
            after.bug_status = "NEW STATUS"
            diff(bugzilla, "bugs", before, after)

    # RUN INCREMENTAL
    bz_etl.main(self.settings, es, es_c)

    # VERIFY BUG GROUP STILL EXISTS
    Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
    now = datetime.utcnow()
    still_current = {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
    is_test_bug = {"terms": {"bug_id": private_bugs}}
    results = es.search({
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [is_test_bug, still_current]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    })

    latest_bugs = Q.select(results.hits.hits, "_source")
    # IF NOT UNIQUE, THEN ETL IS WRONG
    latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")

    for bug_id in private_bugs:
        latest = latest_bugs_index[bug_id]
        # == (NOT is) KEPT ON PURPOSE: THE LOOKUP MAY GIVE BACK AN OBJECT THAT
        # ONLY COMPARES EQUAL TO None - TODO CONFIRM AGAINST Q.unique_index
        if latest == None:
            Log.error("Expecting to find the private bug {{bug_id}}", {"bug_id": bug_id})

        bug_group = latest_bugs_index[bug_id].bug_group
        if not bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a bug group", {"bug_id": bug_id})
        if BUG_GROUP_FOR_TESTING not in bug_group:
            Log.error("Expecting private bug ({{bug_id}}) to have a \"{{bug_group}}\" bug group", {
                "bug_id": bug_id,
                "bug_group": BUG_GROUP_FOR_TESTING
            })
def test_private_attachments_not_leaking(self):
    """
    CHECK, ONE BLOCK OF bug_id AT A TIME (self.blocks_of_bugs()), THAT NO
    attach_id TAKEN FROM self.private - WHERE THE ATTACHMENT IS PRIVATE, OR
    THE BUG HAS A bug_group - SHOWS UP ON A CURRENT BUG IN self.public.

    A BLOCK WITH LEAKS IS LOGGED BUG BY BUG, DELETED FROM self.public WHEN
    settings.param.delete IS SET, AND THEN REPORTED WITH Log.error().
    """
    for min_id, max_id in self.blocks_of_bugs():
        block_range = {"range": {"bug_id": {"gte": min_id, "lt": max_id}}}
        is_current = {"range": {"expires_on": {"gte": NOW}}}

        # FIND ALL PRIVATE ATTACHMENTS
        bugs_w_private_attachments = get(
            self.private,
            {"and": [
                block_range,
                is_current,  # CURRENT RECORDS
                {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  # OF A MINIMUM AGE
                {"nested": {  # HAS ATTACHMENT.
                    "path": "attachments",
                    "query": {"filtered": {
                        "query": {"match_all": {}},
                        "filter": {"exists": {"field": "attachments.attach_id"}}
                    }}
                }},
                {"or": [
                    {"nested": {  # PRIVATE ATTACHMENT, OR...
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"term": {"attachments.isprivate": 1}}
                        }}
                    }},
                    {"exists": {"field": "bug_group"}}  # ...PRIVATE BUG
                ]}
            ]},
            fields=["bug_id", "bug_group", "attachments", "modified_ts"]
        )

        # THE QUERY IS RUN ONCE; PREVIOUSLY THE SAME QUERY WAS REPEATED INSIDE
        # THE except HANDLER ONLY TO GET BACK THE VALUE ALREADY HELD HERE
        selected_ids = Q.run({
            "from": bugs_w_private_attachments,
            "select": "attachments.attach_id",
            "where": {"or": [
                {"exists": "bug_group"},
                {"terms": {"attachments.isprivate": ['1', True, 1]}}
            ]}
        })
        try:
            private_attachments = [int(v) for v in selected_ids]
        except Exception:
            # BEST EFFORT: IF ANY VALUE WILL NOT CONVERT TO int, SEARCH WITH
            # THE VALUES AS THEY CAME BACK
            private_attachments = selected_ids

        Log.note("Ensure {{num}} attachments did not leak", {"num": len(private_attachments)})

        # VERIFY NONE IN PUBLIC
        # NOTE(review): FIELD IS WRITTEN "attach_id" HERE BUT
        # "attachments.attach_id" IN THE PRIVATE QUERY - CONFIRM THIS IS INTENDED
        leaked_bugs = get(
            self.public,
            {"and": [
                {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                {"range": {"expires_on": {"gte": NOW}}},  # CURRENT BUGS
                {"nested": {
                    "path": "attachments",
                    "query": {"filtered": {
                        "query": {"match_all": {}},
                        "filter": {"terms": {"attach_id": private_attachments}}
                    }}
                }}
            ]}
            # fields=["bug_id", "attachments"]
        )

        if not leaked_bugs:
            continue

        if self.settings.param.delete:
            self.public.delete_record({"terms": {"bug_id": leaked_bugs.bug_id}})

        Log.note("{{num}} bugs with private attachments have leaked!", {"num": len(leaked_bugs)})
        for b in leaked_bugs:
            Log.note("{{bug_id}} has private_attachment\n{{version|indent}}", {
                "bug_id": b.bug_id,
                "version": b
            })
        Log.error("Attachments have leaked!")