def setup_es(settings, db, es, es_comments):
    """
    SETUP ES CONNECTIONS TO REFLECT IF WE ARE RESUMING, INCREMENTAL, OR STARTING OVER
    """
    current_run_time = get_current_time(db)

    if File(settings.param.first_run_time).exists and File(settings.param.last_run_time).exists:
        # INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
        last_run_time = long(File(settings.param.last_run_time).read())
        if not es:
            es = ElasticSearch(settings.es)
            es_comments = ElasticSearch(settings.es_comments)
    elif File(settings.param.first_run_time).exists:
        # DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
        try:
            last_run_time = 0
            current_run_time = long(File(settings.param.first_run_time).read())
            if not es:
                if not settings.es.alias:
                    temp = ElasticSearch(settings.es).get_proto(settings.es.index)
                    settings.es.alias = settings.es.index
                    settings.es.index = temp.last()
                es = ElasticSearch(settings.es)
                es.set_refresh_interval(1)  # REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY

                if not settings.es_comments.alias:
                    temp = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
                    settings.es_comments.alias = settings.es_comments.index
                    settings.es_comments.index = temp.last()
                es_comments = ElasticSearch(settings.es_comments)
        except Exception, e:
            Log.warning("can not resume ETL, restarting", e)
            File(settings.param.first_run_time).delete()
            return setup_es(settings, db, es, es_comments)
    else:
        # START ETL FROM BEGINNING, MAKE NEW INDEX
        last_run_time = 0
        if not es:
            # BUG VERSIONS
            schema = File(settings.es.schema_file).read()
            if transform_bugzilla.USE_ATTACHMENTS_DOT:
                schema = schema.replace("attachments_", "attachments\\.")
            schema = CNV.JSON2object(schema, paths=True)
            schema.settings = jsons.expand_dot(schema.settings)
            if not settings.es.alias:
                settings.es.alias = settings.es.index
                settings.es.index = ElasticSearch.proto_name(settings.es.alias)
            es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True)

            # BUG COMMENTS
            comment_schema = File(settings.es_comments.schema_file).read()
            comment_schema = CNV.JSON2object(comment_schema, paths=True)
            comment_schema.settings = jsons.expand_dot(comment_schema.settings)
            if not settings.es_comments.alias:
                settings.es_comments.alias = settings.es_comments.index
                settings.es_comments.index = ElasticSearch.proto_name(settings.es_comments.alias)
            es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True)

        File(settings.param.first_run_time).write(unicode(CNV.datetime2milli(current_run_time)))

    return current_run_time, es, es_comments, last_run_time
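# A hedged illustration of the resume logic above: the two marker files encode
# three ETL states. etl_state() is NOT part of this codebase; it only restates
# the branch structure of setup_es() for clarity.
def etl_state(first_run_file, last_run_file):
    if first_run_file.exists and last_run_file.exists:
        return "incremental"           # BOTH MARKERS: PICK UP FROM last_run_time
    elif first_run_file.exists:
        return "resume_initial_fill"   # FIRST MARKER ONLY: INITIAL FILL WAS INTERRUPTED
    else:
        return "start_over"            # NO MARKERS: MAKE NEW INDEXES FROM SCHEMA FILES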
def open_test_instance(name, settings):
    if settings.filename:
        Log.note("Using {{filename}} as {{type}}", {
            "filename": settings.filename,
            "type": name
        })
        return Fake_ES(settings)
    else:
        Log.note("Using ES cluster at {{host}} as {{type}}", {
            "host": settings.host,
            "type": name
        })
        ElasticSearch.delete_index(settings)

        schema = CNV.JSON2object(File(settings.schema_file).read(), flexible=True, paths=True)
        es = ElasticSearch.create_index(settings, schema, limit_replicas=True)
        return es
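# Hypothetical usage of open_test_instance(): a settings block with a
# "filename" yields a file-backed Fake_ES, one without yields a freshly
# created index on the cluster. The path below is illustrative only.
candidate_settings = Struct()
candidate_settings.filename = "tests/resources/candidate.json"  # ILLUSTRATIVE PATH
candidate = open_test_instance("candidate", candidate_settings)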
def random_sample_of_bugs(self):
    """
    I USE THIS TO FIND BUGS THAT CAUSE MY CODE PROBLEMS.  OF COURSE, IT ONLY
    WORKS WHEN I HAVE A REFERENCE TO COMPARE TO
    """
    NUM_TO_TEST = 100
    MAX_BUG_ID = 900000

    with DB(self.settings.bugzilla) as db:
        candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
        reference = ElasticSearch(self.settings.private_bugs_reference)

        # GO FASTER BY STORING LOCAL FILE
        local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
        if local_cache.exists:
            private_bugs = set(CNV.JSON2object(local_cache.read()))
        else:
            with Timer("get private bugs"):
                private_bugs = compare_es.get_private_bugs(reference)
                local_cache.write(CNV.object2JSON(private_bugs))

        while True:
            some_bugs = [
                b
                for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)]
                if b not in private_bugs
            ]
            Log.note("Test with the following bug_ids: {{bugs}}", {"bugs": some_bugs})

            # SETUP RUN PARAMETERS
            param = Struct()
            param.end_time = CNV.datetime2milli(get_current_time(db))
            param.start_time = 0
            param.start_time_str = extract_bugzilla.milli2string(db, 0)
            param.alias_file = self.settings.param.alias_file

            try:
                with ThreadedQueue(candidate, 100) as output:
                    etl(db, output, param, please_stop=None)

                # COMPARE ALL BUGS
                Thread.sleep(2)  # MUST SLEEP WHILE ES DOES ITS INDEXING
                found_errors = compare_both(candidate, reference, self.settings, some_bugs)
                if found_errors:
                    Log.note("Errors found")
                    break
            except Exception, e:
                Log.warning("Total failure during compare of bugs {{bugs}}", {"bugs": some_bugs}, e)
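# Note the sampling above is rejection sampling: private draws are discarded,
# so each round tests UP TO NUM_TO_TEST bugs, not exactly NUM_TO_TEST. A
# hedged alternative that makes the count exact (names are illustrative, and
# Random.int(n) is assumed to return a uniform integer in [0, n)):
def sample_public_bug_ids(private_bugs, num_to_test=100, max_bug_id=900000):
    some_bugs = set()
    while len(some_bugs) < num_to_test:
        b = Random.int(max_bug_id)
        if b not in private_bugs:
            some_bugs.add(b)   # SET MEMBERSHIP ALSO REMOVES DUPLICATE DRAWS
    return list(some_bugs)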
def test_replication():
    try:
        settings = startup.read_settings(filename="replication_settings.json")
        Log.start(settings.debug)

        source = ElasticSearch(settings.source)
        destination = replicate.get_or_create_index(settings["destination"], source)

        replicate.replicate(source, destination, [537285], CNV.string2datetime("19900101", "%Y%m%d"))
    finally:
        Log.stop()
def main(settings):
    # USE A FILE
    if settings.source.filename != None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        if transform_bugzilla.USE_ATTACHMENTS_DOT:
            schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))
        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
        dest.set_refresh_interval(-1)
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)

        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)

    # GET LAST UPDATED
    time_file = File(settings.param.last_replication_time)
    from_file = None
    if time_file.exists:
        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
    from_es = get_last_updated(destination)
    last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
    current_time = datetime.utcnow()

    pending = get_pending(source, last_updated)
    with ThreadedQueue(destination, size=1000) as data_sink:
        replicate(source, data_sink, pending, last_updated)

    # RECORD LAST UPDATED
    time_file.write(unicode(CNV.datetime2milli(current_time)))
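# A hedged, worked example of the "GET LAST UPDATED" merge above: the earlier
# of the file-recorded and ES-recorded timestamps wins, falling back to epoch
# zero when neither exists, so no pending records are skipped. Values are
# illustrative.
from_file = CNV.string2datetime("20130105", "%Y%m%d")
from_es = CNV.string2datetime("20130101", "%Y%m%d")
last_updated = nvl(MIN(from_file, from_es), CNV.milli2datetime(0))
# last_updated IS 2013-01-01: REPLICATION RESUMES FROM THE OLDER MARKER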
def get_or_create_index(destination_settings, source):
    # CHECK IF INDEX, OR ALIAS, EXISTS
    es = ElasticSearch(destination_settings)
    aliases = es.get_aliases()

    indexes = [a for a in aliases if a.alias == destination_settings.index]
    if not indexes:
        # CREATE INDEX
        schema = source.get_schema()
        assert schema.settings
        assert schema.mappings
        ElasticSearch.create_index(destination_settings, schema, limit_replicas=True)
    elif len(indexes) > 1:
        Log.error("do not know how to replicate to more than one index")
    elif indexes[0].alias != None:
        destination_settings.alias = destination_settings.index
        destination_settings.index = indexes[0].index

    return ElasticSearch(destination_settings)
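# For clarity, a hedged restatement of the alias case above: when the
# configured name is actually an alias, the settings are rewritten so that
# .alias keeps the public name and .index points at the concrete index behind
# it. The index name below is illustrative, not the actual proto_name format:
#
#   before: destination_settings.index = "bugs"            (an alias)
#   after:  destination_settings.alias = "bugs"
#           destination_settings.index = "bugs20130615_000000"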
class TestLookForLeaks(unittest.TestCase):
    def setUp(self):
        settings = startup.read_settings(filename="leak_check_settings.json")
        Log.start(settings.debug)
        self.private = ElasticSearch(settings.private)
        self.public = ElasticSearch(settings.public)
        self.public_comments = ElasticSearch(settings.public_comments)
        self.settings = settings

    def tearDown(self):
        Log.stop()

    def blocks_of_bugs(self):
        max_bug_id = self.private.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [{"match_all": {}}]}
            }},
            "from": 0,
            "size": 0,
            "sort": [],
            "facets": {"0": {"statistical": {"field": "bug_id"}}}
        }).facets["0"].max

        return reversed(list(Q.intervals(0, max_bug_id, self.settings.param.increment)))

    def test_private_bugs_not_leaking(self):
        bad_news = False

        # FOR ALL BUG BLOCKS
        for min_id, max_id in self.blocks_of_bugs():
            results = get(
                self.private,
                {"and": [
                    {"match_all": {}},
                    {"and": [
                        {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                        {"exists": {"field": "bug_group"}},
                        {"range": {"expires_on": {"gte": NOW}}},          # CURRENT RECORDS
                        {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  # OF A MINIMUM AGE
                    ]}
                ]},
                ["bug_id", "bug_group", "modified_ts"]
            )

            private_ids = {b.bug_id: b.bug_group for b in results}

            Log.note("Ensure {{num}} bugs did not leak", {
                "num": len(private_ids.keys())
            })

            # VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {"and": [
                    {"terms": {"bug_id": private_ids.keys()}},
                    {"range": {"expires_on": {"gte": NOW}}}  # SOME BUGS WILL LEAK FOR A LITTLE WHILE
                ]}
            )

            if leaked_bugs:
                bad_news = True
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms": {"bug_id": leaked_bugs.bug_id}}
                    )

                Log.note("{{num}} leaks!! {{bugs}}", {
                    "num": len(leaked_bugs),
                    "bugs": Q.run({
                        "from": leaked_bugs,
                        "select": [
                            "bug_id",
                            "bug_version_num",
                            {"name": "modified_ts", "value": lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))}
                        ],
                        "sort": "bug_id"
                    })
                })
                for b in leaked_bugs:
                    Log.note("{{bug_id}} has bug groups {{bug_group}}\n{{version|indent}}", {
                        "bug_id": b.bug_id,
                        "bug_group": private_ids[b.bug_id],
                        "version": milli2datetime(b)
                    })

            # CHECK FOR LEAKED COMMENTS, BEYOND THE ONES LEAKED BY BUG
            leaked_comments = get(
                self.public_comments,
                {"terms": {"bug_id": private_ids.keys()}},
                limit=20
            )
            if leaked_comments:
                bad_news = True

                if self.settings.param.delete:
                    self.public_comments.delete_record(
                        {"terms": {"bug_id": leaked_comments.bug_id}}
                    )

                Log.warning("{{num}} comments marked private have leaked!\n{{comments|indent}}", {
                    "num": len(leaked_comments),
                    "comments": leaked_comments
                })

        if bad_news:
            Log.error("Bugs have leaked!")

    def test_private_attachments_not_leaking(self):
        for min_id, max_id in self.blocks_of_bugs():
            # FIND ALL PRIVATE ATTACHMENTS
            bugs_w_private_attachments = get(
                self.private,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}},          # CURRENT RECORDS
                    {"range": {"modified_ts": {"lt": A_WHILE_AGO}}},  # OF A MINIMUM AGE
                    {"nested": {  # HAS ATTACHMENT
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"exists": {"field": "attachments.attach_id"}}
                        }}
                    }},
                    {"or": [
                        {"nested": {  # PRIVATE ATTACHMENT, OR...
                            "path": "attachments",
                            "query": {"filtered": {
                                "query": {"match_all": {}},
                                "filter": {"term": {"attachments.isprivate": 1}}
                            }}
                        }},
                        {"exists": {"field": "bug_group"}}  # ...PRIVATE BUG
                    ]}
                ]},
                fields=["bug_id", "bug_group", "attachments", "modified_ts"]
            )

            private_attachments = Q.run({
                "from": bugs_w_private_attachments,
                "select": "attachments.attach_id",
                "where": {"or": [
                    {"exists": "bug_group"},
                    {"terms": {"attachments.isprivate": ['1', True, 1]}}
                ]}
            })
            try:
                private_attachments = [int(v) for v in private_attachments]
            except Exception, e:
                pass  # KEEP THE RAW VALUES IF ANY attach_id IS NOT NUMERIC

            Log.note("Ensure {{num}} attachments did not leak", {
                "num": len(private_attachments)
            })

            # VERIFY NONE IN PUBLIC
            leaked_bugs = get(
                self.public,
                {"and": [
                    {"range": {"bug_id": {"gte": min_id, "lt": max_id}}},
                    {"range": {"expires_on": {"gte": NOW}}},  # CURRENT BUGS
                    {"nested": {
                        "path": "attachments",
                        "query": {"filtered": {
                            "query": {"match_all": {}},
                            "filter": {"terms": {"attach_id": private_attachments}}
                        }}
                    }}
                ]}
                # fields=["bug_id", "attachments"]
            )

            if leaked_bugs:
                if self.settings.param.delete:
                    self.public.delete_record(
                        {"terms": {"bug_id": leaked_bugs.bug_id}}
                    )

                Log.note("{{num}} bugs with private attachments have leaked!", {"num": len(leaked_bugs)})
                for b in leaked_bugs:
                    Log.note("{{bug_id}} has private_attachment\n{{version|indent}}", {
                        "bug_id": b.bug_id,
                        "version": b
                    })
                Log.error("Attachments have leaked!")
Log.warning("can not resume ETL, restarting", e) File(settings.param.first_run_time).delete() return setup_es(settings, db, es, es_comments) else: # START ETL FROM BEGINNING, MAKE NEW INDEX last_run_time = 0 if not es: # BUG VERSIONS schema = File(settings.es.schema_file).read() if transform_bugzilla.USE_ATTACHMENTS_DOT: schema = schema.replace("attachments_", "attachments\\.") schema=CNV.JSON2object(schema, paths=True) schema.settings=jsons.expand_dot(schema.settings) if not settings.es.alias: settings.es.alias = settings.es.index settings.es.index = ElasticSearch.proto_name(settings.es.alias) es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True) # BUG COMMENTS comment_schema = File(settings.es_comments.schema_file).read() comment_schema=CNV.JSON2object(comment_schema, paths=True) comment_schema.settings=jsons.expand_dot(comment_schema.settings) if not settings.es_comments.alias: settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = ElasticSearch.proto_name(settings.es_comments.alias) es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True) File(settings.param.first_run_time).write(unicode(CNV.datetime2milli(current_run_time))) return current_run_time, es, es_comments, last_run_time
def normalize(bug, old_school=False):
    bug = bug.copy()
    bug.id = unicode(bug.bug_id) + "_" + unicode(bug.modified_ts)[:-3]
    bug._id = None

    # ENSURE STRUCTURES ARE SORTED
    # Do some processing to make sure that diffing between runs stays as similar as possible.
    bug.flags = Q.sort(bug.flags, "value")

    if bug.attachments:
        if USE_ATTACHMENTS_DOT:
            bug.attachments = CNV.JSON2object(CNV.object2JSON(bug.attachments).replace("attachments_", "attachments."))
        bug.attachments = Q.sort(bug.attachments, "attach_id")
        for a in bug.attachments:
            for k, v in list(a.items()):
                if k.startswith("attachments") and (k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate")):
                    new_v = CNV.value2int(v)
                    new_k = k[12:]
                    a[k.replace(".", "\\.")] = new_v
                    if not old_school:
                        a[new_k] = new_v
            a.flags = Q.sort(a.flags, ["modified_ts", "value"])

    if bug.changes != None:
        if USE_ATTACHMENTS_DOT:
            json = CNV.object2JSON(bug.changes).replace("attachments_", "attachments.")
            bug.changes = CNV.JSON2object(json)
        bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])

    # bug IS CONVERTED TO A 'CLEAN' COPY
    bug = ElasticSearch.scrub(bug)
    # bug.attachments = nvl(bug.attachments, [])    # ATTACHMENTS MUST EXIST

    for f in NUMERIC_FIELDS:
        v = bug[f]
        if v == None:
            continue
        elif f in MULTI_FIELDS:
            bug[f] = CNV.value2intlist(v)
        elif CNV.value2number(v) == 0:
            del bug[f]
        else:
            bug[f] = CNV.value2number(v)

    # Also reformat some date fields
    for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
        v = bug[dateField]
        if v == None:
            continue
        try:
            if isinstance(v, date):
                bug[dateField] = CNV.datetime2milli(v)
            elif isinstance(v, long) and len(unicode(v)) in [12, 13]:
                bug[dateField] = v
            elif not isinstance(v, basestring):
                Log.error("situation not handled")
            elif DATE_PATTERN_STRICT.match(v):
                # Convert "2012/01/01 00:00:00.000" to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v + "000", "%Y/%m/%d %H:%M:%S%f"))
            elif DATE_PATTERN_STRICT_SHORT.match(v):
                # Convert "2012-01-01 00:00:00" to a timestamp.
                # Example: bug 856732 (cf_last_resolved)
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
            elif DATE_PATTERN_RELAXED.match(v):
                # Use only the "2012-01-01" date prefix.
                # Example: bug 643420 (deadline)
                #          bug 726635 (cf_due_date)
                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v[0:10], "%Y-%m-%d"))
        except Exception, e:
            Log.error("problem with converting date to milli (value={{value}})", {"value": bug[dateField]}, e)

    bug.votes = None
    bug.exists = True

    return ElasticSearch.scrub(bug)
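# Hedged usage of normalize(): given a bug version wrapped in a Struct, it
# returns a scrubbed copy whose id is the bug_id plus the modification time
# truncated from milliseconds to seconds. Values below are illustrative.
bug = Struct()
bug.bug_id = 123456
bug.modified_ts = 1357027200000  # 2013-01-01 00:00:00 IN EPOCH MILLISECONDS
clean = normalize(bug)
# clean.id == "123456_1357027200"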
Log.warning("can not resume ETL, restarting", e) File(settings.param.first_run_time).delete() return setup_es(settings, db, es, es_comments) else: # START ETL FROM BEGINNING, MAKE NEW INDEX last_run_time = 0 if not es: # BUG VERSIONS schema = File(settings.es.schema_file).read() if transform_bugzilla.USE_ATTACHMENTS_DOT: schema = schema.replace("attachments_", "attachments\\.") schema = CNV.JSON2object(schema, paths=True) schema.settings = jsons.expand_dot(schema.settings) if not settings.es.alias: settings.es.alias = settings.es.index settings.es.index = ElasticSearch.proto_name(settings.es.alias) es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True) # BUG COMMENTS comment_schema = File(settings.es_comments.schema_file).read() comment_schema = CNV.JSON2object(comment_schema, paths=True) comment_schema.settings = jsons.expand_dot(comment_schema.settings) if not settings.es_comments.alias: settings.es_comments.alias = settings.es_comments.index settings.es_comments.index = ElasticSearch.proto_name( settings.es_comments.alias) es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True)