def setup_es(settings, db, es, es_comments):
    """
    SETUP ES CONNECTIONS TO REFLECT IF WE ARE RESUMING, INCREMENTAL, OR STARTING OVER

    Returns (current_run_time, es, es_comments, last_run_time) for the branches
    shown here; uses two marker files as the state machine:
      * both first_run_time and last_run_time exist -> incremental update
      * only first_run_time exists                  -> resume the initial fill
    NOTE(review): when NEITHER marker exists this function falls off the end and
    returns None, yet the caller unpacks four values — presumably the
    "start over, make new index" branch lives outside this view; confirm.
    """
    current_run_time = get_current_time(db)

    first_run_marker = File(settings.param.first_run_time)
    last_run_marker = File(settings.param.last_run_time)

    if first_run_marker.exists and last_run_marker.exists:
        # INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
        last_run_time = long(last_run_marker.read())
        if not es:
            es = ElasticSearch(settings.es)
            es_comments = ElasticSearch(settings.es_comments)
    elif first_run_marker.exists:
        # DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
        try:
            last_run_time = 0
            current_run_time = long(first_run_marker.read())
            if not es:
                if not settings.es.alias:
                    # NO ALIAS CONFIGURED: TREAT THE CONFIGURED INDEX NAME AS THE
                    # ALIAS AND POINT AT THE NEWEST CONCRETE INDEX BEHIND IT
                    proto = ElasticSearch(settings.es).get_proto(settings.es.index)
                    settings.es.alias = settings.es.index
                    settings.es.index = proto.last()
                es = ElasticSearch(settings.es)
                # REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY
                es.set_refresh_interval(1)

                if not settings.es_comments.alias:
                    proto = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
                    settings.es_comments.alias = settings.es_comments.index
                    settings.es_comments.index = proto.last()
                es_comments = ElasticSearch(settings.es_comments)
        except Exception as e:
            # RESUME STATE IS UNUSABLE: WIPE THE FIRST-RUN MARKER AND RETRY FROM SCRATCH
            Log.warning("can not resume ETL, restarting", e)
            first_run_marker.delete()
            return setup_es(settings, db, es, es_comments)
def main(settings, es=None, es_comments=None):
    """
    MAIN ETL LOOP: connect to Bugzilla DB and ES, run either the incremental
    or the full ETL depending on the marker-file state from setup_es(), then
    swing the index aliases and record last_run_time.

    settings    - full config Struct (param, bugzilla, es, es_comments)
    es          - optional pre-made ES handle for bugs (made here if None)
    es_comments - optional pre-made ES handle for comments
    Raises via Log.error on any failure of the ETL loop.
    """
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    # first_run marker without last_run marker means a previous initial fill was interrupted
    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE GO BACK look_back MILLIS (DEFAULT 5 MINUTES), JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            # BUGFIX: stale COMMENT indexes must be deleted through the comments
            # handle (was `es.delete_all_but(...)`, the bugs handle, while the
            # add_alias below already used es_comments)
            es_comments.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        # ONLY MARK THE RUN COMPLETE AFTER ALIASES ARE SWUNG
        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception as e:
        Log.error("Problem with main ETL loop", e)
def main(settings, es=None, es_comments=None):
    """
    MAIN ETL LOOP (duplicate copy in this file): pull from the Bugzilla DB,
    push to ES (bugs and, when allowed, comments), then swing aliases and
    persist last_run_time so the next run is incremental.

    settings    - full config Struct (param, bugzilla, es, es_comments)
    es          - optional pre-made ES handle for bugs (made here if None)
    es_comments - optional pre-made ES handle for comments
    Raises via Log.error on any failure of the ETL loop.
    """
    if not settings.param.allow_private_bugs and es and not es_comments:
        Log.error("Must have ES for comments")

    # interrupted initial fill: first_run marker exists, last_run marker does not
    resume_from_last_run = File(settings.param.first_run_time).exists and not File(settings.param.last_run_time).exists

    #MAKE HANDLES TO CONTAINERS
    try:
        with DB(settings.bugzilla, readonly=True) as db:
            current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)

            with ThreadedQueue(es, size=500, silent=True) as output_queue:
                #SETUP RUN PARAMETERS
                param = Struct()
                param.end_time = CNV.datetime2milli(get_current_time(db))
                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
                # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE GO BACK look_back MILLIS (DEFAULT 5 MINUTES), JUST IN CASE.
                # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                param.alias_file = settings.param.alias_file
                param.allow_private_bugs = settings.param.allow_private_bugs

                if last_run_time > 0:
                    with Timer("run incremental etl"):
                        incremental_etl(settings, param, db, es, es_comments, output_queue)
                else:
                    with Timer("run full etl"):
                        full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue)

                output_queue.add(Thread.STOP)

        if settings.es.alias:
            es.delete_all_but(settings.es.alias, settings.es.index)
            es.add_alias(settings.es.alias)

        if settings.es_comments.alias:
            # BUGFIX: delete stale COMMENT indexes via the comments handle
            # (original used the bugs handle `es` here, contradicting the
            # es_comments.add_alias on the next line)
            es_comments.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
            es_comments.add_alias(settings.es_comments.alias)

        # RECORD COMPLETION LAST, SO A CRASH ABOVE FORCES A RE-RUN
        File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
    except Exception as e:
        Log.error("Problem with main ETL loop", e)
def setup_es(settings, db, es, es_comments):
    """
    SETUP ES CONNECTIONS TO REFLECT IF WE ARE RESUMING, INCREMENTAL, OR STARTING OVER

    Two marker files drive the decision:
      * first_run_time AND last_run_time exist -> incremental update
      * only first_run_time exists             -> resume the initial fill
    NOTE(review): the branch for "neither file exists" is not visible here, so
    as written the function can fall through returning None while the caller
    unpacks four values — presumably the new-index branch follows; confirm.
    """
    current_run_time = get_current_time(db)

    def _resolve_alias(es_settings):
        # IF NO ALIAS IS CONFIGURED, PROMOTE THE CONFIGURED INDEX NAME TO THE
        # ALIAS AND POINT settings AT THE NEWEST CONCRETE INDEX BEHIND IT
        if not es_settings.alias:
            proto = ElasticSearch(es_settings).get_proto(es_settings.index)
            es_settings.alias = es_settings.index
            es_settings.index = proto.last()

    if File(settings.param.first_run_time).exists and File(settings.param.last_run_time).exists:
        # INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
        last_run_time = long(File(settings.param.last_run_time).read())
        if not es:
            es = ElasticSearch(settings.es)
            es_comments = ElasticSearch(settings.es_comments)
    elif File(settings.param.first_run_time).exists:
        # DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
        try:
            last_run_time = 0
            current_run_time = long(File(settings.param.first_run_time).read())
            if not es:
                _resolve_alias(settings.es)
                es = ElasticSearch(settings.es)
                es.set_refresh_interval(1)  # REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY

                _resolve_alias(settings.es_comments)
                es_comments = ElasticSearch(settings.es_comments)
        except Exception as e:
            # CORRUPT RESUME STATE: DROP THE MARKER AND START THE DECISION OVER
            Log.warning("can not resume ETL, restarting", e)
            File(settings.param.first_run_time).delete()
            return setup_es(settings, db, es, es_comments)