def run_both_etl(db, output_queue, es_comments, param): comment_thread = Thread.run("etl comments", etl_comments, db, es_comments, param) process_thread = Thread.run("etl", etl, db, output_queue, param) result = comment_thread.join() if result.exception: Log.error("etl_comments had problems", result.exception) result = process_thread.join() if result.exception: Log.error("etl had problems", result.exception)
def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue): with Thread.run("alias_analysis", alias_analysis.main, settings=settings): end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id) start = nvl(settings.param.start, 0) if resume_from_last_run: start = nvl(settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment)) ############################################################# ## MAIN ETL LOOP ############################################################# #TWO WORKERS IS MORE THAN ENOUGH FOR A SINGLE THREAD # with Multithread([run_both_etl, run_both_etl]) as workers: for min, max in Q.intervals(start, end, settings.param.increment): if settings.args.quick and min < end - settings.param.increment and min != 0: #--quick ONLY DOES FIRST AND LAST BLOCKS continue try: #GET LIST OF CHANGED BUGS with Timer("time to get {{min}}..{{max}} bug list", {"min":min, "max":max}): if param.allow_private_bugs: bug_list = Q.select(db.query(""" SELECT b.bug_id FROM bugs b WHERE delta_ts >= {{start_time_str}} AND ({{min}} <= b.bug_id AND b.bug_id < {{max}}) """, { "min": min, "max": max, "start_time_str": param.start_time_str }), u"bug_id") else: bug_list = Q.select(db.query(""" SELECT b.bug_id FROM bugs b LEFT JOIN bug_group_map m ON m.bug_id=b.bug_id WHERE delta_ts >= {{start_time_str}} AND ({{min}} <= b.bug_id AND b.bug_id < {{max}}) AND m.bug_id IS NULL """, { "min": min, "max": max, "start_time_str": param.start_time_str }), u"bug_id") if not bug_list: continue param.bug_list = bug_list run_both_etl(**{ "db": db, "output_queue": output_queue, "es_comments": es_comments, "param": param.copy() }) except Exception, e: Log.error("Problem with dispatch loop in range [{{min}}, {{max}})", { "min": min, "max": max }, e)
b.bug_id FROM bugs b LEFT JOIN bug_group_map m ON m.bug_id=b.bug_id WHERE delta_ts >= {{start_time_str}} AND m.bug_id IS NULL """, { "start_time_str": param.start_time_str }), u"bug_id") if not bug_list: return with Thread.run("alias analysis", alias_analysis.main, settings=settings, bug_list=bug_list): Log.note("Updating {{num}} bugs:\n{{bug_list|indent}}", { "num": len(bug_list), "bug_list": bug_list }) param.bug_list = bug_list run_both_etl(**{ "db": db, "output_queue": output_queue, "es_comments": es_comments, "param": param.copy() }) def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue): with Thread.run("alias_analysis", alias_analysis.main, settings=settings):
def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue): with Thread.run("alias_analysis", alias_analysis.main, settings=settings): end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id) start = nvl(settings.param.start, 0) if resume_from_last_run: start = nvl( settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment)) ############################################################# ## MAIN ETL LOOP ############################################################# #TWO WORKERS IS MORE THAN ENOUGH FOR A SINGLE THREAD # with Multithread([run_both_etl, run_both_etl]) as workers: for min, max in Q.intervals(start, end, settings.param.increment): if settings.args.quick and min < end - settings.param.increment and min != 0: #--quick ONLY DOES FIRST AND LAST BLOCKS continue try: #GET LIST OF CHANGED BUGS with Timer("time to get {{min}}..{{max}} bug list", { "min": min, "max": max }): if param.allow_private_bugs: bug_list = Q.select( db.query( """ SELECT b.bug_id FROM bugs b WHERE delta_ts >= {{start_time_str}} AND ({{min}} <= b.bug_id AND b.bug_id < {{max}}) """, { "min": min, "max": max, "start_time_str": param.start_time_str }), u"bug_id") else: bug_list = Q.select( db.query( """ SELECT b.bug_id FROM bugs b LEFT JOIN bug_group_map m ON m.bug_id=b.bug_id WHERE delta_ts >= {{start_time_str}} AND ({{min}} <= b.bug_id AND b.bug_id < {{max}}) AND m.bug_id IS NULL """, { "min": min, "max": max, "start_time_str": param.start_time_str }), u"bug_id") if not bug_list: continue param.bug_list = bug_list run_both_etl( **{ "db": db, "output_queue": output_queue, "es_comments": es_comments, "param": param.copy() }) except Exception, e: Log.error( "Problem with dispatch loop in range [{{min}}, {{max}})", { "min": min, "max": max }, e)
SELECT b.bug_id FROM bugs b LEFT JOIN bug_group_map m ON m.bug_id=b.bug_id WHERE delta_ts >= {{start_time_str}} AND m.bug_id IS NULL """, {"start_time_str": param.start_time_str}), u"bug_id") if not bug_list: return with Thread.run("alias analysis", alias_analysis.main, settings=settings, bug_list=bug_list): Log.note("Updating {{num}} bugs:\n{{bug_list|indent}}", { "num": len(bug_list), "bug_list": bug_list }) param.bug_list = bug_list run_both_etl( **{ "db": db, "output_queue": output_queue, "es_comments": es_comments, "param": param.copy() })