def update_usage_stats(app, job_id): if not job_id: return False print "Updating usage_stats" try: cmd = [ "/home/ubuntu/task_engine/system_stats.sh", "{0}".format(time.time()) ] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) out, err = proc.communicate() except Exception as e: print "Failed to run system_stats.sh" print "Caught exception : {0}".format(e) return cm.update_creds_from_metadata_server(app) record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) old = record.get("usage_stats", "") current = old + out.strip('\n') st = update_record(record, "usage_stats", current) return
def retract_job(jobid): session = bottle.request.environ.get('beaker.session') conf_man.update_creds_from_metadata_server(request.app) record = dutils.dynamodb_get(request.app.config["dyno.conn"], jobid) print record['username'] if record['username'] == session["username"]: print "Username matches" else: return template( './views/error.tpl', error_str= "You are not the owner of this job :{0} \nInsufficient permissions to retract job" .format(jobid), session=session) record["i_ispublished"] = '0' record.save(overwrite=True) return template("./views/retract_confirm.tpl", job_id=jobid, title="Retract Confirmation", session=session)
def update_job_for_publish(request, job_id): print "Updating job for publish" record = dutils.dynamodb_get(request.app.config["dyno.conn"], job_id) record["i_ispublished"] = '1' record["jobname"] = request.POST.get('jobname').strip() record["description"] = request.POST.get('jobdesc').strip() record["publishdate"] = str(time.strftime('%Y-%m-%d %H:%M:%S')) record.save(overwrite=True) return True
def cancel_task(app, jobid):
    """Mark the job record as cancelled and stamp the cancellation time."""
    debug_print("Cancelling task : {0}".format(jobid))
    record = dutils.dynamodb_get(app.config["dyno.conn"], jobid)
    cancel_stamp = str(time.strftime('%Y-%m-%d %H:%M:%S'))
    # Apply the three cancellation fields in one pass.
    for field, value in [("status", "cancelled"),
                         ("reason", "User request cancel"),
                         ("cancel_time", cancel_stamp)]:
        update_record(record, field, value)
    debug_print("{0} - {1} - {2}".format(record["job_id"],
                                         record["status"],
                                         record["reason"]))
    return True
def cancel_task(app, jobid):
    """Flag the job as cancelled in DynamoDB and record when it happened."""
    debug_print("Cancelling task : {0}".format(jobid))
    rec = dutils.dynamodb_get(app.config["dyno.conn"], jobid)
    now = str(time.strftime("%Y-%m-%d %H:%M:%S"))
    update_record(rec, "status", "cancelled")
    update_record(rec, "reason", "User request cancel")
    update_record(rec, "cancel_time", now)
    debug_print("{0} - {1} - {2}".format(rec["job_id"],
                                         rec["status"],
                                         rec["reason"]))
    return True
def status_task(app, jobid): debug_print("Status task : {0}".format(jobid)) record = dutils.dynamodb_get(app.config["dyno.conn"], jobid) status = {} if GLOBAL_VERBOSE: for item in record.items(): print "|{0:10} | {1:50}".format(item[0], item[1]) print record["status"] return record["status"]
def check_if_cancelled(app, job_id): if not job_id : return False print "Statecheck" cm.update_creds_from_metadata_server(app) record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) if record["status"] == "cancelled": print "Cancelled" return True print "Job not cancelled" return False
def check_if_cancelled(app, job_id): if not job_id: return False print "Statecheck" cm.update_creds_from_metadata_server(app) record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) if record["status"] == "cancelled": print "Cancelled" return True print "Job not cancelled" return False
def update_usage_stats(app, job_id): if not job_id : return False print "Updating usage_stats" try: cmd = ["/home/ubuntu/task_engine/system_stats.sh", "{0}".format(time.time())] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) out, err = proc.communicate() except Exception as e: print "Failed to run system_stats.sh" print "Caught exception : {0}".format(e) return cm.update_creds_from_metadata_server(app) record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) old = record.get("usage_stats", "") current = old + out.strip('\n') st = update_record(record, "usage_stats", current) return
def retract_job(jobid): session = bottle.request.environ.get('beaker.session') conf_man.update_creds_from_metadata_server(request.app) record = dutils.dynamodb_get(request.app.config["dyno.conn"], jobid) print record['username'] if record['username'] == session["username"]: print "Username matches" else: return template('./views/error.tpl', error_str="You are not the owner of this job :{0} \nInsufficient permissions to retract job".format(jobid), session=session) record["i_ispublished"] = '0' record.save(overwrite=True) return template("./views/retract_confirm.tpl", job_id=jobid, title="Retract Confirmation", session=session)
def watch_loop(app): """ Watch_loop looks at the definition of the autoscaling_groups and the active queues to determine whether : 1. An instance needs to be removed from the scaling group and terminated 2. A task has been in the active queue for long and appears to have timed out and needs to be moved to the pending queue, for re-attempt. Why would a task fail ? -> Hard error in task causes worker to fail -> Instance was lost mid run """ status = conf_man.update_creds_from_metadata_server(app) stack_name = app.config["instance.tags"]["aws:cloudformation:stack-name"] autoscale = get_autoscale_info(app, stack_name) print autoscale # Select all relevant queues in our cloudformation stack queues = [ q for q in app.config["sqs.conn"].get_all_queues() if q.name.startswith(stack_name) ] # Select only the active queues active_q = [q for q in queues if "Active" in q.name] pending_q = [q for q in queues if "Active" not in q.name] for q in active_q: print "Active queue : ", q.name qtype = None if "Test" in q.name: qtype = "test" elif "Prod" in q.name: qtype = "prod" else: logging.error("Unknown queue : ".format(q.name)) break # Find the corresponding pending queue to the current active queue p_q = None p_qs = [pq for pq in pending_q if qtype in pq.name.lower()] if len(p_qs) == 1: p_q = p_qs[0] print "Pending queue : {0}".format(p_q) else: logging.error("Found too many pending queues : {0}".format(p_qs)) exit(0) print "Instances in this group : ", autoscale[qtype]["instances"] for i in autoscale[qtype]["instances"]: print i.health_status print i.lifecycle_state while (1): """ Here we get all messages in the current queue and check the following conditions: 1. No more messages to check -> Break 2. If messages exists -> Check if it is a kill_request. 
-> Kill the instance and decrement the autoscale group desired count -> """ messages = q.get_messages(num_messages=10, visibility_timeout=2, wait_time_seconds=1, message_attributes=['All']) if not messages: break for msg in messages: # Check if message is a kill_request if msg.message_attributes["job_id"][ "string_value"] == "kill_request": logging.info("Received a kill_request from : ".format( msg.message_attributes["instance_id"]["string_value"])) # Are there more machines than the minimum if autoscale[qtype]["current"] > autoscale[qtype]["min"]: logging.info( "Instances in autoscale group current:{0} > min:{1}" .format(autoscale[qtype]["current"], autoscale[qtype]["min"])) logging.info("Kill : {0}".format( msg.message_attributes["instance_id"] ["string_value"])) kill_instance( app, msg.message_attributes["instance_id"] ["string_value"], autoscale[qtype]) q.delete_message(msg) # Message is a regular job else: # We do not have excess machines. So no kill requests need to be made. # However the message needs to be deleted logging.info("Kill request ignored: {0}".format( msg.message_attributes["instance_id"] ["string_value"])) q.delete_message(msg) # If message is not a kill_request it is an active job. # Check if job has timed-out or the machine it is running on has # accidentally terminated else: job_id = msg.message_attributes["job_id"]["string_value"] instance_id = msg.message_attributes["instance_id"][ "string_value"] try: record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) except Exception, e: logging.debug( "JOb {0} not found in dynamodb \nDeleting the message" ) q.delete_message(msg) record = None if record and record["status"] in ["completed", "failed"]: logging.debug( "Job {0} is {1} -> Deleting the active job message" .format(job_id, record["status"])) q.delete_message(msg) else: logging.debug( "Job_id: {0} Active on Instance: {1}".format( job_id, instance_id)) check_job_status(app, msg, job_id, instance_id, autoscale[qtype], q, p_q)
def watch_loop(app): """ Watch_loop looks at the definition of the autoscaling_groups and the active queues to determine whether : 1. An instance needs to be removed from the scaling group and terminated 2. A task has been in the active queue for long and appears to have timed out and needs to be moved to the pending queue, for re-attempt. Why would a task fail ? -> Hard error in task causes worker to fail -> Instance was lost mid run """ status = conf_man.update_creds_from_metadata_server(app) stack_name = app.config["instance.tags"]["aws:cloudformation:stack-name"] autoscale = get_autoscale_info(app, stack_name) print autoscale # Select all relevant queues in our cloudformation stack queues = [q for q in app.config["sqs.conn"].get_all_queues() if q.name.startswith(stack_name)] # Select only the active queues active_q = [q for q in queues if "Active" in q.name] pending_q = [q for q in queues if "Active" not in q.name] for q in active_q: print "Active queue : ", q.name qtype = None if "Test" in q.name: qtype = "test" elif "Prod" in q.name: qtype = "prod" else: logging.error("Unknown queue : ".format(q.name)) break # Find the corresponding pending queue to the current active queue p_q = None p_qs = [pq for pq in pending_q if qtype in pq.name.lower()] if len(p_qs) == 1: p_q = p_qs[0] print "Pending queue : {0}".format(p_q) else: logging.error("Found too many pending queues : {0}".format(p_qs)) exit(0) print "Instances in this group : ", autoscale[qtype]["instances"] for i in autoscale[qtype]["instances"]: print i.health_status print i.lifecycle_state while (1): """ Here we get all messages in the current queue and check the following conditions: 1. No more messages to check -> Break 2. If messages exists -> Check if it is a kill_request. 
-> Kill the instance and decrement the autoscale group desired count -> """ messages = q.get_messages(num_messages=10, visibility_timeout=2, wait_time_seconds=1, message_attributes=['All']) if not messages: break for msg in messages: # Check if message is a kill_request if msg.message_attributes["job_id"]["string_value"] == "kill_request": logging.info("Received a kill_request from : ".format(msg.message_attributes["instance_id"]["string_value"])) # Are there more machines than the minimum if autoscale[qtype]["current"] > autoscale[qtype]["min"]: logging.info("Instances in autoscale group current:{0} > min:{1}".format(autoscale[qtype]["current"], autoscale[qtype]["min"])) logging.info("Kill : {0}".format(msg.message_attributes["instance_id"]["string_value"])) kill_instance(app, msg.message_attributes["instance_id"]["string_value"], autoscale[qtype]) q.delete_message(msg) # Message is a regular job else: # We do not have excess machines. So no kill requests need to be made. # However the message needs to be deleted logging.info("Kill request ignored: {0}".format(msg.message_attributes["instance_id"]["string_value"])) q.delete_message(msg) # If message is not a kill_request it is an active job. # Check if job has timed-out or the machine it is running on has # accidentally terminated else: job_id = msg.message_attributes["job_id"]["string_value"] instance_id = msg.message_attributes["instance_id"]["string_value"] try: record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) except Exception, e: logging.debug("JOb {0} not found in dynamodb \nDeleting the message") q.delete_message(msg) record = None if record and record["status"] in ["completed", "failed"]: logging.debug("Job {0} is {1} -> Deleting the active job message".format(job_id, record["status"])) q.delete_message(msg) else: logging.debug("Job_id: {0} Active on Instance: {1}".format(job_id, instance_id)) check_job_status(app, msg, job_id, instance_id, autoscale[qtype], q, p_q)
def exec_job(app, jobtype, job_id, executable, args, inputs, outputs, data, auth): # Save current folder and chdir to a temporary folder conf_man.update_creds_from_metadata_server(app) record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) ############################################################################## # Notify job execution start time ############################################################################## update_record(record, "start_time", time.time()) ############################################################################## # Setup dirs for execution ############################################################################## cwd = os.getcwd() tmpdir = "/tmp/task_executor_jobs/{0}".format(job_id) try: os.makedirs(tmpdir) except: print "Tmpdir {0} exists. Deleting and recreating".format(tmpdir) shutil.rmtree(tmpdir) os.makedirs(tmpdir) os.chdir(tmpdir) ############################################################################## # Download the inputs to the temp folder ############################################################################## update_record(record, "status", "staging_inputs") stagein_start = time.time() try: get_inputs(app, inputs, auth) except Exception as e: print "Exception info : ".format(sys.exc_info()[0]) update_record(record, "ERROR", "Failed to download inputs {0}".format(e)) update_record(record, "status", "failed") update_record(record, "complete_time", time.time()) logging.error("Failed to download inputs") return False stagein_total = time.time() - stagein_start ############################################################################## # Download the inputs to the temp folder ############################################################################## # Check if job is valid update_record(record, "status", "processing") if jobtype not in apps.JOBS: logging.error("Jobtype : {0} does not exist".format(jobtype)) print "Unable to process jobtype : {0}".format(jobtype) return False print "JOBS 
: ", apps.JOBS[jobtype] status = True returncode = 0 process_start = time.time() try: returncode = apps.JOBS[jobtype](app, data) print "Returncode : {0}".format(returncode) conf_man.update_creds_from_metadata_server(app) except Exception as e: update_record(record, "status", "Failed") update_record(record, "complete_time", time.time()) update_record(record, "ERROR", str(e)) print "Job execution failed : {0}".format(e) status = False process_total = time.time() - process_start ############################################################################## # Upload the results to the S3 ############################################################################## record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) update_record(record, "status", "staging_outputs") stageout_start = time.time() # Upload the result to S3 try: put_outputs(app, outputs) except Exception as e: print "Exception info : ".format(sys.exc_info()[0]) update_record(record, "ERROR", "Failed to upload outputs {0}".format(e)) update_record(record, "status", "failed") update_record(record, "complete_time", time.time()) logging.error("Failed to upload inputs") return False stageout_total = time.time() - stageout_start update_record(record, "z_stagein_dur", stagein_total) update_record(record, "z_stageout_dur", stageout_total) update_record(record, "z_processing_dur", process_total - 1) if returncode != 0: update_record(record, "status", "failed") update_record(record, "complete_time", time.time()) update_record(record, "ERROR_CODE", returncode) status = False else: update_record(record, "status", "completed") update_record(record, "complete_time", time.time()) if clean_tmp_dirs: shutil.rmtree(tmpdir) # Chdir back to the original folder os.chdir(cwd) return True
def exec_job(app, jobtype, job_id, executable, args, inputs, outputs, data, auth): # Save current folder and chdir to a temporary folder conf_man.update_creds_from_metadata_server(app) record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) ############################################################################## # Notify job execution start time ############################################################################## update_record(record, "start_time", time.time()) ############################################################################## # Setup dirs for execution ############################################################################## cwd = os.getcwd() tmpdir = "/tmp/task_executor_jobs/{0}".format(job_id) try: os.makedirs(tmpdir) except: print "Tmpdir {0} exists. Deleting and recreating".format(tmpdir) shutil.rmtree(tmpdir) os.makedirs(tmpdir) os.chdir(tmpdir) ############################################################################## # Download the inputs to the temp folder ############################################################################## update_record(record, "status", "staging_inputs") stagein_start = time.time() try: get_inputs(app, inputs, auth) except Exception as e: print "Exception info : ".format(sys.exc_info()[0]) update_record(record, "ERROR", "Failed to download inputs {0}".format(e)) update_record(record, "status", "failed") update_record(record, "complete_time", time.time()) logging.error("Failed to download inputs") return False stagein_total = time.time() - stagein_start ############################################################################## # Download the inputs to the temp folder ############################################################################## # Check if job is valid update_record(record, "status", "processing") if jobtype not in apps.JOBS: logging.error("Jobtype : {0} does not exist".format(jobtype)) print "Unable to process jobtype : {0}".format(jobtype) return False print "JOBS 
: ", apps.JOBS[jobtype] status = True returncode = 0 process_start = time.time() try: returncode = apps.JOBS[jobtype](app, data) print "Returncode : {0}".format(returncode) conf_man.update_creds_from_metadata_server(app) except Exception as e: update_record(record, "status", "Failed"); update_record(record, "complete_time", time.time()) update_record(record, "ERROR", str(e)); print "Job execution failed : {0}".format(e) status = False process_total = time.time() - process_start ############################################################################## # Upload the results to the S3 ############################################################################## record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) update_record(record, "status", "staging_outputs") stageout_start = time.time() # Upload the result to S3 try: put_outputs(app, outputs) except Exception as e: print "Exception info : ".format(sys.exc_info()[0]) update_record(record, "ERROR", "Failed to upload outputs {0}".format(e)) update_record(record, "status", "failed") update_record(record, "complete_time", time.time()) logging.error( "Failed to upload inputs") return False stageout_total = time.time() - stageout_start update_record(record, "z_stagein_dur", stagein_total) update_record(record, "z_stageout_dur", stageout_total) update_record(record, "z_processing_dur", process_total - 1) if returncode != 0 : update_record(record, "status", "failed"); update_record(record, "complete_time", time.time()) update_record(record, "ERROR_CODE", returncode); status = False else: update_record(record, "status", "completed") update_record(record, "complete_time", time.time()) if clean_tmp_dirs: shutil.rmtree(tmpdir) # Chdir back to the original folder os.chdir(cwd) return True