Example #1
def retract_job(jobid):

    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)

    record = dutils.dynamodb_get(request.app.config["dyno.conn"], jobid)
    print record['username']

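    # Only the owner of the job may retract it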
    if record['username'] == session["username"]:
        print "Username matches"
    else:
        return template('./views/error.tpl',
                        error_str="You are not the owner of this job: {0}\n"
                                  "Insufficient permissions to retract job".format(jobid),
                        session=session)

    record["i_ispublished"] = '0'
    record.save(overwrite=True)

    return template("./views/retract_confirm.tpl",
                    job_id=jobid,
                    title="Retract Confirmation",
                    session=session)
Example #2
def published_jobs():

    print "Hi"
    session = bottle.request.environ.get('beaker.session')
    require_login(session)
    current_user = session["user_id"]

    conf_man.update_creds_from_metadata_server(request.app)
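    # scan() reads the whole table; fine for a small jobs table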
    results = request.app.config["dyno.conn"].scan(i_ispublished__eq='1')
    table_tpl = []

    for r in results:
        print r
        jobinfourl = request.app.get_url('/jobs')+"/"+str(r["job_id"])
        joburl = ''
        if r["jobname"]:
            joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, r["jobname"])
        else:
            joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, str(r["job_id"]))

        row = [joburl, str(r["description"]).replace('\r\n', '<br/>'),
               str(r["username"]), str(r["publishdate"])]
        
        table_tpl.append(row)

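    # Newest publications first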
    table = sorted(table_tpl, key=lambda row: datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S'), reverse=True)
    return template("./views/published_jobs.tpl",
                    title="Published Jobs",
                    table=table,
                    session=session)
Example #3
def handle_login():
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)
    access_token = request.params.get("access_token")
    expires_in = request.params.get("expires_in")
    aws_client_id = request.app.config["server.aws_client_id"]

    user_id, name, email = identity.get_identity_from_token(
        access_token, aws_client_id)
    user_info = identity.find_user_role(request.app, user_id)

    if not user_info:
        return template("./views/login_reject.tpl",
                        title="Turing - Login Rejected!",
                        username=name,
                        user_id=user_id,
                        email=email,
                        session=session)

    session["logged_in"] = True
    session["user_id"] = user_id
    session["username"] = name
    session["email"] = user_info["email"]
    session["user_role"] = user_info["role"]

    print session
    return template("./views/login_confirm.tpl",
                    title="Turing - Login Success!",
                    session=session)
Example #4
def update_usage_stats(app, job_id):
    if not job_id:
        return False

    print "Updating usage_stats"

    try:
        cmd = [
            "/home/ubuntu/task_engine/system_stats.sh",
            "{0}".format(time.time())
        ]
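        # Run the stats script and capture its stdout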
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        out, err = proc.communicate()
    except Exception as e:
        print "Failed to run system_stats.sh"
        print "Caught exception : {0}".format(e)
        return False

    cm.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)

    old = record.get("usage_stats", "")
    current = old + out.strip('\n')
    st = update_record(record, "usage_stats", current)
    return
Example #5
def list_jobs():

    session = bottle.request.environ.get('beaker.session')
    require_login(session)
    current_user = session["user_id"]

    conf_man.update_creds_from_metadata_server(request.app)
    results = request.app.config["dyno.conn"].scan(i_user_id__eq=current_user)
    table_tpl = []

    print "Jobs: "
    print "-"*50
    for r in results:
        jobinfourl = request.app.get_url('/jobs')+"/"+str(r["job_id"])
        joburl = ''
        if r["jobname"]:
            joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, r["jobname"])
        else:
            joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, str(r["job_id"]))

        row = [joburl, str(r["status"]),
               str(r["jobtype"]), str(r["submit_stamp"])]
        table_tpl.append(row)

    table = sorted(table_tpl, key=lambda row: datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S'), reverse=True)
    return template("./views/jobs.tpl",
                    title="Task Status",
                    table=table,
                    session=session)
Example #6
def list_jobs_rest():
    print "Rest Interface for list_task"
    session = bottle.request.environ.get('beaker.session')
    response.content_type = 'application/json'
    
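    # REST clients must authenticate with an access_token on every request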
    if request.POST.get("access_token"):
        print "Attempt to auth with access_token"
        user_info = validate_session(request.app, request.POST.get("access_token"))
        if not user_info :
            return {"status" : "Fail",
                    "reason" : "Failed to authenticate"}

        session.update(user_info)
        session["logged_in"] = True
        #print "Session : ",session
    else:        
        return {"status" : "Fail",
                "reason" : "access_token missing"}

    
    conf_man.update_creds_from_metadata_server(request.app)
    results = request.app.config["dyno.conn"].scan(i_user_id__eq=session['user_id'])
    table_tpl = {}

    table_tpl['items'] = {}
    print "Jobs: "
    print "-"*50
    for i,r in enumerate(results):
        table_tpl['items'][i] = { "job_id"       : str(r["job_id"]),
                                  "status"       : str(r["status"]),
                                  "jobtype"      : str(r["jobtype"]),
                                  "submit_stamp" : str(r["submit_stamp"])}

    table_tpl['status'] = "Success"
    return table_tpl
Example #7
def usage_stats():

    session = bottle.request.environ.get('beaker.session')
    require_login(session)
    current_user = session["user_id"]

    conf_man.update_creds_from_metadata_server(request.app)
    results = request.app.config["dyno.conn"].scan(status__in=['active', 'staging_inputs', 'processing', 'staging_output'])
    table_tpl = []

    print "Jobs: "
    print "-"*50
    for r in results:
        print r["username"]
        row = [str(r["username"]), str(r["job_id"]), str(r["status"]),  
               str(r["jobtype"]), str(r["submit_stamp"]), str(r["queue"])]
        table_tpl.append(row)

    stackname = request.app.config["instance.tags"]["aws:cloudformation:stack-name"]
    myautoscale = [x for x in request.app.config["scale.conn"].get_all_groups() if x.name.startswith(stackname)]
    
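    # Per group: [min %, running above min %, requested but not yet running %, max size]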
    autoscale = {}
    for grp in myautoscale:
        instances = grp.instances
        count     = len(instances)
        print grp.name

        # Slice off the "<stackname>-" prefix; str.strip() removes a set of
        # characters, not a prefix, so it cannot be used here
        grp_name = grp.name[len(stackname)+1:]
        
        if grp_name.startswith('Test'):
            
            autoscale['test'] = [grp.min_size*100/grp.max_size,
                                 (count-grp.min_size)*100/grp.max_size,
                                 (grp.desired_capacity-count)*100/grp.max_size,
                                 grp.max_size]
            
        elif grp_name.startswith('Prod'):
            autoscale['prod'] = [grp.min_size*100/grp.max_size,
                                 (count-grp.min_size)*100/grp.max_size,
                                 (grp.desired_capacity-count)*100/grp.max_size,
                                 grp.max_size]
            
        else:
            print "Error: could not find scaling groups"


    print autoscale


    table = sorted(table_tpl, key=lambda row: datetime.datetime.strptime(row[4], '%Y-%m-%d %H:%M:%S'), reverse=True)
    return template("./views/usage_stats.tpl",
                    title="Task Status",
                    table=table,
                    autoscale=autoscale,
                    session=session)
Example #8
def submit_job_description():
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)

    uid = _submit_task(request, session)

    return template("./views/submit_confirm.tpl",
                    job_id=uid,
                    title="Task Confirmation",
                    session=session)
Example #9
def publish_job_description():
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)
    
    job_id = request.POST.get('jobid')
    status = update_job_for_publish(request, job_id)
    
    print job_id, status

    return template("./views/publish_confirm.tpl",
                    job_id=job_id,
                    title="Publish Confirmation",
                    session=session)
Example #10
def check_if_cancelled(app, job_id):
    if not job_id:
        return False

    print "Statecheck"
    cm.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)
    if record["status"] == "cancelled":
        print "Cancelled"
        return True

    print "Job not cancelled"
    return False
Example #11
def job_info(job_id):
    
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)
    response.content_type = 'application/json'

    pairs = get_job_info(request, job_id)
    result = {}
    result['items'] = {}
    print "Pairs : ", pairs
    for i, p in enumerate(pairs):
        result['items'][i] = {p[0]: p[1]}
        if p[0] == "status":
            result['status'] = p[1]
    #print result
    return result
Example #12
def watch_loop(app):
    
    cloudwatch = get_connection(app)
    while 1:
        status = conf_man.update_creds_from_metadata_server(app)
        if status :
            cloudwatch = get_connection(app)

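        # Publish a total queue-depth metric for every queue, once a minute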
        for q in app.config["sqs.conn"].get_all_queues():
            q_attr   = q.get_attributes()
            # SQS returns attribute values as strings; convert before adding
            visible  = int(q_attr['ApproximateNumberOfMessages'])
            inflight = int(q_attr['ApproximateNumberOfMessagesNotVisible'])
            total    = visible + inflight
            r = cloudwatch.put_metric_data("SQS",
                                          "ApproximateNumberOfTotalMessages", 
                                          value=total, 
                                          unit="Count",
                                          dimensions = {"QueueName" : q.name})
            logging.debug("[{0}] queue:{1} Total:{2} Visible:{3} Inflight:{4}".format(datetime.now().isoformat(),
                                                                                      q.name,
                                                                                      total,
                                                                                      visible,
                                                                                      inflight))
            print r
        time.sleep(60)
Example #13
def upload_to_s3():
    session = bottle.request.environ.get('beaker.session')
    require_login(session)

    conf_man.update_creds_from_metadata_server(request.app)
    job_id = str(uuid.uuid4())
    exp_time = tstamp_plus_nmins(60)
    bucket_name = "klab-jobs"  # alternative: "klab-webofscience"
    print "Uploads page"

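    # Values embedded in the signed S3 POST policy and the upload form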
    vals = {
        "redirect_url": "{0}/{1}".format(request.app.config["server.url"],
                                         "upload_confirm"),
        "aws_key_id": request.app.config["instance.tags"]["S3UploadKeyId"],
        "job_id": job_id,
        "exp_date": exp_time,
        "bucket_name": bucket_name
    }

    print "Uploading with key : {0}".format(
        request.app.config["instance.tags"]["S3UploadKeyId"])

    policy, signature = get_signature_and_policy(request.app, vals)

    vals["policy"] = policy
    vals["signature"] = signature
    print "policy, signature : ", policy, signature
    return template('./views/upload.tpl',
                    name="",
                    email="",
                    username="",
                    redirect_url=vals["redirect_url"],
                    aws_key_id=vals["aws_key_id"],
                    exp_date=vals["exp_date"],
                    job_id=vals["job_id"],
                    bucket_name=vals["bucket_name"],
                    policy=policy,
                    signature=signature,
                    alert=False,
                    title="Upload data",
                    session=session)
Example #14
def job_cancel(job_id):

    session = bottle.request.environ.get('beaker.session')
    require_login(session)

    conf_man.update_creds_from_metadata_server(request.app)
    dyntable = request.app.config['dyno.conn']

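    # Mark the job cancelled so a running worker can notice (cf. check_if_cancelled)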
    try:
        tstamp = str(time.strftime('%Y-%m-%d %H:%M:%S'))
        item = dyntable.get_item(job_id=job_id)
        item["status"] = "cancelled"
        item["reason"] = "User request cancel"
        item["cancel_time"] = tstamp
        dynamodb_update(dyntable, item)

    except ItemNotFound:
        return "The requested job_id was not found in the jobs database"

    redirect('/jobs/' + job_id)
Example #15
def job_cancel(job_id):
    
    session = bottle.request.environ.get('beaker.session')
    require_login(session)

    conf_man.update_creds_from_metadata_server(request.app)
    dyntable = request.app.config['dyno.conn']

    try:
        tstamp = str(time.strftime('%Y-%m-%d %H:%M:%S'))
        item = dyntable.get_item(job_id=job_id)
        item["status"] = "cancelled"
        item["reason"] = "User request cancel"
        item["cancel_time"] = tstamp
        dynamodb_update(dyntable, item)

    except boto.dynamodb2.exceptions.ItemNotFound:
        return template("./views/error.tpl",
                        session=session,
                        error_str="The requested job_id was not found in the jobs database")

    redirect('/jobs/' + job_id)
Example #16
def exec_job(app, jobtype, job_id, executable, args, inputs, outputs, data, auth):

   # Save current folder and chdir to a temporary folder
   conf_man.update_creds_from_metadata_server(app)
   record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)
   
   ##############################################################################
   # Notify job execution start time
   ##############################################################################
   update_record(record, "start_time", time.time())

   ##############################################################################
   # Setup dirs for execution
   ##############################################################################
   cwd    = os.getcwd()
   tmpdir = "/tmp/task_executor_jobs/{0}".format(job_id)
   try:
      os.makedirs(tmpdir)
   except OSError:
      print "Tmpdir {0} exists. Deleting and recreating".format(tmpdir)
      shutil.rmtree(tmpdir)
      os.makedirs(tmpdir)
   os.chdir(tmpdir)


   ##############################################################################
   # Download the inputs to the temp folder
   ##############################################################################
   update_record(record, "status", "staging_inputs")
   stagein_start = time.time()
   try:
      get_inputs(app, inputs, auth)
   except Exception as e:
      print "Exception info : ".format(sys.exc_info()[0])
      update_record(record, "ERROR", "Failed to download inputs {0}".format(e))
      update_record(record, "status", "failed")
      update_record(record, "complete_time", time.time())
      logging.error("Failed to download inputs")
      return False
   stagein_total = time.time() - stagein_start

   ##############################################################################
   # Run the job
   ##############################################################################
   # Check if the jobtype is valid
   update_record(record, "status", "processing")
   if jobtype not in apps.JOBS:
      logging.error("Jobtype : {0} does not exist".format(jobtype))
      print "Unable to process jobtype : {0}".format(jobtype)
      return False
   print "JOBS : ", apps.JOBS[jobtype]

   status = True
   returncode = 0
   process_start = time.time()
   try:
      returncode = apps.JOBS[jobtype](app, data)
      print "Returncode : {0}".format(returncode)
      conf_man.update_creds_from_metadata_server(app)

   except Exception as e:
      update_record(record, "status", "Failed");
      update_record(record, "complete_time", time.time())
      update_record(record, "ERROR", str(e));
      print "Job execution failed : {0}".format(e)
      status = False
   process_total = time.time() - process_start

   ##############################################################################
   # Upload the results to the S3
   ##############################################################################
   record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)
   update_record(record, "status", "staging_outputs")
   stageout_start = time.time()

   # Upload the result to S3
   try:
      put_outputs(app, outputs)
   except Exception as e:
      print "Exception info : ".format(sys.exc_info()[0])
      update_record(record, "ERROR", "Failed to upload outputs {0}".format(e))
      update_record(record, "status", "failed")
      update_record(record, "complete_time", time.time())
      logging.error( "Failed to upload inputs")
      return False
   stageout_total = time.time() - stageout_start

   update_record(record, "z_stagein_dur",    stagein_total)
   update_record(record, "z_stageout_dur",   stageout_total)
   update_record(record, "z_processing_dur", process_total - 1)

   if returncode != 0:
      update_record(record, "status", "failed")
      update_record(record, "complete_time", time.time())
      update_record(record, "ERROR_CODE", returncode)
      status = False
   else:
      update_record(record, "status", "completed")
      update_record(record, "complete_time", time.time())

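   # clean_tmp_dirs is assumed to be a module-level flag set elsewhere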
   if clean_tmp_dirs:
      shutil.rmtree(tmpdir)
   # Chdir back to the original folder
   os.chdir(cwd)
   return True
Example #17
def task_loop(app):
   sqs_conn  = app.config["sqs.conn"]
   pending   = app.config["instance.tags"]["JobsQueueName"]
   active    = app.config["instance.tags"]["ActiveQueueName"]
   pending_q = sqs_conn.get_queue(pending)
   active_q  = sqs_conn.get_queue(active)


   while 1:
      # Wait to read a message from the pending_q
      msg = pending_q.read(wait_time_seconds=20)

      print "Received message from pending_q"
      if msg:         
         # Too many things could fail here, do a blanket
         # Try catch      
         try:

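            # The SQS body carries an SNS-style envelope; the job payload sits under "Message"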
            sreq = json.loads(msg.get_body())["Message"]
            if not sreq :
               continue

            app.config["current_msg_handle"] = msg
            
            data        =  ast.literal_eval(sreq)
            job_id      =  data.get('job_id')
            jobtype     =  data.get('jobtype')
            executable  =  data.get('executable')
            args        =  data.get('args')
            inputs      =  data.get('inputs')
            outputs     =  data.get('outputs')
            user_auth   =  {"user"      : data.get('i_user_id'),
                            "role"      : data.get('i_user_role'),
                            "token"     : data.get('i_token'),
                            "keyid"     : data.get('i_keyid'),
                            "keysecret" : data.get('i_keysecret')}

            # Post the job to the active queue and delete it from the pending queue
            attr, current_msg = sns_sqs.post_message_to_active(app, active_q, msg.get_body(), job_id)
            print "Posted job from pending to active queue"
            if not pending_q.delete_message(msg):
               print "Deleting message from pending queue failed"
            
            for key in data:
               print "{0} : {1}".format(key, data[key])

            print "Starting task"
            status      =  exec_job(app,
                                    jobtype,
                                    job_id,
                                    executable,
                                    args,
                                    inputs,
                                    outputs,
                                    data,
                                    user_auth)
            
            print "Status : ", status

            if status:
               conf_man.send_success_mail(data, app)
            else:
               conf_man.send_failure_mail(data, app)

         except Exception as e:
               print "Job failed to complete : {0}".format(sys.exc_info()[0])
               print "Trace : ", inspect.trace()               

      else:
         print "{0}: Waiting for job description".format(time.time())
         seppukku.die_at_hour_edge(app, dry_run=True)
         logging.debug("{0}: Waiting for job description".format(time.time()))

      conf_man.update_creds_from_metadata_server(app)
Example #18
def watch_loop(app):
    """
    Watch_loop looks at the definition of the autoscaling_groups and the active queues
    to determine whether :
        1. An instance needs to be removed from the scaling group and terminated
        2. A task has been in the active queue for long and appears to have timed out
           and needs to be moved to the pending queue, for re-attempt.
           Why would a task fail ?
           -> Hard error in task causes worker to fail
           -> Instance was lost mid run
              
    """
    status = conf_man.update_creds_from_metadata_server(app)
    stack_name = app.config["instance.tags"]["aws:cloudformation:stack-name"]
    autoscale = get_autoscale_info(app, stack_name)
    print autoscale

    # Select all relevant queues in our cloudformation stack
    queues = [
        q for q in app.config["sqs.conn"].get_all_queues()
        if q.name.startswith(stack_name)
    ]
    # Select only the active queues
    active_q = [q for q in queues if "Active" in q.name]
    pending_q = [q for q in queues if "Active" not in q.name]

    for q in active_q:

        print "Active queue : ", q.name
        qtype = None

        if "Test" in q.name:
            qtype = "test"
        elif "Prod" in q.name:
            qtype = "prod"
        else:
            logging.error("Unknown queue : ".format(q.name))
            break

        # Find the corresponding pending queue to the current active queue
        p_q = None
        p_qs = [pq for pq in pending_q if qtype in pq.name.lower()]
        if len(p_qs) == 1:
            p_q = p_qs[0]
            print "Pending queue : {0}".format(p_q)
        else:
            logging.error("Found too many pending queues : {0}".format(p_qs))
            exit(0)

        print "Instances in this group : ", autoscale[qtype]["instances"]
        for i in autoscale[qtype]["instances"]:
            print i.health_status
            print i.lifecycle_state

        while (1):
            """
            Here we get all messages in the current queue and check the following conditions:
            1. No more messages to check -> Break
            2. If messages exists
            -> Check if it is a kill_request.
            -> Kill the instance and decrement the autoscale group desired count
            -> 
            """
            messages = q.get_messages(num_messages=10,
                                      visibility_timeout=2,
                                      wait_time_seconds=1,
                                      message_attributes=['All'])
            if not messages:
                break

            for msg in messages:
                # Check if message is a kill_request
                if msg.message_attributes["job_id"][
                        "string_value"] == "kill_request":
                    logging.info("Received a kill_request from : ".format(
                        msg.message_attributes["instance_id"]["string_value"]))
                    # Are there more machines than the minimum
                    if autoscale[qtype]["current"] > autoscale[qtype]["min"]:
                        logging.info(
                            "Instances in autoscale group current:{0} > min:{1}"
                            .format(autoscale[qtype]["current"],
                                    autoscale[qtype]["min"]))
                        logging.info("Kill : {0}".format(
                            msg.message_attributes["instance_id"]
                            ["string_value"]))
                        kill_instance(
                            app, msg.message_attributes["instance_id"]
                            ["string_value"], autoscale[qtype])
                        q.delete_message(msg)
                    else:
                        # We do not have excess machines. So no kill requests need to be made.
                        # However the message needs to be deleted
                        logging.info("Kill request ignored: {0}".format(
                            msg.message_attributes["instance_id"]
                            ["string_value"]))
                        q.delete_message(msg)

                # If message is not a kill_request it is an active job.
                # Check if job has timed-out or the machine it is running on has
                # accidentally terminated
                else:
                    job_id = msg.message_attributes["job_id"]["string_value"]
                    instance_id = msg.message_attributes["instance_id"][
                        "string_value"]

                    try:
                        record = dutils.dynamodb_get(app.config["dyno.conn"],
                                                     job_id)
                    except Exception as e:
                        logging.debug(
                            "Job {0} not found in dynamodb. Deleting the message".format(
                                job_id))
                        q.delete_message(msg)
                        record = None

                    if record and record["status"] in ["completed", "failed"]:
                        logging.debug(
                            "Job {0} is {1} -> Deleting the active job message"
                            .format(job_id, record["status"]))
                        q.delete_message(msg)
                    else:
                        logging.debug(
                            "Job_id: {0}  Active on Instance: {1}".format(
                                job_id, instance_id))
                        check_job_status(app, msg, job_id, instance_id,
                                         autoscale[qtype], q, p_q)
Example #19
>>>>>>> 856f3c026f2bf7cae078da071f7353178bc5f27b
                                                           monitoring_enabled=False,
                                                           instance_profile_arn=role,
                                                           dry_run=DRY_RUN)


if __name__ == "__main__":
    mappings = load_mapping_csvs("ami_mapping.csv")

    app = bottle.default_app()
    try:
        app.config.load_config("production.conf")
    except Exception as e:
        logging.error("Exception {0} in load_config".format(e))
        exit(-1)
    cm.update_creds_from_metadata_server(app)

    #instances = ["m4.10xlarge", "c4.8xlarge", "m4.large", "m4.xlarge", "c4.xlarge"]
    instances = ["c4.8xlarge"]  # alternatives: "m4.large", "m4.xlarge", "c4.xlarge"

    for instance in instances:
        for m in mappings:
            print m["region_code"]
            cm.init(app, m["region_code"])
            print app.config["ec2.conn"]
            status = start_instance(app, m["ami"], instance)
            print "{0} {1} {2}".format(m["region_code"], instance, status)
<<<<<<< HEAD
            break;
=======