def cancel_spot_instance_requests(self, to_kill):
    """ Remove the requests in this list. """
    for kill in to_kill:
        logger.debug("SIMULATION: Killing request %s" % kill)
        self.requests = [r for r in self.requests if r.reqid != kill]
def cancel_unnecessary_requests(tenants):
    """
    Make sure spot requests are closed if there are no idle jobs in
    the queue.
    """
    for tenant in tenants:
        # Start by grabbing all of the open spot requests for this tenant
        conn = boto.connect_ec2(tenant.access_key, tenant.secret_key)
        reqs = conn.get_all_spot_instance_requests(filters={
            "tag-value": tenant.name,
            "state": "open"})
        # That should be sufficient, but just because spot requests are
        # scary, double check and kill anything if there are no idle
        # jobs.
        status = []
        for job in tenant.jobs:
            # Skip any jobs that have already been fulfilled
            if job.fulfilled is False:
                status.append(job.status)
        # If no job is currently idle (status '1'), terminate any
        # outstanding spot requests. any() over an empty list is False,
        # so an empty queue is covered as well.
        if not any(stat == '1' for stat in status):
            # Reshape these requests into a dict keyed by request id
            id_to_req = request_ids_dict(reqs)
            # Build a list of request ids to cancel
            to_cancel = list(id_to_req.keys())
            if len(to_cancel) > 0:
                logger.error("This should be deprecated if the other "
                             "cancel function is working correctly.")
                logger.debug("Cancelling spot requests: %s" % to_cancel)
                conn.cancel_spot_instance_requests(to_cancel)
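# request_ids_dict() is used above but not defined in this section. A
# minimal sketch, assuming boto request objects expose an `id`
# attribute; the real helper may carry more fields per request.
def request_ids_dict(reqs):
    """ Reshape a list of boto spot requests into {request_id: request}. """
    id_to_req = {}
    for req in reqs:
        id_to_req[req.id] = req
    return id_to_req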
def cancel_unmigrated_requests(tenants):
    """
    There are two cases to handle here. Either there are no idle jobs,
    so all requests should be cancelled, or there are idle jobs but the
    existing requests could not be migrated to them. In that case, any
    orphaned requests should also be cancelled.
    """
    for tenant in tenants:
        # Start by grabbing all of the open spot requests for this tenant
        ids_to_check = ProvisionerConfig().simulator.get_open_requests()
        # Get the set of idle job numbers
        idle_job_numbers = []
        for job in tenant.jobs:
            if job.sim_status == 'IDLE':
                idle_job_numbers.append(job.id)
        # Now get all of the orphaned requests
        reqs = get_orphaned_requests(tenant, ids_to_check,
                                     idle_job_numbers)
        # Build a list of ids we can pass to the simulator
        reqs_to_cancel = []
        for req in reqs:
            reqs_to_cancel.append(req['request_id'])
        # Now cancel only the orphaned requests, not every open request
        try:
            if len(reqs_to_cancel) > 0:
                logger.debug("Cancelling unmigrated requests: %s" %
                             reqs_to_cancel)
                ProvisionerConfig().simulator.cancel_spot_instance_requests(
                    reqs_to_cancel)
        except Exception as e:
            logger.exception("Error removing spot instance requests.")
            raise e
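# get_orphaned_requests() is used throughout this section but defined
# elsewhere. A minimal sketch, assuming an instance_request table keyed
# by request_id with a job_runner_id column; the real query and row
# shape may differ.
def get_orphaned_requests(tenant, ids_to_check, idle_job_numbers):
    """ Return open requests whose job is no longer idle. """
    orphaned = []
    if len(ids_to_check) == 0:
        return orphaned
    rows = ProvisionerConfig().dbconn.execute(
        ("select id, request_id, job_runner_id, instance_type as type "
         "from instance_request where request_id in ('%s')") %
        "','".join(ids_to_check))
    idle_ids = [str(i) for i in idle_job_numbers]
    for row in rows:
        if str(row['job_runner_id']) not in idle_ids:
            orphaned.append(row)
    return orphaned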
def deploy_job(self, job):
    current_time = ProvisionerConfig().simulate_time
    instance_types = ProvisionerConfig().instance_types
    for resource in self.resources:
        if resource.state == "IDLE":
            for instance in instance_types:
                # Check that the job fits this instance
                if (resource.type == instance.type and
                        self.check_requirements(instance, job)):
                    # This is a match, so put the job on the resource
                    resource.job_id = job.id
                    # Set the time for the job to finish. First convert
                    # the exec time to this instance type.
                    exec_seconds = self.exec_time(job, resource.type)
                    logger.debug("SIMULATION CONDOR: Deploying "
                                 "job %s to resource %s for %s" %
                                 (job.id, resource.id, exec_seconds))
                    # Record the job's request time as a timestamp
                    req_time = job.req_time
                    ProvisionerConfig().dbconn.execute(
                        ("insert into jobs (test, job_id, start_time, "
                         "req_time) values ('%s', %s, '%s', '%s');" %
                         (ProvisionerConfig().run_name, int(job.id),
                          self.get_fake_time(), req_time)))
                    resource.job_finish = current_time + \
                        datetime.timedelta(seconds=exec_seconds)
                    resource.state = "EXECUTING"
                    job.sim_status = "EXECUTING"
                    self.executing_jobs = self.executing_jobs + [job.id]
                    return
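# check_requirements() is called above but not shown in this section. A
# minimal sketch of the fit test, assuming instance types expose cpus,
# memory and disk, and jobs expose req_cpus, req_memory and req_disk
# (attribute names inferred from elsewhere in this section).
def check_requirements(self, instance, job):
    """ Return True if the instance can satisfy the job's requests. """
    return (int(instance.cpus) >= int(job.req_cpus) and
            float(instance.memory) >= float(job.req_memory) and
            float(instance.disk) >= float(job.req_disk))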
def migrate_requests(tenants):
    """
    If requests exist for a job that is no longer in the idle queue
    (e.g. it has been fulfilled or scheduled on other resources) then
    migrate any outstanding requests to another job in the idle queue.
    If there are no other jobs in the idle queue, cancel all existing
    requests tagged by a tenant.
    """
    for tenant in tenants:
        conn = boto.connect_ec2(tenant.access_key, tenant.secret_key)
        reqs = conn.get_all_spot_instance_requests(filters={
            "tag-value": tenant.name,
            "state": "open"})
        # Get a list of ids that can be used in a db query
        ids_to_check = []
        for r in reqs:
            ids_to_check.append("%s" % r.id)
        logger.debug("Open requests: %s" % ids_to_check)
        # Get the set of idle job numbers. Using tenant.idle_jobs may
        # not work, as some jobs are removed from that list for various
        # reasons (e.g. they have recently had a request made for
        # them), so instead use the full jobs list and check for the
        # idle state.
        idle_job_numbers = []
        potential_jobs = []
        for job in tenant.jobs:
            if job.status == '1':
                idle_job_numbers.append(job.id)
                potential_jobs.append(job)
        # Get requests that do not belong to idle jobs
        reqs = get_orphaned_requests(tenant, ids_to_check,
                                     idle_job_numbers)
        # If there are any orphaned requests, try to reassign them to
        # another idle job
        for req in reqs:
            for job in potential_jobs:
                # Try to migrate it. If it works, move to the next
                # request; otherwise try the next job.
                if migrate_request_to_job(req, job):
                    # Remove the job from idle_jobs so it doesn't also
                    # get a request made for it this round
                    if job in tenant.idle_jobs:
                        tenant.idle_jobs.remove(job)
                    break
def process_resources(tenants):
    """
    Manage all of the existing AWS resources and requests.
    """
    # Update the DB with newly fulfilled instances
    logger.debug("Processing sim requests.")
    update_database(tenants)
    # Migrate any requests that still exist for a resource that is not
    # going to use them
    migrate_requests(tenants)
    # Stop any unnecessary spot requests (still launching without any
    # idle jobs)
    cancel_unmigrated_requests(tenants)
def process_idle_jobs(self, tenants):
    """ Go through the queue and ignore fulfilled jobs etc. """
    t1 = datetime.datetime.now()
    ignore_fulfilled_jobs(tenants)
    t2 = datetime.datetime.now()
    # Stop resources being requested too frequently
    stop_over_requesting(tenants)
    t3 = datetime.datetime.now()
    ig_time = (t2 - t1).total_seconds()
    over_time = (t3 - t2).total_seconds()
    logger.debug("SIMULATION load times: ignore (%s), over req (%s)" %
                 (ig_time, over_time))
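# stop_over_requesting() is called above but defined elsewhere. A
# minimal sketch of the idea only: drop jobs that were requested for
# too recently. Both get_last_request_time() and the request_rate
# setting are hypothetical names, not the provisioner's actual API.
def stop_over_requesting(tenants):
    """ Drop jobs from idle_jobs that were requested for too recently. """
    for tenant in tenants:
        for job in list(tenant.idle_jobs):
            last_req = get_last_request_time(tenant, job)  # hypothetical
            if last_req is None:
                continue
            elapsed = (ProvisionerConfig().simulate_time -
                       last_req).total_seconds()
            if elapsed < ProvisionerConfig().request_rate:  # assumed setting
                tenant.idle_jobs.remove(job)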
def request_spot_instances(self, price, image_id, subnet_id, count,
                           key_name, security_group_ids, instance_type,
                           user_data, block_device_map, job):
    # This needs to make a request, not an instance; the simulator
    # translates requests into instances after a little while.
    simid = "%s-sim-req-%s" % (ProvisionerConfig().run_name, self.reqid)
    sleep_time = float(random.choice(self.fulfilled_time_dist))
    self.reqid = self.reqid + 1
    new_request = SimRequest(price, subnet_id, instance_type, simid,
                             int(sleep_time), job.id)
    self.requests.append(new_request)
    # The sleep moved to a different spot, so requests are now done as
    # a batch too:
    # time.sleep(ProvisionerConfig().overhead_time)
    logger.debug("SIMULATION: creating new request %s - sleep for %s" %
                 (new_request, sleep_time))
    return [simid]
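# SimRequest is constructed above but not shown in this section. A
# minimal sketch of the record, inferred from the attributes read in
# run_aws() below (ready_time, sleep_time, request_time,
# job_runner_id); the real class may differ.
class SimRequest(object):
    def __init__(self, price, subnet, instance_type, reqid, sleep_time,
                 job_runner_id):
        self.price = price
        self.subnet = subnet
        self.type = instance_type
        self.reqid = reqid
        self.sleep_time = sleep_time
        self.job_runner_id = job_runner_id
        # The request is fulfilled once the simulated clock passes
        # request_time + sleep_time
        self.request_time = ProvisionerConfig().simulate_time
        self.ready_time = self.request_time + \
            datetime.timedelta(seconds=sleep_time)

    def __repr__(self):
        return "SimRequest(%s, %s, %s)" % (self.reqid, self.type,
                                           self.price)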
def cancel_unmigrated_requests(tenants):
    """
    There are two cases to handle here. Either there are no idle jobs,
    so all requests should be cancelled, or there are idle jobs but the
    existing requests could not be migrated to them. In that case, any
    orphaned requests should also be cancelled.
    """
    for tenant in tenants:
        # Start by grabbing all of the open spot requests for this tenant
        conn = boto.connect_ec2(tenant.access_key, tenant.secret_key)
        reqs = conn.get_all_spot_instance_requests(filters={
            "tag-value": tenant.name,
            "state": "open"})
        # Get a list of ids that can be used in a db query
        ids_to_check = []
        for r in reqs:
            ids_to_check.append("%s" % r.id)
        # Get the set of idle job numbers
        idle_job_numbers = []
        for job in tenant.jobs:
            if job.status == '1':
                idle_job_numbers.append(job.id)
        # Now get all of the orphaned requests
        reqs = get_orphaned_requests(tenant, ids_to_check,
                                     idle_job_numbers)
        # Build a list of ids we can pass to boto
        reqs_to_cancel = []
        for req in reqs:
            reqs_to_cancel.append(req['request_id'])
        # Now cancel only the orphaned requests, not every open request
        try:
            if len(reqs_to_cancel) > 0:
                logger.debug("Cancelling unmigrated requests: %s" %
                             reqs_to_cancel)
                conn.cancel_spot_instance_requests(reqs_to_cancel)
        except Exception as e:
            logger.exception("Error removing spot instance requests.")
            raise e
def request_resources(tenant):
    """ Request the resources that have been selected for each job. """
    conn = None
    output_string = "Name: %s\n" % tenant.name
    output_string = "%sTenant: %s\n" % (output_string, tenant.name)
    instance_req_string = ""
    req_cpus = 0
    req_instances = 0
    for job in tenant.idle_jobs:
        if job.fulfilled is False:
            request = job.launch
            if request is None:
                logger.debug("Failed to find request object for job %s"
                             % job)
                continue
            # Increment some counters
            req_instances += int(request.count)
            req_cpus += int(job.req_cpus)
            if request.ondemand:
                # Launch the on-demand request
                launch_ondemand_request(conn, request, tenant, job)
                instance_req_string = (
                    "%sONDEMAND_INSTANCE_REQUEST\t%s\t%s\t%s\t%s\t%s\n"
                    % (instance_req_string, tenant.name,
                       request.instance_type, request.bid, job.id,
                       "ondemand"))
            else:
                # Launch the spot request
                # TODO batch request instances of the same type
                req_ids = launch_spot_request(conn, request, tenant, job)
                for req in req_ids:
                    instance_req_string = (
                        "%sSPOT_INSTANCE_REQUEST\t%s\t%s\t%s\t%s\t%s\t"
                        "DrAFTS: %s\t%s\n" %
                        (instance_req_string, tenant.name,
                         request.instance_type, request.bid, job.id,
                         "spot", request.DrAFTS, req))
def load_drafts_data(self):
    """
    To speed things up, load in all of the DrAFTS data once per
    provisioning cycle.
    """
    cur_time = datetime.datetime.utcnow()
    if ProvisionerConfig().simulate:
        cur_time = ProvisionerConfig().simulator.get_fake_time()
    # Only consider predictions from the last ten minutes
    minus_ten = cur_time - datetime.timedelta(seconds=600)
    query = ("select * from drafts_price where timestamp < "
             "'%s'::TIMESTAMP and timestamp > '%s'::TIMESTAMP") % (
        cur_time.strftime("%Y-%m-%d %H:%M"),
        minus_ten.strftime("%Y-%m-%d %H:%M"))
    self.drafts_data = []
    logger.debug('getting drafts data: ' + query)
    rows = ProvisionerConfig().dbconn.execute(query)
    for row in rows:
        data = {'time': row['time'], 'price': row['price'],
                'zone': row['zone'], 'type': row['type']}
        self.drafts_data.append(data)
def instance_acquired(inst, request, tenant, conn):
    """
    A new instance has been acquired, so insert a record into the
    instance table and tag it with the tenant name.
    """
    launch_time = datetime.datetime.strptime(inst.launch_time,
                                             "%Y-%m-%dT%H:%M:%S.000Z")
    # Insert it into the database
    ProvisionerConfig().dbconn.execute(
        ("insert into instance (request_id, instance_id, "
         "fulfilled_time, public_dns, private_dns) values "
         "('%s', '%s', '%s', '%s', '%s')") %
        (request['id'], inst.id, launch_time, inst.public_dns_name,
         inst.private_dns_name))
    logger.debug("An instance has been acquired. "
                 "Tenant={0}; Request={1}; Instance={2}".format(
                     tenant.name, repr(request), repr(inst)))
    # Update the launch stats table too
    update_launch_stats(inst, request, conn)
    # Now tag the request
    api.tag_requests(inst.id, tenant.name, conn)
    # If the job is still in the idle queue, remove it, as an instance
    # has now been launched for it
    for job in tenant.jobs:
        logger.debug("Checking {0} vs {1}".format(repr(job),
                                                  repr(request)))
        if int(job.id) == int(request['job_runner_id']):
            logger.debug("Launched an instance for job %s - removing "
                         "it." % request['job_runner_id'])
            job.fulfilled = True
def get_global_queue(self):
    """
    Read in the jobs that should have started prior to the current sim
    time. Create a new Job object for each, then return the list.
    """
    if self.job_data is None:
        with open(ProvisionerConfig().jobs_file) as data_file:
            logger.debug("SIMULATION: READING DATA")
            self.job_data = json.load(data_file)
    # NOTE: this does not work for multiple tenants, as the queue is
    # stored in self.jobs. Change it back to re-reading the full file
    # every time if multiple tenants are needed.
    # Work out how many seconds have passed since starting the test
    rel_time = (ProvisionerConfig().simulate_time -
                ProvisionerConfig().sim_time).total_seconds()
    to_delete = []
    for j in self.job_data:
        if int(j['relative_time']) < rel_time:
            to_delete.append(j)
            description = {}
            description['instype'] = j['instance_type']
            description['duration'] = float(j['duration'])
            req_time = ProvisionerConfig().sim_time + \
                datetime.timedelta(seconds=int(j['relative_time']))
            newjob = Job('tenant_addr',
                         "%s%s" % (j['id'], ProvisionerConfig().run_id),
                         1, req_time, 1, 1, 1, description)
            self.jobs.append(newjob)
        else:
            # job_data is assumed ordered by relative_time, so stop at
            # the first job that is still in the future
            break
    for j in to_delete:
        self.job_data.remove(j)
    return self.jobs
def instance_acquired(inst, request, tenant, conn):
    """
    A new instance has been acquired, so insert a record into the
    instance table and tag it with the tenant name.
    """
    launch_time = ProvisionerConfig().simulator.get_fake_time()
    # Insert it into the database
    ProvisionerConfig().dbconn.execute(
        ("insert into instance (request_id, instance_id, "
         "fulfilled_time, public_dns, private_dns) values "
         "('%s', '%s', '%s', '%s', '%s')") %
        (request['id'], inst.id, launch_time, 'pubdns', 'privdns'))
    logger.debug("An instance has been acquired. "
                 "Tenant={0}; Request={1}; Instance={2}".format(
                     tenant.name, repr(request), repr(inst)))
    # Now tag the request
    api.tag_requests(inst.id, tenant.name, conn)
    # If the job is still in the idle queue, remove it, as an instance
    # has now been launched for it
    for job in tenant.jobs:
        logger.debug("Checking {0} vs {1}".format(repr(job),
                                                  repr(request)))
        if job.id == request['job_runner_id']:
            logger.debug("Launched an instance for job %s - removing "
                         "it." % request['job_runner_id'])
            job.fulfilled = True
def process_global_queue(self, jobs, tenants):
    """
    Associate each job with a tenant and add it to the tenant's local
    list of jobs.
    """
    for tenant in tenants:
        tenant.jobs = []
        tenant.idle_jobs = []
        # Work out, per tenant, how long a job must have been idle
        # before it is considered, then only add jobs that are old
        # enough and in the idle state
        for job in jobs:
            tenant.jobs.append(job)
            job_idle_at = job.req_time + \
                datetime.timedelta(seconds=tenant.idle_time)
            if (int(job.status) == 1 and
                    job_idle_at < ProvisionerConfig().simulate_time):
                tenant.idle_jobs.append(job)
        logger.debug("SIMULATION: job len = %s" % len(tenant.jobs))
def load_jobs(self, tenants):
    """
    Read in the condor queue and manage the removal of jobs that
    should not be processed.
    """
    # Assess the global queue: clear out the lists, then reload them
    for t in tenants:
        t.idle_jobs = []
        t.jobs = []
    t1 = datetime.datetime.now()
    all_jobs = self.get_global_queue()
    t2 = datetime.datetime.now()
    if ProvisionerConfig().simulate:
        if ProvisionerConfig().relative_time is None:
            self.job_data = None
            utc = timezone('UTC')
            ProvisionerConfig().relative_time = datetime.datetime.now(utc)
    # Associate the jobs from the global queue with each of the tenants
    self.process_global_queue(all_jobs, tenants)
    t3 = datetime.datetime.now()
    ignore_fulfilled_jobs(tenants)
    t4 = datetime.datetime.now()
    # Stop resources being requested too frequently
    stop_over_requesting(tenants)
    t5 = datetime.datetime.now()
    queue_time = (t2 - t1).total_seconds()
    process_time = (t3 - t2).total_seconds()
    ignore_time = (t4 - t3).total_seconds()
    stop_time = (t5 - t4).total_seconds()
    logger.debug("SIMULATION load times: queue (%s), process (%s), "
                 "ignore (%s), stop (%s)" %
                 (queue_time, process_time, ignore_time, stop_time))
def migrate_request_to_job(request, job):
    """
    Check if an instance request can be repurposed for another job and
    update the database accordingly.
    """
    # Check to see if the job can be fulfilled by the requested instance
    if check_requirements(request['type'], job):
        next_idle_job_id = job.id
        try:
            logger.debug(
                "Migrating instance request %s from job %s to job %s."
                % (request['id'], request['job_runner_id'],
                   next_idle_job_id))
            ProvisionerConfig().dbconn.execute(
                ("update instance_request set job_runner_id = '%s' "
                 "where id = %s") % (next_idle_job_id, request['id']))
            ProvisionerConfig().dbconn.execute(
                ("insert into request_migration "
                 "(request_id, from_job, to_job, migration_time) "
                 "values (%s, %s, %s, NOW())") %
                (request['id'], request['job_runner_id'],
                 next_idle_job_id))
            return True
        except psycopg2.Error:
            logger.exception("Error performing migration in database.")
    # Fall through: the request could not be migrated to this job
    return False
def run_condor(self, tenants):
    """
    Act as the condor agent. This manages putting jobs on the
    resources etc.
    """
    logger.debug("SIMULATION CONDOR: starting.")
    instance_types = ProvisionerConfig().instance_types
    current_time = ProvisionerConfig().simulate_time
    # Run through the jobs and set their states so they are ignored by
    # other things
    for t in tenants:
        for job in list(t.jobs):
            if job.id in self.finished_jobs:
                job.sim_status = "FINISHED"
                if job in t.idle_jobs:
                    t.idle_jobs.remove(job)
                t.jobs.remove(job)
            elif job.id in self.executing_jobs:
                job.sim_status = "EXECUTING"
                if job in t.idle_jobs:
                    t.idle_jobs.remove(job)
    # Wrap up any jobs whose resources have passed their finish time
    for t in tenants:
        for job in t.jobs:
            for resource in self.resources:
                if (job.id == resource.job_id and
                        resource.job_finish is not None and
                        resource.job_finish < current_time):
                    # Mark it as all done
                    job.sim_status = "FINISHED"
                    resource.state = "IDLE"
                    if job.id in self.executing_jobs:
                        self.executing_jobs.remove(job.id)
                    if job.id not in self.finished_jobs:
                        self.finished_jobs = self.finished_jobs + \
                            [job.id]
                    logger.debug("SIMULATION CONDOR: Finished "
                                 "job %s." % (job.id))
                    ProvisionerConfig().dbconn.execute(
                        ("update jobs set end_time = '%s' "
                         "where job_id = %s and test = '%s';") %
                        (ProvisionerConfig().simulate_time,
                         int(job.id), ProvisionerConfig().run_name))
    logger.debug("SIMULATION CONDOR: deploying new jobs.")
    for t in tenants:
        for job in t.jobs:
            # Check if the job can fit on an idle instance
            if job.sim_status == "IDLE":
                self.deploy_job(job)
def get_timeout_ondemand(self, job, tenant, instances):
    """
    Check to see if the job now requires an on-demand instance due to
    timing out.
    """
    cur_time = datetime.datetime.now()
    cur_time = calendar.timegm(cur_time.timetuple())
    time_idle = 0
    if ProvisionerConfig().simulate:
        cur_time = ProvisionerConfig().simulate_time
        time_idle = (ProvisionerConfig().simulate_time -
                     job.req_time).total_seconds()
    else:
        time_idle = cur_time - int(job.req_time)
    res_instance = None
    # If the tenant has set a timeout and the job has been idle longer
    # than that, pick the cheapest on-demand option
    if tenant.timeout > 0 and time_idle > tenant.timeout:
        # Sort the eligible instances by their on-demand price (odp)
        sorted_instances = sorted(instances, key=lambda k: k.odp)
        logger.debug("Selecting ondemand instance: %s" % str(job.launch))
        res_instance = sorted_instances[0]
    return res_instance
def check_ondemand_needed(self, tenant, sorted_instances, job):
    # Check to see if an ondemand instance is required due to timeout
    needed = False
    launch_instance = self.get_timeout_ondemand(job, tenant,
                                                sorted_instances)
    cheapest = sorted_instances[0]
    # Check to see if the job timed out
    if (launch_instance is not None and
            launch_instance.odp < tenant.max_bid_price):
        job.launch = aws.Request(launch_instance, launch_instance.type,
                                 "", launch_instance.ami, 1,
                                 launch_instance.odp, True)
        logger.debug("Selected to launch on demand due to timeout: %s"
                     % str(job.launch))
        needed = True
    # Check if the job is flagged as needing on-demand
    elif job.ondemand:
        needed = True
    # If the cheapest option is on-demand
    elif cheapest.ondemand and cheapest.odp < tenant.max_bid_price:
        job.launch = cheapest
        logger.debug("Selected to launch on demand due to ondemand "
                     "being cheapest: %s" % repr(cheapest))
        needed = True
    # Or if the cheapest spot option is close in price to on-demand,
    # use on-demand anyway
    elif (cheapest.price >
            (ProvisionerConfig().ondemand_price_threshold *
             float(cheapest.odp)) and
            cheapest.price < tenant.max_bid_price):
        job.launch = cheapest
        logger.debug("Selected to launch on demand due to spot price "
                     "being close to ondemand price: %s" %
                     repr(cheapest))
        needed = True
    return needed
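# A small worked example of the threshold test above. The 0.8 value is
# an assumption; ondemand_price_threshold comes from configuration.
# With ondemand_price_threshold = 0.8, an on-demand price (odp) of
# $0.10/hr and a current spot price of $0.09/hr:
#   0.09 > 0.8 * 0.10 = 0.08
# so the spot price is considered too close to on-demand and the
# on-demand instance is launched instead.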
def migrate_instance():
    """
    A placeholder for where the migration of instances will fit in.
    """
    logger.debug("Migration not yet supported.")
def launch_spot_request(conn, request, tenant, job):
    try:
        logger.debug("%s = %s. tenants vpc = %s" %
                     (request.zone, tenant.subnets[request.zone],
                      tenant.vpc))
        cost_aware_req = job.cost_aware
        drafts_req = job.cost_aware
        drafts_avg = job.cost_aware
        # Build a block device mapping with a 10GB root volume and the
        # four ephemeral drives
        mapping = BlockDeviceMapping()
        sda1 = BlockDeviceType()
        eph0 = BlockDeviceType()
        eph1 = BlockDeviceType()
        eph2 = BlockDeviceType()
        eph3 = BlockDeviceType()
        sda1.size = 10
        eph0.ephemeral_name = 'ephemeral0'
        eph1.ephemeral_name = 'ephemeral1'
        eph2.ephemeral_name = 'ephemeral2'
        eph3.ephemeral_name = 'ephemeral3'
        mapping['/dev/sda1'] = sda1
        mapping['/dev/sdb'] = eph0
        mapping['/dev/sdc'] = eph1
        mapping['/dev/sdd'] = eph2
        mapping['/dev/sde'] = eph3
        inst_req = conn.request_spot_instances(
            price=request.bid,
            image_id=request.ami,
            subnet_id=tenant.subnets[request.zone],
            count=request.count,
            key_name=tenant.key_pair,
            security_group_ids=[tenant.security_group],
            instance_type=request.instance_type,
            user_data=customise_cloudinit(tenant, job),
            block_device_map=mapping)
        my_req_ids = [req.id for req in inst_req]
        for req in my_req_ids:
            insert_launch_stats(req, request, tenant)
            # Tag each request
            tag_requests(req, tenant.name, conn)
            ProvisionerConfig().dbconn.execute(
                ("insert into instance_request (tenant, instance_type, "
                 "price, job_runner_id, request_type, request_id, "
                 "subnet, cost_aware_ins, cost_aware_bid, "
                 "cost_aware_subnet, drafts_ins, drafts_bid, "
                 "drafts_subnet, selected_avg_price, "
                 "cost_aware_avg_price, drafts_avg_price, "
                 "drafts_avg_ins, drafts_avg_bid, drafts_avg_subnet, "
                 "drafts_avg_avg_price) values ('%s', '%s', %s, %s, "
                 "'%s', '%s', %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
                 "%s, %s, %s, %s)") %
                (tenant.db_id, request.instance.db_id, request.price,
                 job.id, "spot", req,
                 tenant.subnets_db_id[request.zone],
                 cost_aware_req.instance.db_id, cost_aware_req.bid,
                 tenant.subnets_db_id[cost_aware_req.zone],
                 drafts_req.instance.db_id, drafts_req.DrAFTS,
                 tenant.subnets_db_id[drafts_req.zone],
                 request.AvgPrice, cost_aware_req.AvgPrice,
                 drafts_req.AvgPrice, drafts_avg.instance.db_id,
                 drafts_avg.DrAFTS,
                 tenant.subnets_db_id[drafts_avg.zone],
                 drafts_avg.AvgPrice))
        return my_req_ids
    except boto.exception.EC2ResponseError:
        logger.exception("There was an error communicating with EC2.")
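# tag_requests() is used above (and as api.tag_requests elsewhere) but
# defined outside this section. A minimal sketch using boto's
# create_tags(); the tag key name and the retry loop are assumptions,
# the latter to cope with requests not yet visible to the tagging API.
def tag_requests(req_id, tag, conn):
    """ Tag a spot request with the tenant name. """
    for attempt in range(5):
        try:
            conn.create_tags([req_id], {"tenant": tag})  # key assumed
            return
        except boto.exception.EC2ResponseError:
            # The request may not be visible yet; back off and retry
            time.sleep(1)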
def get_global_queue(self):
    """
    Poll condor_q -global and return a set of Jobs.
    """
    cmd = [
        'condor_q', '-global',
        '-format', '%s:', 'GlobalJobId',
        '-format', '%s:', 'ClusterId',
        '-format', '%s:', 'JobStatus',
        '-format', '%s:', 'QDate',
        '-format', '%s:', 'RequestCpus',
        '-format', '%s:', 'RequestMemory',
        '-format', '%s:', 'RequestDisk',
        '-format', '%s', 'JobDescription',
        '-format', '%s\n', 'ExitStatus'
    ]
    output = subprocess.Popen(cmd,
                              stdout=subprocess.PIPE).communicate()[0]
    queue = output.split("\n")
    queue = filter(None, queue)
    jobs = []
    if len(queue) > 0:
        # Record the time of the first job if this is it
        if ProvisionerConfig().first_job_time is None:
            logger.debug("Simulation: first job time set")
            utc = timezone('UTC')
            ProvisionerConfig().first_job_time = \
                datetime.datetime.now(utc)
    for line in queue:
        if "All queues are empty" in line:
            break
        try:
            split = line.split(":")
            tenant_addr = ""
            # Grab the address of the tenant from the global id
            if "#" in split[0]:
                tenant_addr = split[0].split("#")[0]
            # RequestMemory is either a number (in MB) or a string
            # describing the requested memory, so check whether it
            # parses as a number
            req_memory = 0
            try:
                req_memory = int(split[5])
                if req_memory > 1024:
                    # Change it to GB, like the instance types
                    req_memory = req_memory / 1024
            except Exception:
                pass
            # RequestDisk is handled the same way; it is also in MB
            req_disk = 0
            try:
                req_disk = int(split[6])
                if req_disk > 1024:
                    # Change it to GB, like the instance types
                    req_disk = req_disk / 1024
            except Exception:
                pass
            # Decipher the description of the job as well (name, etc.)
            description = {}
            if "=" in split[7]:
                description = self.process_job_description(split[7])
            # Create the job: tenant address, cluster id, status, queue
            # time, requested cpus, memory, disk, and the description
            j = Job(tenant_addr, split[1], split[2], split[3], split[4],
                    req_memory, req_disk, description)
            jobs.append(j)
        except Exception as e:
            logger.exception("Something has gone wrong while "
                             "processing the job queue.")
            raise e
    return jobs
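# process_job_description() is called above but not shown. A minimal
# sketch, assuming the description is a ";"-separated list of
# key=value pairs (e.g. "instype=m3.medium;duration=3600"); the real
# format may differ.
def process_job_description(self, desc):
    """ Parse a job description string into a dict. """
    description = {}
    for pair in desc.split(";"):
        if "=" in pair:
            key, value = pair.split("=", 1)
            description[key.strip()] = value.strip()
    return description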
def get_potential_instances(self, eligible_instances, job, tenant):
    """
    Make a list of all <type, zone> and <type, ondemand> pairs, then
    order them by price.
    """
    unsorted_instances = []
    # Add an entry for each instance type as ondemand, plus one per
    # spot zone, so everything can be sorted to pick the cheapest
    for ins in eligible_instances:
        unsorted_instances.append(aws.Request(
            ins, ins.type, "", ins.ami, 1, 0, True, ins.ondemand,
            ins.ondemand, ins.ondemand, ins.ondemand, ins.ondemand))
        # Don't bother adding spot prices if it is an ondemand request
        if not job.ondemand:
            DrAFTS = None
            OraclePrice = None
            for zone, price in ins.spot.iteritems():
                if (ProvisionerConfig().DrAFTS or
                        ProvisionerConfig().DrAFTSProfiles):
                    DrAFTS, OraclePrice = self.get_DrAFTS_bid(
                        ins.type, zone, job, price)
                    if DrAFTS is None or OraclePrice is None:
                        # Try again; if the prices still aren't found
                        # it is because they don't exist, so substitute
                        # a large value to skip this zone
                        DrAFTS, OraclePrice = self.get_DrAFTS_bid(
                            ins.type, zone, job, price)
                    if DrAFTS is None:
                        DrAFTS = 1000
                    if OraclePrice is None:
                        OraclePrice = 1000
                if ProvisionerConfig().DrAFTS:
                    unsorted_instances.append(aws.Request(
                        ins, ins.type, zone, ins.ami, 1, 0, False,
                        ins.ondemand, DrAFTS, 0, 0, 0))
                elif ProvisionerConfig().DrAFTSProfiles:
                    unsorted_instances.append(aws.Request(
                        ins, ins.type, zone, ins.ami, 1, 0, False,
                        ins.ondemand, OraclePrice, 0, 0, 0))
                else:
                    unsorted_instances.append(aws.Request(
                        ins, ins.type, zone, ins.ami, 1, 0, False,
                        ins.ondemand, price, 0, 0, 0))
                logger.debug('%s, %s spot: %s drafts: %s profile: %s' %
                             (ins.type, zone, price, DrAFTS,
                              OraclePrice))
    # Now sort all of these candidates by price
    if ProvisionerConfig().DrAFTS:
        # Sort by the DrAFTS price and then by the current spot price,
        # so the cheapest AZ ends up at the top of the list
        sorted_instances = sorted(unsorted_instances,
                                  key=lambda k: (k.DrAFTS, k.price))
    elif ProvisionerConfig().DrAFTSProfiles:
        sorted_instances = sorted(unsorted_instances,
                                  key=lambda k: (k.OraclePrice,
                                                 k.price))
    else:
        sorted_instances = sorted(unsorted_instances,
                                  key=lambda k: k.price)
    return sorted_instances
def get_DrAFTS_bid(self, ins, zone, job, cur_price):
    """
    Pull the DrAFTS price for this instance type. This gets the
    nearest value greater than 1 hour and, for the oracle price, the
    nearest value greater than the job's expected duration.
    """
    # example: http://128.111.84.183/vpc/us-east-1a-c3.2xlarge.pgraph
    try:
        ret_drafts = None
        ret_oracle = None
        if ProvisionerConfig().drafts_stored_db:
            # Use the mapping between AZs to pick a zone name
            mapped_zone = self.drafts_mapping[zone]
            logger.debug('drafts zone: %s' % mapped_zone)
            for row in self.drafts_data:
                if (row['type'] == ins and mapped_zone == row['zone']
                        and float(row['price']) > float(cur_price)):
                    time = row['time']
                    cost = row['price']
                    if ret_drafts is None and float(time) > 1:
                        ret_drafts = Decimal(str(cost))
                    if (ret_oracle is None and float(time) >
                            (float(job.duration) / 3600)):
                        ret_oracle = Decimal(str(cost))
            return ret_drafts, ret_oracle
        else:
            # Use the mapping between AZs to pick a zone name
            mapped_zone = self.drafts_mapping[zone]
            addr = 'http://128.111.84.183/vpc/%s-%s.pgraph' % (
                mapped_zone, ins)
            req = requests.get(addr)
            output = req.text
            # Split the result by line
            lines = output.split("\n")
            ret_drafts = None
            # Define these out here so that if the loop runs off the
            # end of the data, the previous values can be used
            cost = None
            time = None
            for line in lines:
                # Extract the time and cost from the line
                try:
                    time = line.split(" ")[0]
                    cost = line.split(" ")[1]
                except Exception as y:
                    logger.error("drafts: Failed here: %s %s" %
                                 (y, line))
                if float(time) > 1:
                    # This is the one we want to use
                    ret_drafts = Decimal(str(cost))
                    break
            # Now do the oracle price
            ret_oracle = None
            last = False
            for line in lines:
                # Extract the time and cost from the line
                try:
                    if len(line) > 5:
                        time = line.split(" ")[0]
                        cost = line.split(" ")[1]
                    else:
                        last = True
                        logger.debug("No prediction long enough in "
                                     "%s, using last one. %s %s" %
                                     (addr, time, cost))
                except Exception as z:
                    logger.error("oracle: failed here: %s %s" %
                                 (z, line))
                if last or float(time) > (float(job.duration) / 3600):
                    # This is the one we want to use
                    ret_oracle = Decimal(str(cost))
                    break
            return ret_drafts, ret_oracle
    except Exception as e:
        logger.debug("Failed to find DrAFTS price for %s. %s" %
                     (ins, e))
        return None, None
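# The .pgraph files fetched above are assumed to hold whitespace-
# separated "duration price" pairs, one prediction per line, with the
# duration in hours, e.g.:
#
#   0.5 0.0331
#   1.2 0.0340
#   6.0 0.0355
#
# Under that assumption, the first loop picks the first prediction
# longer than one hour, and the second picks the first prediction
# longer than the job's duration (job.duration / 3600 hours).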
def simulate(self, _tenants):
    """ Check the state of the simulation and print an overview. """
    self.tenants = _tenants
    jobs_list = []
    idle_jobs = []
    for t in self.tenants:
        for job in t.jobs:
            jobs_list = jobs_list + [job.id]
            if (job.sim_status == "IDLE" and
                    job.id not in self.finished_jobs and
                    job.id not in self.executing_jobs):
                idle_jobs = idle_jobs + [job.id]
    # Get some counts to print out
    terminated_time_instances = []
    terminated_price_instances = []
    starting_instances = []
    unclaimed_instances = []
    idle_instances = []
    executing_instances = []
    for res in self.resources:
        if res.state == "IDLE":
            idle_instances = idle_instances + [res.id]
        elif res.state == "EXECUTING":
            executing_instances = executing_instances + [res.id]
        elif res.state == "STARTING" or res.state == "CONTEXTUALIZING":
            starting_instances = starting_instances + [res.id]
        elif res.state == "UNCLAIMED":
            unclaimed_instances = unclaimed_instances + [res.id]
        elif res.state == "TERMINATED":
            if 'time' in res.reason:
                terminated_time_instances = \
                    terminated_time_instances + [res.id]
            if 'price' in res.reason:
                terminated_price_instances = \
                    terminated_price_instances + [res.id]
    logger.debug("\nSIMULATION OVERVIEW: requests (cur: %s -- total: "
                 "%s), resources (%s), jobs (%s)\n" %
                 (len(self.requests), self.reqid - 1,
                  len(self.resources), len(jobs_list)))
    logger.debug("\nSIMULATION JOB OVERVIEW: idle (%s), executing "
                 "(%s), finished (%s)\n" %
                 (len(idle_jobs), len(self.executing_jobs),
                  len(self.finished_jobs)))
    logger.debug("\nSIMULATION RESOURCE OVERVIEW: starting (%s), "
                 "idle (%s), unclaimed (%s), executing (%s), "
                 "terminated-time (%s), terminated-price (%s)\n" %
                 (len(starting_instances), len(idle_instances),
                  len(unclaimed_instances), len(executing_instances),
                  len(terminated_time_instances),
                  len(terminated_price_instances)))
    total_run_seconds = (ProvisionerConfig().simulate_time -
                         ProvisionerConfig().sim_time).total_seconds()
    logger.debug("\nSIMULATION TIME OVERVIEW: start time (%s), "
                 "current time (%s), seconds simulated (%s)" %
                 (ProvisionerConfig().sim_time,
                  ProvisionerConfig().simulate_time,
                  total_run_seconds))
    if total_run_seconds > self.kill_time:
        sys.exit()
    # Run through the jobs and set their states so they are ignored by
    # other things. Iterate over a copy since finished jobs are removed.
    for t in self.tenants:
        for job in list(t.jobs):
            if job.id in self.finished_jobs:
                job.sim_status = "FINISHED"
                if job in t.idle_jobs:
                    t.idle_jobs.remove(job)
                if job.id in self.executing_jobs:
                    self.executing_jobs.remove(job.id)
                t.jobs.remove(job)
            elif job.id in self.executing_jobs:
                job.sim_status = "EXECUTING"
                if job in t.idle_jobs:
                    t.idle_jobs.remove(job)
    # Try cleaning up the instance state too
    for res in self.resources:
        if (res.state == "EXECUTING" and
                res.job_id not in self.executing_jobs):
            # Somehow this one should have finished; wrap it up now
            res.job_id = None
            res.job_finish = None
            res.state = "IDLE"
            logger.debug("SIMULATION: Found an executing resource "
                         "that should be idle.")
def run_aws(self):
    """
    The AWS loop. Check whether requests should be fulfilled,
    resources should boot, and instances should terminate.
    """
    logger.debug("SIMULATION AWS: starting.")
    current_time = ProvisionerConfig().simulate_time
    logger.debug("SIMULATION AWS: running.")
    with self.lock:
        self.turn = 0
        # Check if any requests should be fulfilled
        for request in list(self.requests):
            if current_time >= request.ready_time:
                # Start a resource for this request
                insid = "%s-sim-ins-%s" % (
                    ProvisionerConfig().run_name, self.insid)
                self.insid = self.insid + 1
                logger.debug("SIMULATION AWS: creating a new resource "
                             "for request %s, has slept %s" %
                             (request, request.sleep_time))
                new_resource = SimResource(
                    request.price, request.subnet, request.type,
                    request.request_time, request.reqid, insid,
                    random.choice(self.contextualise_time_dist),
                    request.job_runner_id)
                self.resources = self.resources + [new_resource]
                # And remove the request since it is done
                self.instance_acquired(new_resource)
                self.requests.remove(request)
        # Now check to see if any instances should have booted by now.
        # This handles working out when the instance joins the
        # HTCondor queue and when jobs get dispatched.
        for resource in self.resources:
            # First check if the resource is contextualizing
            if (resource.state == 'CONTEXTUALIZING' and
                    current_time >= resource.context_time):
                # Switch it over to unclaimed with a negotiation delay
                resource.state = 'UNCLAIMED'
                wait_time = int(random.choice(self.negotiate_time_dist))
                resource.claimed_time = current_time + \
                    datetime.timedelta(seconds=wait_time)
            elif resource.state == 'UNCLAIMED':
                # Check if the negotiation timer has passed
                logger.debug(
                    'SIMULATION: trying to become claimed, '
                    '%s seconds remaining' %
                    (resource.claimed_time -
                     current_time).total_seconds())
                if (resource.claimed_time -
                        current_time).total_seconds() <= 0:
                    # Check if any jobs are in an idle state
                    new_claim = self.check_claim(resource)
                    if new_claim:
                        resource.state = 'IDLE'
                        logger.debug('Set resource to idle')
                        continue
                    else:
                        # Otherwise reset the timer so the resource
                        # tries to become claimed again later
                        resource.claimed_time = current_time + \
                            datetime.timedelta(seconds=int(
                                random.choice(
                                    self.negotiate_time_dist)))
                        resource.state = 'UNCLAIMED'
                        logger.debug(
                            'SIMULATION no idle job found, setting '
                            'back to UNCLAIMED')
        # Check if any instances should terminate due to time
        terminate_resources = {}
        for resource in self.resources:
            if ProvisionerConfig().terminate == "hourly":
                # Kill anything within 120 seconds of the end of its
                # billable hour (3600 - 120 = 3480)
                if (resource.state != 'EXECUTING' and
                        resource.state != 'TERMINATED' and
                        int((current_time - resource.launch_time
                             ).total_seconds() % 3600) > 3480):
                    logger.debug("SIMULATION AWS. Terminating "
                                 "resource due to time: %s" % resource)
                    resource.reason = "time related"
                    resource.state = "TERMINATED"
                    resource.terminate_time = self.get_fake_time()
            elif ProvisionerConfig().terminate == "1hour":
                # Kill anything that has been up for over 3480 seconds
                if (resource.state != 'EXECUTING' and
                        resource.state != 'TERMINATED' and
                        (current_time - resource.launch_time
                         ).total_seconds() > 3480):
                    logger.debug("SIMULATION AWS. Terminating "
                                 "resource due to time: %s" % resource)
                    resource.reason = "time related"
                    resource.state = "TERMINATED"
                    resource.terminate_time = self.get_fake_time()
            elif ProvisionerConfig().terminate == "idle":
                # Kill anything that has been idle for over 600 seconds
                if (resource.state == 'IDLE' and
                        int((current_time - resource.launch_time
                             ).total_seconds()) > 600):
                    logger.debug("SIMULATION AWS. Terminating "
                                 "resource due to time: %s" % resource)
                    resource.reason = "time related"
                    resource.state = "TERMINATED"
                    resource.terminate_time = self.get_fake_time()
            # Check for spot price terminations and put the job that
            # was running on this instance back to idle
            if resource.state != "TERMINATED":
                # Only do this check once per simulated minute
                if (ProvisionerConfig().simulate_time -
                        ProvisionerConfig().sim_time
                        ).total_seconds() % 60 == 0:
                    for t in self.tenants:
                        if resource.type in terminate_resources:
                            if (terminate_resources[resource.type]
                                    is False):
                                continue
                        if (float(resource.price) < float(
                                self.get_spot_prices(resource, t))):
                            logger.debug(
                                "SIMULATION: terminating resource "
                                "due to price %s" % resource)
                            for job in t.jobs:
                                if job.id == resource.job_id:
                                    job.sim_status = 'IDLE'
                                    if job.id in self.executing_jobs:
                                        self.executing_jobs.remove(
                                            job.id)
                                    if job not in t.idle_jobs:
                                        t.idle_jobs = t.idle_jobs + \
                                            [job]
                            # Now terminate the instance
                            logger.debug(
                                'terminating instance due to price: '
                                '%s %s' %
                                (resource.price,
                                 self.get_spot_prices(resource, t)))
                            resource.state = "TERMINATED"
                            resource.reason = (
                                "spot instance termination due to "
                                "spot price")
                            resource.terminate_time = \
                                self.get_fake_time()
                        else:
                            terminate_resources[resource.type] = False
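# check_claim() is called from run_aws() above but defined elsewhere.
# A minimal sketch under the assumption that a resource is claimed
# when some tenant has an idle job; a fuller version would also check
# that the job fits resource.type, as deploy_job() does.
def check_claim(self, resource):
    """ Return True if an idle job could claim this resource. """
    for t in self.tenants:
        for job in t.jobs:
            if (job.sim_status == "IDLE" and
                    job.id not in self.executing_jobs and
                    job.id not in self.finished_jobs):
                return True
    return False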
def select_instance_type(self, instances):
    """ Select the instance to launch for each idle job. """
    for tenant in self.tenants:
        for job in list(tenant.idle_jobs):
            if ProvisionerConfig().simulate:
                time.sleep(ProvisionerConfig().overhead_time)
            # Get the set of instance types that can be used for this
            # job
            eligible_instances = self.restrict_instances(job)
            if len(eligible_instances) == 0:
                logger.error("Failed to find any eligible instances "
                             "for job %s" % job)
                continue
            # Get all potential pairs and sort them
            sorted_instances = self.get_potential_instances(
                eligible_instances, job, tenant)
            if len(sorted_instances) == 0:
                logger.error("Failed to find any sorted instances "
                             "for job %s" % job)
                continue
            # Work out if an ondemand instance is needed
            job.ondemand = self.check_ondemand_needed(
                tenant, sorted_instances, job)
            # If ondemand is required, redo the sorted list with only
            # ondemand requests and set that to be the launched
            # instance
            if job.ondemand:
                sorted_instances = self.get_potential_instances(
                    eligible_instances, job, tenant)
                job.launch = sorted_instances[0]
                logger.debug("Launching ondemand for this job. %s" %
                             str(job.launch))
                continue
            # Otherwise we are now looking at launching a spot
            # request. Print out the options we are looking at.
            self.print_cheapest_options(sorted_instances)
            # Filter out a job if it has had too many requests made
            existing_requests = self.get_existing_requests(tenant, job)
            if len(existing_requests) >= \
                    ProvisionerConfig().max_requests:
                tenant.idle_jobs.remove(job)
                continue
            # Find the top request that hasn't already been requested
            # (i.e. the zone+type pair is not in existing_requests)
            for req in sorted_instances:
                if len(existing_requests) > 0:
                    # Skip this type if a matching request already
                    # exists
                    exists = False
                    for existing in existing_requests:
                        if (req.instance_type ==
                                existing.instance_type and
                                req.zone == existing.zone):
                            exists = True
                    if exists:
                        continue
                # Launch this type. This is getting more complicated
                # with multiple provisioning models.
                if req.price < tenant.max_bid_price:
                    req.bid = self.get_bid_price(job, tenant, req)
                    job.launch = req
                    job.cost_aware = req
                    break
                else:
                    logger.error("Unable to launch request %s as the "
                                 "price is higher than max bid %s." %
                                 (str(req), tenant.max_bid_price))
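# get_bid_price() is referenced above but not included in this
# section. A minimal sketch of one plausible policy: bid a margin
# above the current spot price, capped at the tenant's maximum. The
# 1.2 multiplier is an assumption, not the provisioner's actual
# policy.
def get_bid_price(self, job, tenant, req):
    """ Work out the bid to place for a chosen spot request. """
    bid = Decimal(str(float(req.price) * 1.2))  # margin is assumed
    if bid > tenant.max_bid_price:
        bid = tenant.max_bid_price
    return bid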
def print_cheapest_options(self, sorted_instances):
    # Print out the top three options
    logger.info("Top three to select from:")
    top_three = 3
    for ins in sorted_instances:
        if top_three == 0:
            break
        top_three = top_three - 1
        if ProvisionerConfig().DrAFTS:
            logger.info("DrAFTS: %s %s %s %s" %
                        (ins.instance_type, ins.zone, ins.price,
                         ins.DrAFTS))
        if ProvisionerConfig().DrAFTSAvgPrice:
            logger.info("DrAFTS Oracle Price: %s %s %s %s" %
                        (ins.instance_type, ins.zone, ins.price,
                         ins.OraclePrice))
def run(self):
    """
    Run the provisioner. This executes periodically and determines
    what actions need to be taken.
    """
    self.run_iterations = 0
    if ProvisionerConfig().simulate:
        self.sched = SimScheduler()
        ProvisionerConfig().load_instance_types()
        self.load_drafts_data()
        while True:
            self.run_iterations = self.run_iterations + 1
            # Load jobs
            t1 = datetime.datetime.now()
            start_time = datetime.datetime.now()
            self.load_tenants_and_jobs()
            t2 = datetime.datetime.now()
            # Simulate the world (mostly tidy things up and print
            # stats)
            ProvisionerConfig().simulator.simulate(self.tenants)
            t3 = datetime.datetime.now()
            self.sched.process_idle_jobs(self.tenants)
            tx = datetime.datetime.now()
            # Simulate Condor
            ProvisionerConfig().simulator.run_condor(self.tenants)
            t4 = datetime.datetime.now()
            # Simulate AWS
            ProvisionerConfig().simulator.run_aws()
            t5 = datetime.datetime.now()
            # Check if the simulation should finish (e.g. jobs and
            # resources all terminated)
            if ProvisionerConfig().simulator.check_finished():
                break
            self.manage_resources()
            t6 = datetime.datetime.now()
            if ((ProvisionerConfig().simulate_time -
                    ProvisionerConfig().sim_time).total_seconds() %
                    ProvisionerConfig().run_rate == 0):
                self.provision_resources()
            t7 = datetime.datetime.now()
            load_time = (t2 - t1).total_seconds()
            sim_time = (t3 - t2).total_seconds()
            proc_idle_time = (tx - t3).total_seconds()
            condor_time = (t4 - tx).total_seconds()
            aws_time = (t5 - t4).total_seconds()
            manage_time = (t6 - t5).total_seconds()
            prov_time = (t7 - t6).total_seconds()
            # Otherwise, step through time
            ProvisionerConfig().simulate_time = ProvisionerConfig(
            ).simulate_time + datetime.timedelta(seconds=2)
            logger.debug("RUN ID: %s. SIMULATION: advancing time "
                         "2 seconds" % ProvisionerConfig().run_id)
            logger.debug("SIMULATION times: load (%s), sim (%s), "
                         "proc_idle (%s), condor (%s), aws (%s), "
                         "manage (%s), prov (%s)" %
                         (load_time, sim_time, proc_idle_time,
                          condor_time, aws_time, manage_time,
                          prov_time))
    else:
        self.sched = CondorScheduler()
        while True:
            self.run_iterations = self.run_iterations + 1
            # Get the tenants from the database and process the
            # current condor_q, then assign those jobs to each tenant
            start_time = datetime.datetime.now()
            self.load_tenants_and_jobs()
            # Provisioning will fail if there are no tenants
            if len(self.tenants) > 0:
                # Handle all of the existing requests. This will
                # cancel or migrate excess requests and update the
                # database to reflect the state of the environment.
                self.manage_resources()
                # Work out the price for each instance type and
                # acquire resources for jobs
                self.provision_resources()
            # Wait "run_rate" seconds before trying again
            end_time = datetime.datetime.now()
            diff = (end_time - start_time).total_seconds()
            logger.debug("SCRIMP run loop: %s seconds. Now sleeping "
                         "%s seconds." %
                         (diff, ProvisionerConfig().run_rate))
            if diff < ProvisionerConfig().run_rate:
                time.sleep(ProvisionerConfig().run_rate - diff)
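# check_finished() is called in the simulation loop above but defined
# elsewhere. A minimal sketch: the run is complete once there are no
# open requests, no live resources, and no unfinished jobs. The exact
# condition is an assumption inferred from the loop's break.
def check_finished(self):
    """ Return True when the simulation has nothing left to do. """
    if len(self.requests) > 0:
        return False
    for res in self.resources:
        if res.state != "TERMINATED":
            return False
    for t in self.tenants:
        for job in t.jobs:
            if job.sim_status != "FINISHED":
                return False
    return True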