def run(self):
    self.cloud_name = self.group['cloud']

    if self.group['failure_rate'] == "None":
        # No failures, shut down the simulator
        LOG.info("Failure-Simulator-%s: failure rate is set to None. Terminating simulator" % (self.cloud_name))
        self.stop_event.set()
        return

    self.failure_rate = float(self.group['failure_rate'])
    self.interval = random.expovariate(self.failure_rate)

    while not self.stop_event.is_set():
        LOG.info("Failure-Simulator-%s: sleeping for %d sec" % (self.cloud_name, self.interval))
        self.stop_event.wait(self.interval)

        list_of_vms = self.get_cloud_termination_list()

        # continue as normal
        count = len(list_of_vms)
        if count > 0:
            pick = random.randint(0, count - 1)
            instance = list_of_vms[pick]
            LOG.info("Failure-Simulator-%s: terminating an instance %s (%s)"
                     % (self.cloud_name, instance.id, instance.public_dns_name))
            filelog(self.config.failure_log, "%s,TERMINATED,%s,%s"
                    % (time.time(), self.cloud_name, instance.public_dns_name))
            worker = Worker(self.config, instance)
            worker.terminate()  # terminates condor daemon and shuts down instance

        # Reset the sleep time (interval before the next failure)
        self.interval = random.expovariate(self.failure_rate)
def execute(self):
    filelog(self.remote_log, "Host: %s, User: %s, CMD: %s" % (self.hostname, self.user, self.command))

    while self.retry_count <= self.retry_limit:
        if os.path.isfile(self.ssh_private_key):
            context = fabric_api.settings(
                fabric_api.hide('running', 'stdout', 'stderr', 'warnings'),
                user=self.user,
                key_filename=[self.ssh_private_key],
                disable_known_hosts=True,
                linewise=True,
                warn_only=True,
                abort_on_prompts=True,
                always_use_pty=True,
                timeout=5,
                use_ssh_config=True)
        else:
            LOG.error("Path to ssh private key is invalid")
            return None

        try:
            # Clear fabric's connection cache so stale connections are not reused
            #print "my cache state is %s" % (str(state.connections))
            for host_key in state.connections.keys():
                state.connections.pop(host_key)
        except Exception as ex:
            print "Exception in dealing with fabric cache %s " % (str(ex))

        if context:
            with context:
                try:
                    fabric_api.env.host_string = self.hostname
                    results = fabric_api.run(self.command)
                    self.stdout = results.stdout
                    self.stderr = results.stderr
                    filelog(self.remote_log, "Error: %s" % (self.stderr))
                    filelog(self.remote_log, "Output: %s" % (self.stdout))
                    #print "return code from command %s is %s" % (self.command, str(results.return_code))
                    #print "stderr : %s" % (self.stderr)
                    return results.return_code
                except Exception as exptErr:
                    self.retry_count += 1
                    errmsg = str(exptErr)
                    LOG.info("Exception in running remote command: %s" % (errmsg))
                    time.sleep(self.retry_interval)
                    LOG.info("Trying to execute remote command again. Retry: %d/%d"
                             % (self.retry_count, self.retry_limit))
        else:
            LOG.error("Problem occurred while initializing fabric context")
            return None

    LOG.error("Could not execute remote command. Number of retries exceeded the limit")
    return None
def run(self): LOG.info("Activating AD. Sleep period: %d sec" % (self.interval)) jobs = Jobs(self.config, self.master.dns) while(not self.stop_event.is_set()): self.stop_event.wait(self.interval) curr_dict = self.get_current_dict() jobs.update_current_list() pool_dict_str = "%s," % (time.time()) for cloud_name, instance_count in curr_dict.iteritems(): pool_dict_str += "%s:%d," % (cloud_name,instance_count) pool_dict_str = pool_dict_str[:-1] filelog(self.config.worker_pool_log, pool_dict_str) diff_dict = {} for cloud_name in curr_dict: up_diff = self.desired_dict[cloud_name] - curr_dict[cloud_name] diff_dict[cloud_name] = up_diff for cloud_name in curr_dict: if curr_dict[cloud_name] > self.desired_dict[cloud_name]: LOG.info("Downscaling in %s" % (cloud_name)) down_diff = - diff_dict[cloud_name] candidates = self.get_cloud_instances_by_runtime_inc(cloud_name, jobs) termination_list = self.select_from_candidates(cloud_name, candidates, down_diff) for atuple in termination_list: instance_id = atuple[0] running = atuple[1] instance_info = atuple[2] dns = instance_info['public_dns'] LOG.info("AD terminated instance %s in %s" % (cloud_name, instance_id)) filelog(self.config.discarded_work_log, "DISCARDED,%s,%s,%s" % (cloud_name, dns, running)) filelog(self.config.node_log, "TERMINATED WORKER cloud: %s, instance: %s, dns: %s" % (cloud_name, instance_id, dns)) LOG.info("Desired capacity (before termination) is %d" % (self.phantom_client.asg.desired_capacity)) Worker(self.config, instance_id, instance_info).terminate_condor(self.master.dns) self.phantom_client.terminate_instance(instance_id) LOG.info("Desired capacity (after termination) is %d" % (self.phantom_client.asg.desired_capacity)) # figure out where to up scale # sort the diff dict to find cloud with max number of lack isntances ( up scale ) # [('c', 10), ('a', 3), ('b', 1)] # we sort dict by value, it returns list of tuples sorted_diff_dict = sorted(diff_dict.iteritems(), key=operator.itemgetter(1), reverse=True) if sorted_diff_dict[0][1] > 0: cloud_to_upscale = sorted_diff_dict[0][0] if cloud_to_upscale != cloud_name: # create new tag : current_cloud_tag = self.phantom_client.cloud_list.split(",") new_cloud_tag = "" new_cloud_count = 0 LOG.info("Current cloud tag is %s" % (self.phantom_client.cloud_list)) LOG.info("Current dict is %s" % (str(curr_dict))) LOG.info("Diff dict is %s" % (str(diff_dict))) for each_cloud in current_cloud_tag: tmp_cloud_name = each_cloud.split(":")[0] tmp_cloud_count = int(each_cloud.split(":")[1]) if tmp_cloud_name == cloud_to_upscale: new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count +1) curr_dict[tmp_cloud_name] += 1 diff_dict[tmp_cloud_name] -= 1 elif tmp_cloud_name == cloud_name: new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count - 1) curr_dict[tmp_cloud_name] -= 1 diff_dict[tmp_cloud_name] += 1 else: new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count) new_cloud_count += curr_dict[tmp_cloud_name] new_cloud_tag_no_comma = new_cloud_tag[:-1] LOG.info("New cloud tag is %s" % (new_cloud_tag_no_comma)) LOG.info("New Current dict is %s" % (str(curr_dict))) LOG.info("New Diff dict is %s" % (str(diff_dict))) LOG.info("New Desired capacity (after recounting) is %d" % (new_cloud_count)) self.phantom_client.update_tags(new_cloud_tag_no_comma, new_cloud_count) self.phantom_client.cloud_list = new_cloud_tag_no_comma self.phantom_client.asg.set_capacity(new_cloud_count) else: LOG.info("Trying to upscale and downscale in the same cloud .. STOPPED")
def run(self): LOG.info("Activating OO. Sleep period: %d sec" % (self.interval)) jobs = Jobs(self.config, self.master.dns) while(not self.stop_event.is_set()): self.stop_event.wait(self.interval) # Figure out what type of iteration this is: # - either when the counter equals the limit -- then try marking nodes offline # - any other iteration -- then only terminate idle instances if any in the marked_offline_list self.check_within_interval_counter += 1 if self.check_within_interval_counter == self.self.check_within_interval_limit: allow_marking_offline = True # Rest and go back to the beginning of the cycle self.check_within_interval_counter = 0 LOG.info("OI's iteration with marking nodes offline and termination of idle instances") else: allow_marking_offline = False LOG.info("OI's iteration with termination of previously marked instances") curr_dict = self.get_current_dict() jobs.update_current_list() pool_dict_str = "%s," % (time.time()) for cloud_name, instance_count in curr_dict.iteritems(): pool_dict_str += "%s:%d," % (cloud_name,instance_count) pool_dict_str = pool_dict_str[:-1] filelog(self.config.worker_pool_log, pool_dict_str) diff_dict = {} for cloud_name in curr_dict: up_diff = self.desired_dict[cloud_name] - curr_dict[cloud_name] diff_dict[cloud_name] = up_diff for cloud_name in curr_dict: if curr_dict[cloud_name] > self.desired_dict[cloud_name]: LOG.info("Downscaling in %s" % (cloud_name)) down_diff = - diff_dict[cloud_name] if not allow_marking_offline: # give me all idle_instances that are in self.marked_offline_list # only these instances are allowed to be terminated if self.marked_offline_list: candidates = self.get_candidates(cloud_name, jobs, down_diff, return_only_all_idle=True) idle_candidates = [] for cand in candidates: ins_id = cand[0] if tuple_id in self.marked_offline_list: idle_candidates.append(ins_id) LOG.info("Selecting idle offline instance for termination: %s" % (ins_id)) self.marked_offline_list.remove(ins_id) else: idle_candidates = [] else: idle_candidates, nonidle_candidates = self.get_candidates(cloud_name, jobs, down_diff, return_only_all_idle=False) for instance_tuple in idle_candidates: instance_id = instance_tuple[0] instance_info = instance_tuple[1] dns = instance_info['public_dns'] LOG.info("OO terminated idle instance %s in %s" % (cloud_name, instance_id)) filelog(self.config.discarded_work_log, "DISCARDED,%s,%s,%d" % (cloud_name, dns, 0)) filelog(self.config.node_log, "TERMINATED WORKER cloud: %s, instance: %s, dns: %s" % (cloud_name, instance_id, dns)) LOG.info("Desired capacity (before termination) is %d" % (self.phantom_client.asg.desired_capacity)) Worker(self.config, instance_id, instance_info).terminate_condor(self.master.dns) self.phantom_client.terminate_instance(instance_id) LOG.info("Desired capacity (after termination) is %d" % (self.phantom_client.asg.desired_capacity)) # upscale sorted_diff_dict = sorted(diff_dict.iteritems(), key=operator.itemgetter(1), reverse=True) if sorted_diff_dict[0][1] > 0: cloud_to_upscale = sorted_diff_dict[0][0] if cloud_to_upscale != cloud_name: # create new tag : current_cloud_tag = self.phantom_client.cloud_list.split(",") new_cloud_tag = "" new_cloud_count = 0 LOG.info("Current cloud tag is %s" % (self.phantom_client.cloud_list)) LOG.info("Current dict is %s" % (str(curr_dict))) LOG.info("Diff dict is %s" % (str(diff_dict))) for each_cloud in current_cloud_tag: tmp_cloud_name = each_cloud.split(":")[0] tmp_cloud_count = int(each_cloud.split(":")[1]) if tmp_cloud_name == cloud_to_upscale: new_cloud_tag 
+= "%s:%d," % (tmp_cloud_name, tmp_cloud_count +1) curr_dict[tmp_cloud_name] += 1 diff_dict[tmp_cloud_name] -= 1 elif tmp_cloud_name == cloud_name: new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count - 1) curr_dict[tmp_cloud_name] -= 1 diff_dict[tmp_cloud_name] += 1 else: new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count) new_cloud_count += curr_dict[tmp_cloud_name] new_cloud_tag_no_comma = new_cloud_tag[:-1] LOG.info("New cloud tag is %s" % (new_cloud_tag_no_comma)) LOG.info("New Current dict is %s" % (str(curr_dict))) LOG.info("New Diff dict is %s" % (str(diff_dict))) LOG.info("New Desired capacity (after recounting) is %d" % (new_cloud_count)) self.phantom_client.update_tags(new_cloud_tag_no_comma, new_cloud_count) self.phantom_client.cloud_list = new_cloud_tag_no_comma self.phantom_client.asg.set_capacity(new_cloud_count) else: LOG.info("Trying to upscale and downscale in the same cloud .. STOPPED") if allow_marking_offline: for instance_tuple in nonidle_candidates: instance_id = instance_tuple[0] instance_info = instance_tuple[1] dns = instance_info['public_dns'] LOG.info("OO marked instance offline %s in %s" % (cloud_name, instance_id)) filelog(self.config.node_log, "OFFLINED WORKER cloud: %s, instance: %s, dns: %s" % (cloud_name, instance_id, dns)) filelog(self.config.discarded_work_log, "OFFLINE,%s,%s,%d" % (cloud_name, dns, 0)) worker = Worker(self.config, instance_id, instance_info) worker.offline(self.master.dns) # marks node offline (it later becomes idle and get terminated) self.marked_offline_list.append(instance_id)
def run(self): LOG.info("Activating OI. Sleep period: %d sec" % (self.interval)) jobs = Jobs(self.config, self.master.dns) while(not self.stop_event.is_set()): self.stop_event.wait(self.interval) curr_dict = self.get_current_dict() jobs.update_current_list() pool_dict_str = "%s," % (time.time()) for cloud_name, instance_count in curr_dict.iteritems(): pool_dict_str += "%s:%d," % (cloud_name,instance_count) pool_dict_str = pool_dict_str[:-1] filelog(self.config.worker_pool_log, pool_dict_str) diff_dict = {} for cloud_name in curr_dict: up_diff = self.desired_dict[cloud_name] - curr_dict[cloud_name] diff_dict[cloud_name] = up_diff for cloud_name in curr_dict: if curr_dict[cloud_name] > self.desired_dict[cloud_name]: down_diff = - diff_dict[cloud_name] candidates = self.get_idle_instances(cloud_name, jobs) # Only terminate as many as needed termination_list = candidates[:down_diff] if termination_list: LOG.info("Downscaling in %s" % (cloud_name)) else: LOG.info("Not Downscaling because no idle instances found in %s" % (cloud_name)) for instance_tuple in termination_list: instance_id = instance_tuple[0] instance_info = instance_tuple[1] dns = instance_info['public_dns'] LOG.info("OI terminated instance %s in %s" % (cloud_name, instance_id)) filelog(self.config.discarded_work_log, "DISCARDED,%s,%s,%d" % (cloud_name, dns, 0)) filelog(self.config.node_log, "TERMINATED WORKER cloud: %s, instance: %s, dns: %s" % (cloud_name, instance_id, dns)) LOG.info("Desired capacity (before termination) is %d" % (self.phantom_client.asg.desired_capacity)) Worker(self.config, instance_id, instance_info).terminate_condor(self.master.dns) self.phantom_client.terminate_instance(instance_id) LOG.info("Desired capacity (after termination) is %d" % (self.phantom_client.asg.desired_capacity)) # upscale sorted_diff_dict = sorted(diff_dict.iteritems(), key=operator.itemgetter(1), reverse=True) if sorted_diff_dict[0][1] > 0: cloud_to_upscale = sorted_diff_dict[0][0] if cloud_to_upscale != cloud_name: # create new tag : current_cloud_tag = self.phantom_client.cloud_list.split(",") new_cloud_tag = "" new_cloud_count = 0 LOG.info("Current cloud tag is %s" % (self.phantom_client.cloud_list)) LOG.info("Current dict is %s" % (str(curr_dict))) LOG.info("Diff dict is %s" % (str(diff_dict))) for each_cloud in current_cloud_tag: tmp_cloud_name = each_cloud.split(":")[0] tmp_cloud_count = int(each_cloud.split(":")[1]) if tmp_cloud_name == cloud_to_upscale: new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count +1) curr_dict[tmp_cloud_name] += 1 diff_dict[tmp_cloud_name] -= 1 elif tmp_cloud_name == cloud_name: new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count - 1) curr_dict[tmp_cloud_name] -= 1 diff_dict[tmp_cloud_name] += 1 else: new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count) new_cloud_count += curr_dict[tmp_cloud_name] new_cloud_tag_no_comma = new_cloud_tag[:-1] LOG.info("New cloud tag is %s" % (new_cloud_tag_no_comma)) LOG.info("New Current dict is %s" % (str(curr_dict))) LOG.info("New Diff dict is %s" % (str(diff_dict))) LOG.info("New Desired capacity (after recounting) is %d" % (new_cloud_count)) self.phantom_client.update_tags(new_cloud_tag_no_comma, new_cloud_count) self.phantom_client.cloud_list = new_cloud_tag_no_comma self.phantom_client.asg.set_capacity(new_cloud_count) else: LOG.info("Trying to upscale and downscale in the same cloud .. STOPPED")
def __init__(self, config, clouds):
    self.config = config
    self.cloud = clouds.lookup_by_name(config.master.cloud)
    if self.cloud is None:
        LOG.error('Can\'t find a cloud "%s" specified for the master node' % (config.master.cloud))
        sys.exit(1)

    decision_made = False
    create = True
    while not decision_made:
        # input = raw_input("Create a new master node or reuse existing? (C/R)\n")
        input = "Create"
        if input in ("C", "c", "Create", "create"):
            create = True
            decision_made = True
        elif input in ("R", "r", "Reuse", "reuse"):
            create = False
            decision_made = True
        else:
            print("Invalid input. Please try again.\n")

    if create:
        LOG.info("Master node is going to be created in the cloud: %s" % (config.master.cloud))
        self.reservation = self.cloud.boot_image(config.master.image_id, count=1, type=config.master.instance_type)
        self.sleep_until_master_ready()
        self.determine_dns()
        filelog(self.config.node_log,
                "CREATED MASTER cloud: %s, instance: %s, dns: %s"
                % (self.cloud.name, self.instance_id, self.dns))
    else:
        # Reusing an existing instance as the master node
        LOG.info('One of the existing instances in cloud "%s" is going to be reused as a master node'
                 % (self.cloud.name))
        self.cloud.connect()
        master_selected = False
        while not master_selected:
            for reservation in self.cloud.conn.get_all_instances():
                instances = reservation.instances
                if len(instances) != 1:
                    LOG.info('Skipping reservation "%s" since it does not contain exactly one instance'
                             % (reservation.id))
                    continue
                instance = instances[0]
                printfile(self.config.node_log, "Log entries for instance %s:" % instance.id, instance.id)
                select_instance = raw_input(
                    'Select instance "%s" of reservation "%s" in cloud "%s" as a master node? (Y/N)\n'
                    % (instance.id, reservation.id, self.cloud.name))
                if is_yes(select_instance):
                    LOG.info("Master node has been selected. Instance: %s, Reservation: %s, Cloud: %s"
                             % (instance.id, reservation.id, self.cloud.name))
                    master_selected = True
                    self.reservation = reservation
                    self.determine_dns()
                    filelog(self.config.node_log,
                            "REUSED MASTER cloud: %s, reservation: %s, instance: %s, dns: %s"
                            % (self.cloud.name, self.reservation.id, self.instance_id, self.dns))
                    break
            if not master_selected:
                print("Master node has not been selected. Looping through the list of existing reservations again.")