Example #1
    def run(self):

        self.cloud_name = self.group['cloud']

        if self.group['failure_rate'] == "None":
            # No failures configured, shut down the simulator
            LOG.info("Failure-Simulator-%s: failure rate is set to None. Terminating simulator" % (self.cloud_name))
            self.stop_event.set()
            return

        self.failure_rate = float(self.group['failure_rate'])
        self.interval = random.expovariate(self.failure_rate)

        while not self.stop_event.is_set():
            LOG.info("Failure-Simulator-%s: sleeping for %d sec" % (self.cloud_name, self.interval))
            self.stop_event.wait(self.interval)

            list_of_vms = self.get_cloud_termination_list()

            # continue as normal
            count = len(list_of_vms)
            if count > 0:
                pick = random.randint(0, count-1)
                instance = list_of_vms[pick]

                LOG.info("Failure-Simulator-%s: terminating an instance %s (%s)"
                         % (self.cloud_name, instance.id, instance.public_dns_name))
                filelog(self.config.failure_log, "%s,TERMINATED,%s,%s"
                                                 % (time.time(), self.cloud_name, instance.public_dns_name))

                worker = Worker(self.config, instance)
                worker.terminate() # terminates condor daemon and shuts down instance

                # Reset the sleep time (interval before the next failure)
                self.interval = random.expovariate(self.failure_rate)
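
For context, the simulator's sleep interval comes from `random.expovariate`, which draws exponentially distributed gaps, so resampling it after each termination models failures as a Poisson process. A minimal standalone sketch (the rate value is an assumed example, not taken from the code above):

    # Standalone sketch: exponentially distributed inter-failure gaps.
    # A rate of 0.001 failures/sec gives a mean gap of ~1000 sec.
    import random

    failure_rate = 0.001          # assumed example rate (failures per second)
    intervals = [random.expovariate(failure_rate) for _ in range(5)]
    for gap in intervals:
        print("next simulated failure in %.1f sec" % gap)
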
Example #2
    def execute(self):

        filelog(self.remote_log, "Host: %s, User: %s, CMD: %s" %
                                 (self.hostname, self.user, self.command))

        while self.retry_count <= self.retry_limit:

            if os.path.isfile(self.ssh_private_key):
                context = fabric_api.settings(fabric_api.hide('running', 'stdout', 'stderr', 'warnings'),
                    user=self.user,
                    key_filename=[self.ssh_private_key],
                    disable_known_hosts=True,
                    linewise=True,
                    warn_only=True,
                    abort_on_prompts=True,
                    always_use_pty=True,
                    timeout=5,
                    use_ssh_config=True)
            else:
                LOG.error("Path to ssh private key is invalid")
                return None
            try:
                # Clear fabric's connection cache so each attempt opens a fresh SSH connection
                #print "my cache state is %s" % (str(state.connections))
                for host_key in state.connections.keys():
                    state.connections.pop(host_key)
            except Exception as ex:
                print "Exception in dealing with fabric cache %s " % (str(ex))


            if context:
                with context:
                    try:
                        fabric_api.env.host_string = self.hostname
                        results = fabric_api.run(self.command)
                        self.stdout = results.stdout
                        self.stderr = results.stderr
                        filelog(self.remote_log, "Error: %s" % (self.stderr))
                        filelog(self.remote_log, "Output: %s" % (self.stdout))
                        #print "return code from command %s is %s" % (self.command, str(results.return_code))
                        #print "stderr : %s" % (self.stderr)
                        return results.return_code
                    except Exception as exptErr:
                        self.retry_count += 1
                        errmsg = str(exptErr)
                        LOG.info("Exception in running remote command: %s" % (errmsg))
                        time.sleep(self.retry_interval)
                        LOG.info("Trying to execute remote command again. Retry: %d/%d" % (self.retry_count, self.retry_limit))

            else:
                LOG.error("Problem occurred while initializing fabric context")
                return None

        LOG.error("Could not execute remote command. Number of retries exceeded the limit")
        return None
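
One detail worth noting in the example above: `list.append` mutates in place and returns None, so `key_filename=[].append(self.ssh_private_key)` would hand None to Fabric; the corrected form passes a literal one-element list. A short illustration of the pitfall (the path is a placeholder):

    # list.append returns None, so it cannot build a one-element list inline.
    key = "/path/to/key.pem"      # placeholder path

    broken = [].append(key)       # -> None
    fixed = [key]                 # -> ['/path/to/key.pem']

    print(broken)                 # None
    print(fixed)                  # ['/path/to/key.pem']
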
Example #3
    def run(self):

        LOG.info("Activating AD. Sleep period: %d sec" % (self.interval))
        jobs = Jobs(self.config, self.master.dns)
        while not self.stop_event.is_set():
            self.stop_event.wait(self.interval)

            curr_dict = self.get_current_dict()
            jobs.update_current_list()

            pool_dict_str = "%s," % (time.time())
            for cloud_name, instance_count in curr_dict.iteritems():
                pool_dict_str += "%s:%d," % (cloud_name,instance_count)
            pool_dict_str = pool_dict_str[:-1]
            filelog(self.config.worker_pool_log, pool_dict_str)

            diff_dict = {}

            for cloud_name in curr_dict:
                up_diff =  self.desired_dict[cloud_name] - curr_dict[cloud_name]
                diff_dict[cloud_name] = up_diff

            for cloud_name in curr_dict:
                if curr_dict[cloud_name] > self.desired_dict[cloud_name]:
                    LOG.info("Downscaling in %s" % (cloud_name))
                    down_diff = - diff_dict[cloud_name]
                    candidates = self.get_cloud_instances_by_runtime_inc(cloud_name, jobs)
                    termination_list = self.select_from_candidates(cloud_name, candidates, down_diff)
                    for atuple in termination_list:
                        instance_id = atuple[0]
                        running = atuple[1]
                        instance_info = atuple[2]

                        dns = instance_info['public_dns']

                        LOG.info("AD terminated instance %s in %s" % (cloud_name, instance_id))
                        filelog(self.config.discarded_work_log, "DISCARDED,%s,%s,%s" % (cloud_name, dns, running))
                        filelog(self.config.node_log, "TERMINATED WORKER cloud: %s, instance: %s, dns: %s"
                                                      % (cloud_name, instance_id, dns))

                        LOG.info("Desired capacity (before termination) is %d" % (self.phantom_client.asg.desired_capacity))
                        Worker(self.config, instance_id, instance_info).terminate_condor(self.master.dns)
                        self.phantom_client.terminate_instance(instance_id)
                        LOG.info("Desired capacity (after termination) is %d" % (self.phantom_client.asg.desired_capacity))

                        # Figure out where to upscale

                        # Sort the diff dict by value (descending) to find the cloud lacking
                        # the most instances, e.g. [('c', 10), ('a', 3), ('b', 1)];
                        # sorting a dict's items returns a list of tuples
                        sorted_diff_dict = sorted(diff_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
                        if sorted_diff_dict[0][1] > 0:
                            cloud_to_upscale = sorted_diff_dict[0][0]


                            if cloud_to_upscale != cloud_name:
                                # create new tag :
                                current_cloud_tag = self.phantom_client.cloud_list.split(",")
                                new_cloud_tag = ""
                                new_cloud_count = 0
                                LOG.info("Current cloud tag is %s" % (self.phantom_client.cloud_list))
                                LOG.info("Current dict is %s" % (str(curr_dict)))
                                LOG.info("Diff dict is %s" % (str(diff_dict)))
                                for each_cloud in current_cloud_tag:
                                    tmp_cloud_name = each_cloud.split(":")[0]
                                    tmp_cloud_count = int(each_cloud.split(":")[1])
                                    if tmp_cloud_name == cloud_to_upscale:
                                        new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count +1)
                                        curr_dict[tmp_cloud_name] += 1
                                        diff_dict[tmp_cloud_name] -= 1
                                    elif tmp_cloud_name == cloud_name:
                                        new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count - 1)
                                        curr_dict[tmp_cloud_name] -= 1
                                        diff_dict[tmp_cloud_name] += 1
                                    else:
                                        new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count)
                                    new_cloud_count += curr_dict[tmp_cloud_name]

                                new_cloud_tag_no_comma = new_cloud_tag[:-1]
                                LOG.info("New cloud tag is %s" % (new_cloud_tag_no_comma))
                                LOG.info("New Current dict is %s" % (str(curr_dict)))
                                LOG.info("New Diff dict is %s" % (str(diff_dict)))
                                LOG.info("New Desired capacity (after recounting) is %d" % (new_cloud_count))

                                self.phantom_client.update_tags(new_cloud_tag_no_comma, new_cloud_count)
                                self.phantom_client.cloud_list = new_cloud_tag_no_comma
                                self.phantom_client.asg.set_capacity(new_cloud_count)
                            else:
                                LOG.info("Trying to upscale and downscale in the same cloud .. STOPPED")
Example #4
    def run(self):

        LOG.info("Activating OO. Sleep period: %d sec" % (self.interval))
        jobs = Jobs(self.config, self.master.dns)

        while not self.stop_event.is_set():

            self.stop_event.wait(self.interval)

            # Figure out what type of iteration this is:
            # - counter equals the limit: mark nodes offline and terminate idle instances
            # - any other iteration: only terminate idle instances already in marked_offline_list
            self.check_within_interval_counter += 1
            if self.check_within_interval_counter == self.check_within_interval_limit:
                allow_marking_offline = True
                # Reset and go back to the beginning of the cycle
                self.check_within_interval_counter = 0
                LOG.info("OI's iteration with marking nodes offline and termination of idle instances")
            else:
                allow_marking_offline = False
                LOG.info("OI's iteration with termination of previously marked instances")

            curr_dict = self.get_current_dict()
            jobs.update_current_list()

            pool_dict_str = "%s," % (time.time())
            for cloud_name, instance_count in curr_dict.iteritems():
                pool_dict_str += "%s:%d," % (cloud_name,instance_count)
            pool_dict_str = pool_dict_str[:-1]
            filelog(self.config.worker_pool_log, pool_dict_str)

            diff_dict = {}

            for cloud_name in curr_dict:
                up_diff =  self.desired_dict[cloud_name] - curr_dict[cloud_name]
                diff_dict[cloud_name] = up_diff


            for cloud_name in curr_dict:
                if curr_dict[cloud_name] > self.desired_dict[cloud_name]:
                    LOG.info("Downscaling in %s" % (cloud_name))
                    down_diff = - diff_dict[cloud_name]

                    if not allow_marking_offline:
                        # Collect the idle instances that are in self.marked_offline_list;
                        # only these instances are allowed to be terminated
                        if self.marked_offline_list:
                            candidates = self.get_candidates(cloud_name, jobs, down_diff, return_only_all_idle=True)
                            idle_candidates = []
                            for cand in candidates:
                                ins_id = cand[0]
                                if ins_id in self.marked_offline_list:
                                    idle_candidates.append(ins_id)
                                    LOG.info("Selecting idle offline instance for termination: %s" % (ins_id))
                                    self.marked_offline_list.remove(ins_id)
                        else:
                            idle_candidates = []
                    else:
                        idle_candidates, nonidle_candidates = self.get_candidates(cloud_name, jobs, down_diff, return_only_all_idle=False)

                    for instance_tuple in idle_candidates:
                        instance_id = instance_tuple[0]
                        instance_info = instance_tuple[1]
                        dns = instance_info['public_dns']
                        LOG.info("OO terminated idle instance %s in %s" % (cloud_name, instance_id))
                        filelog(self.config.discarded_work_log, "DISCARDED,%s,%s,%d" % (cloud_name, dns, 0))
                        filelog(self.config.node_log, "TERMINATED WORKER cloud: %s, instance: %s, dns: %s"
                                                      % (cloud_name, instance_id, dns))

                        LOG.info("Desired capacity (before termination) is %d" % (self.phantom_client.asg.desired_capacity))
                        Worker(self.config, instance_id, instance_info).terminate_condor(self.master.dns)
                        self.phantom_client.terminate_instance(instance_id)
                        LOG.info("Desired capacity (after termination) is %d" % (self.phantom_client.asg.desired_capacity))

                        # upscale
                        sorted_diff_dict = sorted(diff_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
                        if sorted_diff_dict[0][1] > 0:
                            cloud_to_upscale = sorted_diff_dict[0][0]


                            if cloud_to_upscale != cloud_name:
                                # create new tag :
                                current_cloud_tag = self.phantom_client.cloud_list.split(",")
                                new_cloud_tag = ""
                                new_cloud_count = 0
                                LOG.info("Current cloud tag is %s" % (self.phantom_client.cloud_list))
                                LOG.info("Current dict is %s" % (str(curr_dict)))
                                LOG.info("Diff dict is %s" % (str(diff_dict)))
                                for each_cloud in current_cloud_tag:
                                    tmp_cloud_name = each_cloud.split(":")[0]
                                    tmp_cloud_count = int(each_cloud.split(":")[1])
                                    if tmp_cloud_name == cloud_to_upscale:
                                        new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count +1)
                                        curr_dict[tmp_cloud_name] += 1
                                        diff_dict[tmp_cloud_name] -= 1
                                    elif tmp_cloud_name == cloud_name:
                                        new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count - 1)
                                        curr_dict[tmp_cloud_name] -= 1
                                        diff_dict[tmp_cloud_name] += 1
                                    else:
                                        new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count)
                                    new_cloud_count += curr_dict[tmp_cloud_name]

                                new_cloud_tag_no_comma = new_cloud_tag[:-1]
                                LOG.info("New cloud tag is %s" % (new_cloud_tag_no_comma))
                                LOG.info("New Current dict is %s" % (str(curr_dict)))
                                LOG.info("New Diff dict is %s" % (str(diff_dict)))
                                LOG.info("New Desired capacity (after recounting) is %d" % (new_cloud_count))

                                self.phantom_client.update_tags(new_cloud_tag_no_comma, new_cloud_count)
                                self.phantom_client.cloud_list = new_cloud_tag_no_comma
                                self.phantom_client.asg.set_capacity(new_cloud_count)
                            else:
                                LOG.info("Trying to upscale and downscale in the same cloud .. STOPPED")

                    if allow_marking_offline:
                        for instance_tuple in nonidle_candidates:
                            instance_id = instance_tuple[0]
                            instance_info = instance_tuple[1]
                            dns = instance_info['public_dns']
                            LOG.info("OO marked instance offline %s in %s" % (cloud_name, instance_id))
                            filelog(self.config.node_log, "OFFLINED WORKER cloud: %s, instance: %s, dns: %s"
                                                          % (cloud_name, instance_id, dns))
                            filelog(self.config.discarded_work_log, "OFFLINE,%s,%s,%d" % (cloud_name, dns, 0))

                            worker = Worker(self.config, instance_id, instance_info)
                            worker.offline(self.master.dns)  # marks the node offline (it later becomes idle and gets terminated)
                            self.marked_offline_list.append(instance_id)
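
The counter at the top of the loop implements a simple cadence: only every Nth wake-up may mark new nodes offline, and every other wake-up only reaps nodes that were already marked. A minimal sketch of that cadence (the limit value is illustrative, not the project's actual configuration):

    # Sketch of the iteration cadence: marking offline is allowed only when
    # the counter wraps around to the configured limit.
    check_limit = 3               # illustrative value of check_within_interval_limit
    counter = 0

    for iteration in range(7):
        counter += 1
        if counter == check_limit:
            counter = 0
            print("iteration %d: mark nodes offline and terminate idle ones" % iteration)
        else:
            print("iteration %d: only terminate previously marked idle nodes" % iteration)
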
Example #5
    def run(self):

        LOG.info("Activating OI. Sleep period: %d sec" % (self.interval))
        jobs = Jobs(self.config, self.master.dns)
        while not self.stop_event.is_set():
            self.stop_event.wait(self.interval)

            curr_dict = self.get_current_dict()
            jobs.update_current_list()


            pool_dict_str = "%s," % (time.time())
            for cloud_name, instance_count in curr_dict.iteritems():
                pool_dict_str += "%s:%d," % (cloud_name,instance_count)
            pool_dict_str = pool_dict_str[:-1]
            filelog(self.config.worker_pool_log, pool_dict_str)

            diff_dict = {}

            for cloud_name in curr_dict:
                up_diff =  self.desired_dict[cloud_name] - curr_dict[cloud_name]
                diff_dict[cloud_name] = up_diff

            for cloud_name in curr_dict:
                if curr_dict[cloud_name] > self.desired_dict[cloud_name]:

                    down_diff = - diff_dict[cloud_name]
                    candidates = self.get_idle_instances(cloud_name, jobs)

                    # Only terminate as many as needed
                    termination_list = candidates[:down_diff]
                    if termination_list:
                        LOG.info("Downscaling in %s" % (cloud_name))
                    else:
                        LOG.info("Not Downscaling because no idle instances found in %s" % (cloud_name))

                    for instance_tuple in termination_list:
                        instance_id = instance_tuple[0]
                        instance_info = instance_tuple[1]
                        dns = instance_info['public_dns']
                        LOG.info("OI terminated instance %s in %s" % (cloud_name, instance_id))
                        filelog(self.config.discarded_work_log, "DISCARDED,%s,%s,%d" % (cloud_name, dns, 0))
                        filelog(self.config.node_log, "TERMINATED WORKER cloud: %s, instance: %s, dns: %s"
                                                      % (cloud_name, instance_id, dns))

                        LOG.info("Desired capacity (before termination) is %d" % (self.phantom_client.asg.desired_capacity))

                        Worker(self.config, instance_id, instance_info).terminate_condor(self.master.dns)
                        self.phantom_client.terminate_instance(instance_id)

                        LOG.info("Desired capacity (after termination) is %d" % (self.phantom_client.asg.desired_capacity))

                        # upscale

                        sorted_diff_dict = sorted(diff_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
                        if sorted_diff_dict[0][1] > 0:
                            cloud_to_upscale = sorted_diff_dict[0][0]
                            if cloud_to_upscale != cloud_name:
                                # create new tag:
                                current_cloud_tag = self.phantom_client.cloud_list.split(",")
                                new_cloud_tag = ""
                                new_cloud_count = 0
                                LOG.info("Current cloud tag is %s" % (self.phantom_client.cloud_list))
                                LOG.info("Current dict is %s" % (str(curr_dict)))
                                LOG.info("Diff dict is %s" % (str(diff_dict)))
                                for each_cloud in current_cloud_tag:
                                    tmp_cloud_name = each_cloud.split(":")[0]
                                    tmp_cloud_count = int(each_cloud.split(":")[1])
                                    if tmp_cloud_name == cloud_to_upscale:
                                        new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count +1)
                                        curr_dict[tmp_cloud_name] += 1
                                        diff_dict[tmp_cloud_name] -= 1
                                    elif tmp_cloud_name == cloud_name:
                                        new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count - 1)
                                        curr_dict[tmp_cloud_name] -= 1
                                        diff_dict[tmp_cloud_name] += 1
                                    else:
                                        new_cloud_tag += "%s:%d," % (tmp_cloud_name, tmp_cloud_count)
                                    new_cloud_count += curr_dict[tmp_cloud_name]

                                new_cloud_tag_no_comma = new_cloud_tag[:-1]
                                LOG.info("New cloud tag is %s" % (new_cloud_tag_no_comma))
                                LOG.info("New Current dict is %s" % (str(curr_dict)))
                                LOG.info("New Diff dict is %s" % (str(diff_dict)))
                                LOG.info("New Desired capacity (after recounting) is %d" % (new_cloud_count))

                                self.phantom_client.update_tags(new_cloud_tag_no_comma, new_cloud_count)
                                self.phantom_client.cloud_list = new_cloud_tag_no_comma
                                self.phantom_client.asg.set_capacity(new_cloud_count)
                            else:
                                LOG.info("Trying to upscale and downscale in the same cloud .. STOPPED")
Example #6
    def __init__(self, config, clouds):
        self.config = config
        self.cloud = clouds.lookup_by_name(config.master.cloud)
        if self.cloud is None:
            LOG.error('Can\'t find a cloud "%s" specified for the master node' % (config.master.cloud))
            sys.exit(1)

        decision_made = False
        create = True
        while not decision_made:

            # choice = raw_input("Create a new master node or reuse existing? (C/R)\n")
            choice = "Create"

            if choice in ("C", "c", "Create", "create"):
                create = True
                decision_made = True
            elif choice in ("R", "r", "Reuse", "reuse"):
                create = False
                decision_made = True
            else:
                print("Invalid input. Please try again.\n")

        if create:
            LOG.info("Master node is going to be created in the cloud: %s" % (config.master.cloud))
            self.reservation = self.cloud.boot_image(config.master.image_id, count=1, type=config.master.instance_type)
            self.sleep_until_master_ready()
            self.determine_dns()
            filelog(
                self.config.node_log,
                "CREATED MASTER cloud: %s, instance: %s, dns: %s" % (self.cloud.name, self.instance_id, self.dns),
            )
        else:
            # Reusing existing master node

            LOG.info(
                'One of the existing instances in cloud "%s" is going to be reused as a master node' % (self.cloud.name)
            )
            self.cloud.connect()
            master_selected = False
            while not master_selected:

                for reservation in self.cloud.conn.get_all_instances():
                    instances = reservation.instances
                    if len(instances) != 1:
                        LOG.info('Skipping reservation "%s" since it does not contain exactly one instance' % (reservation.id))
                        continue
                    instance = instances[0]
                    printfile(self.config.node_log, "Log entries for instance %s:" % instance.id, instance.id)
                    select_instance = raw_input(
                        'Select instance "%s" of reservation "%s" in cloud "%s" as a master node? (Y/N)\n'
                        % (instance.id, reservation.id, self.cloud.name)
                    )

                    if is_yes(select_instance):
                        LOG.info(
                            "Master node has been selected. Instance: %s, Reservation: %s, Cloud: %s"
                            % (instance.id, reservation.id, self.cloud.name)
                        )
                        master_selected = True
                        self.reservation = reservation
                        self.determine_dns()

                        filelog(
                            self.config.node_log,
                            "REUSED MASTER cloud: %s, reservation: %s, instance: %s, dns: %s"
                            % (self.cloud.name, self.reservation.id, self.instance_id, self.dns),
                        )

                        break
                if not master_selected:
                    print("Master node has not been selected. Looping through the list of existing reservations again.")