def _check_sla(self, hostnames, grouping_function, percentage, duration):
    """Check if the provided list of hosts passes the job uptime SLA check.

    This is an all-or-nothing check, meaning that all provided hosts must pass their job
    SLA check for the maintenance to proceed.

    :param hostnames: list of host names to check SLA for
    :type hostnames: list of strings
    :param grouping_function: grouping function to apply to the given hosts
    :type grouping_function: function
    :param percentage: SLA uptime percentage override
    :type percentage: float
    :param duration: SLA uptime duration override
    :type duration: twitter.common.quantity.Amount
    :return: the hosts considered unsafe; an empty set when the probe yields no groups,
             or every given host when the probe yields more than one group
    :rtype: set of unsafe hosts
    """
    vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
    host_groups = vector.probe_hosts(percentage, duration.as_(Time.SECONDS), grouping_function)

    # Nothing came back from the probe, so there is nothing to flag.
    if not host_groups:
        return set()

    # Given that maintenance is performed 1 group at a time, any result longer than 1 group
    # should be considered a batch failure.
    if len(host_groups) > 1:
        # The 1-tuple keeps the formatting correct even if hostnames is itself a tuple;
        # a bare `% hostnames` would unpack it as the argument list and raise TypeError.
        log.error(
            "Illegal multiple groups detected in SLA results. Skipping hosts: %s"
            % (hostnames,)
        )
        return set(hostnames)

    results, unsafe_hostnames = format_sla_results(host_groups, unsafe_only=True)
    if results:
        print_results(results)
    return unsafe_hostnames
def _check_sla(self, hostnames, grouping_function, percentage, duration):
    """Check if the provided list of hosts passes the job uptime SLA check.

    This is an all-or-nothing check, meaning that all provided hosts must pass their job
    SLA check for the maintenance to proceed.

    :param hostnames: list of host names to check SLA for
    :type hostnames: list of strings
    :param grouping_function: grouping function to apply to the given hosts
    :type grouping_function: function
    :param percentage: SLA uptime percentage override
    :type percentage: float
    :param duration: SLA uptime duration override
    :type duration: twitter.common.quantity.Amount
    :return: hosts that failed the check (empty set if the probe returned nothing;
             all of the given hosts if more than one group was returned)
    :rtype: set of unsafe hosts
    """
    vector = self._client.sla_get_safe_domain_vector(
        self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
    host_groups = vector.probe_hosts(
        percentage, duration.as_(Time.SECONDS), grouping_function)

    if not host_groups:
        # An empty probe result means no host was flagged.
        return set()

    # Given that maintenance is performed 1 group at a time, any result longer than 1 group
    # should be considered a batch failure.
    if len(host_groups) > 1:
        # Format via an explicit 1-tuple: `'%s' % hostnames` breaks when hostnames is a
        # tuple, because `%` would treat its items as separate format arguments.
        log.error(
            'Illegal multiple groups detected in SLA results. Skipping hosts: %s'
            % (hostnames,))
        return set(hostnames)

    results, unsafe_hostnames = format_sla_results(host_groups, unsafe_only=True)
    if results:
        print_results(results)
    return unsafe_hostnames
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
              [--filename=FILENAME]
              [--grouping=GROUPING]
              [--hosts=HOSTS]
              [--min_job_instance_count=COUNT]
              cluster percentage duration

    Probes individual hosts with respect to their job SLA.
    Specifically, given a host, outputs all affected jobs with their projected SLAs
    if the host goes down. In addition, if a job's projected SLA does not clear
    the specified limits suggests the approximate time when that job reaches its SLA.

    Output format:
    HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

    where:
    HOST - host being probed.
    JOB - job that has tasks running on the host being probed.
    PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
    SAFE? - PREDICTED_SLA >= percentage
    PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
    """
    # NOTE(review): the docstring opens with "usage:" and looks like it doubles as the
    # command's help text, so its wording is left untouched - confirm before editing it.
    opts = app.get_options()

    # Parse the inputs in a fixed order so the first malformed argument is the one reported.
    threshold = parse_sla_percentage(percentage)
    window = parse_time(duration)
    hostnames = parse_hostnames(opts.filename, opts.hosts)
    # Called for its side effect only (presumably aborts on an unknown grouping - verify);
    # the raw option value is what gets handed to probe_hosts below.
    get_grouping_or_die(opts.grouping)

    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    vector = api.sla_get_safe_domain_vector(opts.min_instance_count, hostnames)
    groups = vector.probe_hosts(threshold, window.as_(Time.SECONDS), opts.grouping)

    # Only the first element of the formatted pair is printed; the second is not used here.
    rows, _ = format_sla_results(groups)
    print_results(rows)
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
              [--filename=FILENAME]
              [--grouping=GROUPING]
              [--hosts=HOSTS]
              [--min_job_instance_count=COUNT]
              cluster percentage duration

    Probes individual hosts with respect to their job SLA.
    Specifically, given a host, outputs all affected jobs with their projected SLAs
    if the host goes down. In addition, if a job's projected SLA does not clear
    the specified limits suggests the approximate time when that job reaches its SLA.

    Output format:
    HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

    where:
    HOST - host being probed.
    JOB - job that has tasks running on the host being probed.
    PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
    SAFE? - PREDICTED_SLA >= percentage
    PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
    """
    # NOTE(review): docstring text kept verbatim - it reads like CLI usage output; confirm
    # whether the command framework prints it before rewording.
    cli_options = app.get_options()

    # Argument parsing stays in the original order: percentage, duration, hosts, grouping.
    required_percentage = parse_sla_percentage(percentage)
    required_duration = parse_time(duration)
    target_hosts = parse_hostnames(cli_options.filename, cli_options.hosts)
    # Return value intentionally ignored; probe_hosts is given the option value itself.
    get_grouping_or_die(cli_options.grouping)

    client = AuroraClientAPI(CLUSTERS[cluster], cli_options.verbosity)
    domain_vector = client.sla_get_safe_domain_vector(
        cli_options.min_instance_count, target_hosts)
    duration_secs = required_duration.as_(Time.SECONDS)
    probed_groups = domain_vector.probe_hosts(
        required_percentage, duration_secs, cli_options.grouping)

    # format_sla_results returns a pair; only its first item is printed.
    formatted, _ = format_sla_results(probed_groups)
    print_results(formatted)