Exemplo n.º 1
0
    def _check_sla(self, hostnames, grouping_function, percentage, duration):
        """Check if the provided list of hosts passes the job uptime SLA check.

        This is an all-or-nothing check, meaning that all provided hosts must pass their job
        SLA check for the maintenance to proceed.

        :param hostnames: list of host names to check SLA for
        :type hostnames: list of strings
        :param grouping_function: grouping function to apply to the given hosts
        :type grouping_function: function
        :param percentage: SLA uptime percentage override
        :type percentage: float
        :param duration: SLA uptime duration override
        :type duration: twitter.common.quantity.Amount
        :return: the unsafe host names reported by format_sla_results; every given host name
                 when the probe returns more than one group; an empty set when the probe
                 returns nothing
        :rtype: set of unsafe hosts
        """
        vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
        host_groups = vector.probe_hosts(percentage, duration.as_(Time.SECONDS), grouping_function)

        # An empty probe result means there is nothing to report as unsafe.
        if not host_groups:
            return set()

        # Given that maintenance is performed 1 group at a time, any result longer than 1 group
        # should be considered a batch failure.
        if len(host_groups) > 1:
            # Wrap hostnames in a 1-tuple so that a tuple of host names is rendered as a single
            # value instead of being unpacked as separate format arguments.
            log.error("Illegal multiple groups detected in SLA results. Skipping hosts: %s" % (hostnames,))
            return set(hostnames)

        results, unsafe_hostnames = format_sla_results(host_groups, unsafe_only=True)
        if results:
            print_results(results)

        return unsafe_hostnames
Exemplo n.º 2
0
def get_locks(cluster):
  """usage: get_locks cluster

  Prints all context/operation locks in the scheduler.
  """
  options = app.get_options()
  api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  resp = api.get_locks()
  check_and_log_response(resp)

  # Render each lock's attribute dict (via vars) with a 2-space pretty printer, then emit
  # all of them as a single result entry separated by ",\n".
  printer = pprint.PrettyPrinter(indent=2)
  rendered_locks = (printer.pformat(vars(lock)) for lock in resp.result.getLocksResult.locks)
  print_results([',\n'.join(rendered_locks)])
Exemplo n.º 3
0
def get_locks(cluster):
  """usage: get_locks cluster

  Prints all context/operation locks in the scheduler.
  """
  options = app.get_options()
  resp = AuroraClientAPI(CLUSTERS[cluster], options.verbosity).get_locks()
  check_and_log_response(resp)

  formatter = pprint.PrettyPrinter(indent=2)

  # Each lock is shown as the pretty-printed dict of its attributes; the whole listing is
  # handed to print_results as one ",\n"-joined entry.
  listing = ',\n'.join(
      formatter.pformat(vars(entry)) for entry in resp.result.getLocksResult.locks)
  print_results([listing])
Exemplo n.º 4
0
    def _check_sla(self, hostnames, grouping_function, percentage, duration):
        """Check if the provided list of hosts passes the job uptime SLA check.

        This is an all-or-nothing check, meaning that all provided hosts must pass their job
        SLA check for the maintenance to proceed.

        :param hostnames: list of host names to check SLA for
        :type hostnames: list of strings
        :param grouping_function: grouping function to apply to the given hosts
        :type grouping_function: function
        :param percentage: SLA uptime percentage override
        :type percentage: float
        :param duration: SLA uptime duration override
        :type duration: twitter.common.quantity.Amount
        :return: the unsafe host names reported by format_sla_results; every given host name
                 when the probe returns more than one group; an empty set when the probe
                 returns nothing
        :rtype: set of unsafe hosts
        """
        vector = self._client.sla_get_safe_domain_vector(
            self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
        host_groups = vector.probe_hosts(percentage,
                                         duration.as_(Time.SECONDS),
                                         grouping_function)

        # Nothing came back from the probe, so no host is reported as unsafe.
        if not host_groups:
            return set()

        # Given that maintenance is performed 1 group at a time, any result longer than 1 group
        # should be considered a batch failure.
        if len(host_groups) > 1:
            # The 1-tuple keeps a tuple-typed hostnames from being unpacked as several
            # format arguments, which would raise TypeError instead of logging.
            log.error(
                'Illegal multiple groups detected in SLA results. Skipping hosts: %s'
                % (hostnames,))
            return set(hostnames)

        results, unsafe_hostnames = format_sla_results(host_groups,
                                                       unsafe_only=True)
        if results:
            print_results(results)

        return unsafe_hostnames
Exemplo n.º 5
0
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
    options = app.get_options()

    # Convert the positional arguments and host options into their parsed forms.
    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    hosts = parse_hostnames(options.filename, options.hosts)
    # Return value is intentionally unused; presumably this aborts on an unknown
    # grouping name -- confirm against get_grouping_or_die.
    get_grouping_or_die(options.grouping)

    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    vector = api.sla_get_safe_domain_vector(options.min_instance_count, hosts)
    groups = vector.probe_hosts(
        sla_percentage,
        sla_duration.as_(Time.SECONDS),
        options.grouping,
    )

    # Only the formatted lines are printed; the second element of the pair is ignored.
    lines, _ = format_sla_results(groups)
    print_results(lines)
Exemplo n.º 6
0
def sla_probe_hosts(cluster, percentage, duration):
  """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
  options = app.get_options()

  # Parsed SLA limits and the set of hosts to probe.
  sla_percentage = parse_sla_percentage(percentage)
  sla_duration = parse_time(duration)
  hosts = parse_hostnames(options.filename, options.hosts)
  # Called for its effect only; the result is not used in this function.
  get_grouping_or_die(options.grouping)

  client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  vector = client.sla_get_safe_domain_vector(options.min_instance_count, hosts)
  groups = vector.probe_hosts(
      sla_percentage,
      sla_duration.as_(Time.SECONDS),
      options.grouping)

  # format_sla_results yields a pair; only its first element is printed here.
  report, _ = format_sla_results(groups)
  print_results(report)
Exemplo n.º 7
0
def sla_list_safe_domain(cluster, percentage, duration):
    """usage: sla_list_safe_domain
            [--exclude_file=FILENAME]
            [--exclude_hosts=HOSTS]
            [--grouping=GROUPING]
            [--include_file=FILENAME]
            [--include_hosts=HOSTS]
            [--list_jobs]
            [--min_job_instance_count=COUNT]
            [--override_jobs=FILENAME]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill
  tasks without violating their job SLA. The SLA is defined as a pair of
  percentage and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.

  NOTE: if --grouping option is specified and is set to anything other than
        default (by_host) the results will be processed and filtered based
        on the grouping function on a all-or-nothing basis. In other words,
        the group is 'safe' IFF it is safe to kill tasks on all hosts in the
        group at the same time.
  """

    def parse_jobs_file(filename):
        # Builds a {job key: JobUpTimeLimit} mapping from the override file. Every non-blank
        # line must hold exactly three whitespace-separated tokens: job path, percentage
        # and duration.
        limits = {}
        with open(filename, "r") as overrides:
            for line in overrides:
                tokens = line.split()
                if not tokens:
                    # Blank or whitespace-only line.
                    continue

                if len(tokens) != 3:
                    die("Invalid line in %s:%s" % (filename, line))
                job_key = AuroraJobKey.from_path(tokens[0])
                limit_percentage = parse_sla_percentage(tokens[1])
                limit_duration_secs = parse_time(tokens[2]).as_(Time.SECONDS)
                limits[job_key] = JobUpTimeLimit(
                    job=job_key,
                    percentage=limit_percentage,
                    duration_secs=limit_duration_secs,
                )
        return limits

    options = app.get_options()

    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)

    exclude_hosts = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
    include_hosts = parse_hostnames_optional(options.include_hosts, options.include_filename)
    if options.override_filename:
        override_jobs = parse_jobs_file(options.override_filename)
    else:
        override_jobs = {}
    # Called for its effect only; the result is not used in this function.
    get_grouping_or_die(options.grouping)

    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    vector = api.sla_get_safe_domain_vector(options.min_instance_count, include_hosts)
    groups = vector.get_safe_hosts(
        sla_percentage,
        sla_duration.as_(Time.SECONDS),
        override_jobs,
        options.grouping,
    )

    results = []
    for group in groups:
        for host in sorted(group.keys()):
            if exclude_hosts and host in exclude_hosts:
                continue

            if not options.list_jobs:
                # Host-only listing.
                results.append("%s" % host)
                continue

            # One tab-separated line per job entry recorded for this host.
            job_lines = [
                "%s\t%s\t%.2f\t%d" % (host, d.job.to_path(), d.percentage, d.duration_secs)
                for d in sorted(group[host])
            ]
            results.append("\n".join(job_lines))

    print_results(results)
Exemplo n.º 8
0
def sla_list_safe_domain(cluster, percentage, duration):
  """usage: sla_list_safe_domain
            [--exclude_file=FILENAME]
            [--exclude_hosts=HOSTS]
            [--grouping=GROUPING]
            [--include_file=FILENAME]
            [--include_hosts=HOSTS]
            [--list_jobs]
            [--min_job_instance_count=COUNT]
            [--override_jobs=FILENAME]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill
  tasks without violating their job SLA. The SLA is defined as a pair of
  percentage and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.

  NOTE: if --grouping option is specified and is set to anything other than
        default (by_host) the results will be processed and filtered based
        on the grouping function on a all-or-nothing basis. In other words,
        the group is 'safe' IFF it is safe to kill tasks on all hosts in the
        group at the same time.
  """
  def parse_jobs_file(filename):
    # Reads per-job SLA overrides: each non-blank line is "<job path> <percentage> <duration>"
    # and becomes one JobUpTimeLimit keyed by its job key.
    parsed = {}
    with open(filename, 'r') as overrides:
      for line in overrides:
        tokens = line.split()
        if not tokens:
          # Whitespace-only lines carry no override.
          continue

        if len(tokens) != 3:
          die('Invalid line in %s:%s' % (filename, line))
        job_path, raw_percentage, raw_duration = tokens[0], tokens[1], tokens[2]
        job_key = AuroraJobKey.from_path(job_path)
        parsed[job_key] = JobUpTimeLimit(
            job=job_key,
            percentage=parse_sla_percentage(raw_percentage),
            duration_secs=parse_time(raw_duration).as_(Time.SECONDS)
        )
    return parsed

  options = app.get_options()

  sla_percentage = parse_sla_percentage(percentage)
  sla_duration = parse_time(duration)

  exclude_hosts = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
  include_hosts = parse_hostnames_optional(options.include_hosts, options.include_filename)
  override_jobs = {}
  if options.override_filename:
    override_jobs = parse_jobs_file(options.override_filename)
  # Return value is not used; the call is made for its effect only.
  get_grouping_or_die(options.grouping)

  client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  vector = client.sla_get_safe_domain_vector(options.min_instance_count, include_hosts)
  groups = vector.get_safe_hosts(
      sla_percentage,
      sla_duration.as_(Time.SECONDS),
      override_jobs,
      options.grouping)

  results = []
  for group in groups:
    for host in sorted(group.keys()):
      if exclude_hosts and host in exclude_hosts:
        continue

      if options.list_jobs:
        # One tab-separated line per job entry recorded for this host.
        per_job = []
        for d in sorted(group[host]):
          per_job.append(
              '%s\t%s\t%.2f\t%d' % (host, d.job.to_path(), d.percentage, d.duration_secs))
        results.append('\n'.join(per_job))
      else:
        results.append('%s' % host)

  print_results(results)