Exemplo n.º 1
0
def test_bad():
    """Check that parse_time rejects malformed strings and non-string inputs."""
    # Strings that are not valid time specifications are expected to raise InvalidTime.
    for malformed in ('foo', 'dhms', '1s30d', 'a b c d', '  ', '1s2s3s'):
        with pytest.raises(InvalidTime):
            parse_time(malformed)

    # Inputs that are not strings at all are expected to raise TypeError.
    for non_string in (123, type):
        with pytest.raises(TypeError):
            parse_time(non_string)
Exemplo n.º 2
0
def test_bad():
  """Check that parse_time rejects malformed strings and non-string inputs."""
  # Each entry pairs the expected exception type with the inputs that must trigger it.
  expected_failures = (
    (InvalidTime, ('foo', 'dhms', '1s30d', 'a b c d', '  ', '1s2s3s')),
    (TypeError, (123, type)),
  )
  for error_type, rejected_inputs in expected_failures:
    for rejected in rejected_inputs:
      with pytest.raises(error_type):
        parse_time(rejected)
Exemplo n.º 3
0
def parse_and_validate_sla_drain_default(options):
    """Parses the default host SLA drain settings (percentage, duration, timeout).

    :param options: command line options
    :type options: list of app.option
    :rtype: a 3-tuple of: default percentage (as returned by parse_sla_percentage), and the
            default duration and timeout, each converted with ``.as_(Time.SECONDS)``
    """
    # Tuple elements are evaluated left to right, so the options are parsed in the order
    # percentage, duration, timeout.
    return (
        parse_sla_percentage(options.default_percentage),
        parse_time(options.default_duration).as_(Time.SECONDS),
        parse_time(options.timeout).as_(Time.SECONDS),
    )
Exemplo n.º 4
0
def parse_and_validate_sla_drain_default(options):
  """Parses the default host SLA drain settings (percentage, duration, timeout).

  :param options: command line options
  :type options: list of app.option
  :rtype: a 3-tuple of: default percentage (as returned by parse_sla_percentage), and the
          default duration and timeout, each converted with ``.as_(Time.SECONDS)``
  """
  # Tuple elements are evaluated left to right, so the options are parsed in the order
  # percentage, duration, timeout.
  return (
      parse_sla_percentage(options.default_percentage),
      parse_time(options.default_duration).as_(Time.SECONDS),
      parse_time(options.timeout).as_(Time.SECONDS),
  )
Exemplo n.º 5
0
def parse_and_validate_sla_overrides(options, hostnames):
  """Parses and validates the host SLA override options (percentage, duration, reason).

  The three override options must be supplied together or not at all; a partial set is
  reported through die(). When a reason is supplied, an admin message describing the
  override is logged at WARNING level.

  :param options: command line options
  :type options: list of app.option
  :param hostnames: host names override is issued to
  :type hostnames: list of string
  :rtype: a 2-tuple of (percentage, duration): the parsed override values, falling back to
          SLA_UPTIME_PERCENTAGE_LIMIT / SLA_UPTIME_DURATION_LIMIT when no truthy override
          value was parsed
  """
  supplied = [bool(options.percentage), bool(options.duration), bool(options.reason)]
  # "Some but not all" is the only invalid combination: any() and all() disagree exactly then.
  if any(supplied) != all(supplied):
    die('All --override_* options are required when attempting to override default SLA values.')

  percentage = parse_sla_percentage(options.percentage) if options.percentage else None
  duration = parse_time(options.duration) if options.duration else None
  if options.reason:
    log_admin_message(
      logging.WARNING,
      'Default SLA values (percentage: %s, duration: %s) are overridden for the following '
      'hosts: %s. New percentage: %s, duration: %s, override reason: %s' % (
        SLA_UPTIME_PERCENTAGE_LIMIT,
        SLA_UPTIME_DURATION_LIMIT,
        hostnames,
        percentage,
        duration,
        options.reason))

  return percentage or SLA_UPTIME_PERCENTAGE_LIMIT, duration or SLA_UPTIME_DURATION_LIMIT
Exemplo n.º 6
0
def sla_probe_hosts(cluster, percentage, duration):
  """usage: sla_probe_hosts
            [--filename=FILENAME]
            [--grouping=GROUPING]
            [--hosts=HOSTS]
            [--min_job_instance_count=COUNT]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
  options = app.get_options()

  # Parse and validate every input before contacting the cluster.
  percentage_limit = parse_sla_percentage(percentage)
  duration_limit = parse_time(duration)
  hostnames = parse_hostnames(options.filename, options.hosts)
  get_grouping_or_die(options.grouping)

  admin_client = make_admin_client(cluster)
  domain_vector = admin_client.sla_get_safe_domain_vector(options.min_instance_count, hostnames)
  probed_groups = domain_vector.probe_hosts(
      percentage_limit,
      duration_limit.as_(Time.SECONDS),
      options.grouping)

  # format_sla_results returns a pair; only its first element is printed.
  formatted, _unused = format_sla_results(probed_groups)
  print_results(formatted)
Exemplo n.º 7
0
def parse_and_validate_sla_overrides(options, hostnames):
    """Parses and validates the host SLA override options (percentage, duration, reason).

    The three override options must be supplied together or not at all; a partial set is
    reported through die(). When a reason is supplied, an admin message describing the
    override is logged at WARNING level.

    :param options: command line options
    :type options: list of app.option
    :param hostnames: host names override is issued to
    :type hostnames: list of string
    :rtype: a 2-tuple of (percentage, duration): the parsed override values, falling back to
            SLA_UPTIME_PERCENTAGE_LIMIT / SLA_UPTIME_DURATION_LIMIT when no truthy override
            value was parsed
    """
    supplied = [bool(options.percentage), bool(options.duration), bool(options.reason)]
    # "Some but not all" is the only invalid combination: any() and all() disagree exactly then.
    if any(supplied) != all(supplied):
        die('All --override_* options are required when attempting to override default SLA values.'
            )

    percentage = parse_sla_percentage(
        options.percentage) if options.percentage else None
    duration = parse_time(options.duration) if options.duration else None
    if options.reason:
        log_admin_message(
            logging.WARNING,
            'Default SLA values (percentage: %s, duration: %s) are overridden for the following '
            'hosts: %s. New percentage: %s, duration: %s, override reason: %s'
            % (SLA_UPTIME_PERCENTAGE_LIMIT, SLA_UPTIME_DURATION_LIMIT,
               hostnames, percentage, duration, options.reason))

    return percentage or SLA_UPTIME_PERCENTAGE_LIMIT, duration or SLA_UPTIME_DURATION_LIMIT
Exemplo n.º 8
0
def gc(args, options):
  """Garbage collect task(s) and task metadata.

    Usage: thermos gc [options] [task_id1 task_id2 ...]

    If tasks specified, restrict garbage collection to only those tasks,
    otherwise all tasks are considered.  The optional constraints are still
    honored.
  """
  print('Analyzing root at %s' % options.root)

  # Keyword arguments for the default collection policy; limits are only set when given.
  policy_kwargs = {}
  if options.max_age is not None:
    policy_kwargs['max_age'] = parse_time(options.max_age)
  if options.max_space is not None:
    policy_kwargs['max_space'] = parse_data(options.max_space)
  if options.max_tasks is not None:
    policy_kwargs['max_tasks'] = int(options.max_tasks)
  policy_kwargs.update(
      include_metadata=not options.keep_metadata,
      include_logs=not options.keep_logs,
      verbose=True,
      logger=print)

  if args:
    gc_tasks = list(tasks_from_re(args, state='finished'))
  else:
    print('No task ids specified, using default collector.')
    policy = GarbageCollectionPolicy(get_path_detector(), **policy_kwargs)
    gc_tasks = [(task.checkpoint_root, task.task_id) for task in policy.run()]

  if not gc_tasks:
    print('No tasks to garbage collect.  Exiting')
    return

  def run_or_describe(action, *action_args):
    # In dry-run mode only report what would have been called.
    if options.dryrun:
      print('    would run %s%r' % (action.__name__, action_args))
    else:
      action(*action_args)

  def disposition(keep):
    return 'keeping' if keep else 'deleting'

  # --force skips the interactive confirmation; an empty answer counts as 'N'.
  answer = 'y' if options.force else (raw_input("Continue [y/N]? ") or 'N')
  if answer.lower() != 'y':
    print('Cancelling gc.')
    return

  print('Running gc...')
  for checkpoint_root, task_id in gc_tasks:
    collector = TaskGarbageCollector(checkpoint_root, task_id)
    print('  Task %s ' % task_id, end='')
    print('data (%s) ' % disposition(options.keep_data), end='')
    print('logs (%s) ' % disposition(options.keep_logs), end='')
    print('metadata (%s) ' % disposition(options.keep_metadata))
    if not options.keep_data:
      run_or_describe(collector.erase_data)
    if not options.keep_logs:
      run_or_describe(collector.erase_logs)
    if not options.keep_metadata:
      run_or_describe(collector.erase_metadata)
    print('done.')
Exemplo n.º 9
0
def test_basic():
    """Check parse_time against a table of representative inputs."""
    cases = (
        ('', Amount(0, Time.SECONDS)),
        ('1s', Amount(1, Time.SECONDS)),
        ('2m60s', Amount(3, Time.MINUTES)),
        ('1d', Amount(1, Time.DAYS)),
        ('1d1H3600s', Amount(26, Time.HOURS)),
        ('1d-1s', Amount(86399, Time.SECONDS)),
    )
    for text, expected in cases:
        assert parse_time(text) == expected
Exemplo n.º 10
0
def test_basic():
  """Check parse_time against a table of representative inputs."""
  cases = (
    ('', Amount(0, Time.SECONDS)),
    ('1s', Amount(1, Time.SECONDS)),
    ('2m60s', Amount(3, Time.MINUTES)),
    ('1d', Amount(1, Time.DAYS)),
    ('1d1H3600s', Amount(26, Time.HOURS)),
    ('1d-1s', Amount(86399, Time.SECONDS)),
  )
  for text, expected in cases:
    assert parse_time(text) == expected
Exemplo n.º 11
0
def perform_maintenance_hosts(cluster):
    """usage: perform_maintenance_hosts {--filename=filename | --hosts=hosts}
                                      [--post_drain_script=path]
                                      [--grouping=function]
                                      [--override_percentage=percentage]
                                      [--override_duration=duration]
                                      [--override_reason=reason]
                                      [--unsafe_hosts_file=unsafe_hosts_filename]
                                      cluster

  Asks the scheduler to remove any running tasks from the machine and remove it
  from service temporarily, perform some action on them, then return the machines
  to service.
  """
    options = app.get_options()
    drainable_hosts = parse_hostnames(options.filename, options.hosts)
    get_grouping_or_die(options.grouping)

    # The three override options are all-or-nothing: any() and all() disagree exactly
    # when only some of them were supplied.
    override_flags = [bool(options.percentage), bool(options.duration), bool(options.reason)]
    if any(override_flags) != all(override_flags):
        die("All --override_* options are required when attempting to override default SLA values.")

    percentage = None
    if options.percentage:
        percentage = parse_sla_percentage(options.percentage)

    duration = None
    if options.duration:
        duration = parse_time(options.duration)

    if options.reason:
        override_notice = (
            "Default SLA values (percentage: %s, duration: %s) are overridden for the following "
            "hosts: %s. New percentage: %s, duration: %s, override reason: %s"
        ) % (
            HostMaintenance.SLA_UPTIME_PERCENTAGE_LIMIT,
            HostMaintenance.SLA_UPTIME_DURATION_LIMIT,
            drainable_hosts,
            percentage,
            duration,
            options.reason,
        )
        log_admin_message(logging.WARNING, override_notice)

    # The post-drain script is parsed before the maintenance object is created.
    drained_callback = parse_script(options.post_drain_script)

    maintenance = HostMaintenance(CLUSTERS[cluster], options.verbosity)
    maintenance.perform_maintenance(
        drainable_hosts,
        grouping_function=options.grouping,
        callback=drained_callback,
        percentage=percentage,
        duration=duration,
        output_file=options.unsafe_hosts_filename,
    )
Exemplo n.º 12
0
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
            [--filename=filename]
            [--hosts=hosts]
            cluster percentage duration

  Probes individual hosts with respect to their job SLA.
  Specifically, given a host, outputs all affected jobs with their projected SLAs
  if the host goes down. In addition, if a job's projected SLA does not clear
  the specified limits suggests the approximate time when that job reaches its SLA.

  Output format:
  HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

  where:
  HOST - host being probed.
  JOB - job that has tasks running on the host being probed.
  PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
  SAFE? - PREDICTED_SLA >= percentage
  PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
  """
    options = app.get_options()

    percentage_limit = parse_sla_percentage(percentage)
    duration_limit = parse_time(duration)
    hosts = parse_hosts(options.filename, options.hosts)

    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    vector = client.sla_get_safe_domain_vector(hosts)
    probed_hosts = vector.probe_hosts(percentage_limit, duration_limit.as_(Time.SECONDS), hosts)

    def format_job_row(host, details):
        # One tab-separated row per job; a missing safe-in time is shown as "n/a".
        return "%s\t%s\t%.2f\t%s\t%s" % (
            host,
            details.job.to_path(),
            details.predicted_percentage,
            details.safe,
            "n/a" if details.safe_in_secs is None else details.safe_in_secs,
        )

    # One newline-joined entry per host, hosts and their jobs both in sorted order.
    results = [
        "\n".join([format_job_row(host, details) for details in sorted(job_details)])
        for host, job_details in sorted(probed_hosts.items())
    ]

    print_results(results)
Exemplo n.º 13
0
    def parse_jobs_file(filename):
        """Read per-job SLA overrides from a file into a {job key: JobUpTimeLimit} dict.

        Each non-blank line must hold exactly three whitespace-separated tokens:
        job path, percentage and duration. Any other token count is reported via die().
        """
        limits_by_job = {}
        with open(filename, 'r') as overrides_file:
            for raw_line in overrides_file:
                fields = raw_line.split()
                if not fields:
                    # Blank or whitespace-only line.
                    continue

                if len(fields) != 3:
                    die('Invalid line in %s:%s' % (filename, raw_line))
                job_key = AuroraJobKey.from_path(fields[0])
                limits_by_job[job_key] = JobUpTimeLimit(
                    job=job_key,
                    percentage=parse_sla_percentage(fields[1]),
                    duration_secs=parse_time(fields[2]).as_(Time.SECONDS))
        return limits_by_job
Exemplo n.º 14
0
  def parse_jobs_file(filename):
    """Read per-job SLA overrides from a file into a {job key: JobUpTimeLimit} dict.

    Each non-blank line must hold exactly three whitespace-separated tokens:
    job path, percentage and duration. Any other token count is reported via die().
    """
    limits_by_job = {}
    with open(filename, 'r') as overrides_file:
      for raw_line in overrides_file:
        fields = raw_line.split()
        if not fields:
          # Blank or whitespace-only line.
          continue

        if len(fields) != 3:
          die('Invalid line in %s:%s' % (filename, raw_line))
        job_key = AuroraJobKey.from_path(fields[0])
        limits_by_job[job_key] = JobUpTimeLimit(
            job=job_key,
            percentage=parse_sla_percentage(fields[1]),
            duration_secs=parse_time(fields[2]).as_(Time.SECONDS)
        )
    return limits_by_job
Exemplo n.º 15
0
def gc(args, options):
  """Garbage collect task(s) and task metadata.

    Usage: thermos gc [options] [task_id1 task_id2 ...]

    If tasks specified, restrict garbage collection to only those tasks,
    otherwise all tasks are considered.  The optional constraints are still
    honored.

    Options:
      --max_age=AGE		Max age in quasi-human readable form, e.g. --max_age=2d5h,
                                format *d*h*m*s [default: skip]
      --max_tasks=NUM		Max number of tasks to keep [default: skip]
      --max_space=SPACE		Max space to allow for tasks [default: skip]
      --[keep/delete-]metadata	Garbage collect metadata [default: keep]
      --[keep/delete-]logs	Garbage collect logs [default: keep]
      --[keep/delete-]data	Garbage collect data [default: keep]
                                WARNING: Do NOT do this if your sandbox is $HOME.
      --force			Perform garbage collection without confirmation [default: false]
      --dryrun			Don't actually run garbage collection [default: false]
  """
  print('Analyzing root at %s' % options.root)

  # Keyword arguments for the default collector; limits are only set when given.
  collector_kwargs = {}
  if options.max_age is not None:
    collector_kwargs['max_age'] = parse_time(options.max_age)
  if options.max_space is not None:
    collector_kwargs['max_space'] = parse_data(options.max_space)
  if options.max_tasks is not None:
    collector_kwargs['max_tasks'] = int(options.max_tasks)
  collector_kwargs.update(
      include_data=not options.keep_data,
      include_metadata=not options.keep_metadata,
      include_logs=not options.keep_logs,
      verbose=True,
      logger=print)
  tgc = TaskGarbageCollector(root=options.root)

  if args:
    gc_tasks = tasks_from_re(args, options.root, state='finished')
  else:
    print('No task ids specified, using default collector.')
    gc_tasks = [task.task_id for task in DefaultCollector(tgc, **collector_kwargs).run()]

  if not gc_tasks:
    print('No tasks to garbage collect.  Exiting')
    return

  def run_or_describe(action, *action_args):
    # In dry-run mode only report what would have been called.
    if options.dryrun:
      print('    would run %s%r' % (action.__name__, action_args))
    else:
      action(*action_args)

  def disposition(keep):
    return 'keeping' if keep else 'deleting'

  # --force skips the interactive confirmation; an empty answer counts as 'N'.
  answer = 'y' if options.force else (raw_input("Continue [y/N]? ") or 'N')
  if answer.lower() != 'y':
    print('Cancelling gc.')
    return

  print('Running gc...')
  # A fresh collector is created for the erase phase, as before.
  tgc = TaskGarbageCollector(root=options.root)
  for task in gc_tasks:
    print('  Task %s ' % task, end='')
    print('data (%s) ' % disposition(options.keep_data), end='')
    print('logs (%s) ' % disposition(options.keep_logs), end='')
    print('metadata (%s) ' % disposition(options.keep_metadata))
    if not options.keep_data:
      run_or_describe(tgc.erase_data, task)
    if not options.keep_logs:
      run_or_describe(tgc.erase_logs, task)
    if not options.keep_metadata:
      run_or_describe(tgc.erase_metadata, task)
    print('done.')
Exemplo n.º 16
0
def sla_list_safe_domain(cluster, percentage, duration):
  """usage: sla_list_safe_domain
            [--exclude_file=FILENAME]
            [--exclude_hosts=HOSTS]
            [--grouping=GROUPING]
            [--include_file=FILENAME]
            [--include_hosts=HOSTS]
            [--list_jobs]
            [--min_job_instance_count=COUNT]
            [--override_jobs=FILENAME]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill
  tasks without violating their job SLA. The SLA is defined as a pair of
  percentage and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.

  NOTE: if --grouping option is specified and is set to anything other than
        default (by_host) the results will be processed and filtered based
        on the grouping function on a all-or-nothing basis. In other words,
        the group is 'safe' IFF it is safe to kill tasks on all hosts in the
        group at the same time.
  """
  def parse_jobs_file(filename):
    # Maps each job key to its JobUpTimeLimit; every non-blank line must hold exactly
    # three whitespace-separated tokens (job path, percentage, duration).
    limits_by_job = {}
    with open(filename, 'r') as overrides_file:
      for raw_line in overrides_file:
        fields = raw_line.split()
        if not fields:
          continue

        if len(fields) != 3:
          die('Invalid line in %s:%s' % (filename, raw_line))
        job_key = AuroraJobKey.from_path(fields[0])
        limits_by_job[job_key] = JobUpTimeLimit(
            job=job_key,
            percentage=parse_sla_percentage(fields[1]),
            duration_secs=parse_time(fields[2]).as_(Time.SECONDS)
        )
    return limits_by_job

  options = app.get_options()

  percentage_limit = parse_sla_percentage(percentage)
  duration_limit = parse_time(duration)

  exclude_hosts = parse_hostnames_optional(options.exclude_hosts, options.exclude_filename)
  include_hosts = parse_hostnames_optional(options.include_hosts, options.include_filename)
  override_jobs = {}
  if options.override_filename:
    override_jobs = parse_jobs_file(options.override_filename)
  get_grouping_or_die(options.grouping)

  admin_client = make_admin_client(cluster)
  vector = admin_client.sla_get_safe_domain_vector(options.min_instance_count, include_hosts)
  groups = vector.get_safe_hosts(
      percentage_limit,
      duration_limit.as_(Time.SECONDS),
      override_jobs,
      options.grouping)

  def render_host(group, host):
    # With --list_jobs each host expands to one tab-separated row per job.
    if not options.list_jobs:
      return '%s' % host
    return '\n'.join([
        '%s\t%s\t%.2f\t%d' % (host, limit.job.to_path(), limit.percentage, limit.duration_secs)
        for limit in sorted(group[host])])

  results = []
  for group in groups:
    results.extend(
        render_host(group, host)
        for host in sorted(group.keys())
        if not (exclude_hosts and host in exclude_hosts))

  print_results(results)
Exemplo n.º 17
0
  def main(args, options):
    """Validate options, restore or create scheduler state, then run the Mysos scheduler.

    Required options are checked first and reported through app.error(). The function then
    starts the Kazoo client, the Mesos scheduler driver and the HTTP API thread, and polls
    until the scheduler's 'stopped' event is set or a KeyboardInterrupt arrives, finishing
    with app.shutdown(1).

    :param args: positional command line arguments (not read by this function)
    :param options: parsed command line options
    """
    log.info("Options in use: %s", options)

    # (option attribute, error message) pairs, checked in this order.
    required_options = (
        ('api_port', 'Must specify --port'),
        ('mesos_master', 'Must specify --mesos_master'),
        ('framework_user', 'Must specify --framework_user'),
        ('executor_uri', 'Must specify --executor_uri'),
        ('executor_cmd', 'Must specify --executor_cmd'),
        ('zk_url', 'Must specify --zk_url'),
        ('admin_keypath', 'Must specify --admin_keypath'),
    )
    for option_name, missing_message in required_options:
      if not getattr(options, option_name):
        app.error(missing_message)

    try:
      election_timeout = parse_time(options.election_timeout)
      framework_failover_timeout = parse_time(options.framework_failover_timeout)
    except InvalidTime as time_error:
      app.error(time_error.message)

    try:
      _, zk_servers, zk_root = zookeeper.parse(options.zk_url)
    except Exception as zk_error:
      app.error("Invalid --zk_url: %s" % zk_error.message)

    web_assets_dir = os.path.join(options.work_dir, "web")
    pkgutil.unpack_assets(web_assets_dir, MYSOS_MODULE, ASSET_RELPATH)
    log.info("Extracted web assets into %s" % options.work_dir)

    fw_principal = fw_secret = None
    if options.framework_authentication_file:
      try:
        with open(options.framework_authentication_file, "r") as auth_file:
          # NOTE(review): yaml.load without a safe loader can construct arbitrary Python
          # objects; consider yaml.safe_load if this file is not fully trusted.
          credentials = yaml.load(auth_file)
        fw_principal = credentials["principal"]
        fw_secret = credentials["secret"]
        log.info("Loaded credential (principal=%s) for framework authentication" % fw_principal)
      except IOError as read_error:
        app.error("Unable to read the framework authentication key file: %s" % read_error)
      except (KeyError, yaml.YAMLError) as format_error:
        app.error("Invalid framework authentication key file format %s" % format_error)

    log.info("Starting Mysos scheduler")

    kazoo = KazooClient(zk_servers)
    kazoo.start()

    if options.state_storage != 'zk':
      log.info("Using local disk for state storage")
      state_provider = LocalStateProvider(options.work_dir)
    else:
      log.info("Using ZooKeeper (path: %s) for state storage" % zk_root)
      state_provider = ZooKeeperStateProvider(kazoo, zk_root)

    try:
      state = state_provider.load_scheduler_state()
    except StateProvider.Error as state_error:
      app.error(state_error.message)

    if not state:
      # Nothing persisted yet: build fresh framework info and persist the new state.
      log.info("No scheduler state to restore")
      framework_info = FrameworkInfo(
          user=options.framework_user,
          name=FRAMEWORK_NAME,
          checkpoint=True,
          failover_timeout=framework_failover_timeout.as_(Time.SECONDS),
          role=options.framework_role)
      if fw_principal:
        framework_info.principal = fw_principal
      state = Scheduler(framework_info)
      state_provider.dump_scheduler_state(state)
    else:
      log.info("Successfully restored scheduler state")
      framework_info = state.framework_info
      if framework_info.HasField('id'):
        log.info("Recovered scheduler's FrameworkID is %s" % framework_info.id.value)

    scheduler = MysosScheduler(
        state,
        state_provider,
        options.framework_user,
        options.executor_uri,
        options.executor_cmd,
        kazoo,
        options.zk_url,
        election_timeout,
        options.admin_keypath,
        installer_args=options.installer_args,
        backup_store_args=options.backup_store_args,
        executor_environ=options.executor_environ,
        framework_role=options.framework_role)

    # The credential is passed as a fourth positional argument only when both the
    # principal and the secret were loaded.
    driver_args = [scheduler, framework_info, options.mesos_master]
    if fw_principal and fw_secret:
      driver_args.append(Credential(principal=fw_principal, secret=fw_secret))
    scheduler_driver = mesos.native.MesosSchedulerDriver(*driver_args)

    scheduler_driver.start()

    server = HttpServer()
    server.mount_routes(MysosServer(scheduler, web_assets_dir))

    http_thread = ExceptionalThread(
        target=server.run, args=('0.0.0.0', options.api_port, 'cherrypy'))
    http_thread.daemon = True
    http_thread.start()

    try:
      # Wait for the scheduler to stop.
      # The use of 'stopped' event instead of scheduler_driver.join() is necessary to stop the
      # process with SIGINT.
      while not scheduler.stopped.wait(timeout=0.5):
        pass
    except KeyboardInterrupt:
      log.info('Interrupted, exiting.')
    else:
      log.info('Scheduler exited.')

    app.shutdown(1)  # Mysos scheduler is supposed to be long-running thus the use of exit status 1.
Exemplo n.º 18
0
def sla_list_safe_domain(cluster, percentage, duration):
    """usage: sla_list_safe_domain
            [--exclude_hosts=filename]
            [--include_hosts=filename]
            [--list_jobs]
            [--override_jobs=filename]
            cluster percentage duration

  Returns a list of relevant hosts where it would be safe to kill
  tasks without violating their job SLA. The SLA is defined as a pair of
  percentage and duration, where:

  percentage - Percentage of tasks required to be up within the duration.
  Applied to all jobs except those listed in --override_jobs file;

  duration - Time interval (now - value) for the percentage of up tasks.
  Applied to all jobs except those listed in --override_jobs file.
  Format: XdYhZmWs (each field is optional but must be in that order.)
  Examples: 5m, 1d3h45m.
  """

    def parse_jobs_file(filename):
        # Maps each job key to its JobUpTimeLimit; every non-blank line must hold exactly
        # three whitespace-separated tokens (job path, percentage, duration).
        limits_by_job = {}
        with open(filename, "r") as overrides_file:
            for raw_line in overrides_file:
                fields = raw_line.split()
                if not fields:
                    continue

                if len(fields) != 3:
                    die("Invalid line in %s:%s" % (filename, raw_line))
                job_key = AuroraJobKey.from_path(fields[0])
                limits_by_job[job_key] = DomainUpTimeSlaVector.JobUpTimeLimit(
                    job=job_key,
                    percentage=parse_sla_percentage(fields[1]),
                    duration_secs=parse_time(fields[2]).as_(Time.SECONDS),
                )
        return limits_by_job

    options = app.get_options()

    percentage_limit = parse_sla_percentage(percentage)
    duration_limit = parse_time(duration)

    exclude_hosts = parse_hosts_optional(options.exclude_hosts, options.exclude_filename)
    include_hosts = parse_hosts_optional(options.include_hosts, options.include_filename)
    override_jobs = {}
    if options.override_filename:
        override_jobs = parse_jobs_file(options.override_filename)

    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    vector = client.sla_get_safe_domain_vector(include_hosts)
    hosts = vector.get_safe_hosts(
        percentage_limit, duration_limit.as_(Time.SECONDS), override_jobs
    )

    def render_host(host):
        # With --list_jobs each host expands to one tab-separated row per job.
        if not options.list_jobs:
            return "%s" % host
        return "\n".join(
            [
                "%s\t%s\t%.2f\t%d"
                % (host, limit.job.to_path(), limit.percentage, limit.duration_secs)
                for limit in sorted(hosts[host])
            ]
        )

    results = [
        render_host(host)
        for host in sorted(hosts.keys())
        if not (exclude_hosts and host in exclude_hosts)
    ]

    print_results(results)
Exemplo n.º 19
0
 def __init__(self, user):
   """Store the user and the maximum upload delay parsed from MAX_UPLOAD_DELAY."""
   self._user = user
   max_upload_delay = parse_time(MAX_UPLOAD_DELAY)
   self._max_delay = max_upload_delay
Exemplo n.º 20
0
    def main(args, options):
        """Validate options, restore or create scheduler state, then run the Mysos scheduler.

        Required options are checked first and reported through app.error(). The function
        then starts the Kazoo client, the Mesos scheduler driver and the HTTP API thread,
        and polls until the scheduler's 'stopped' event is set or a KeyboardInterrupt
        arrives, finishing with app.shutdown(1).

        :param args: positional command line arguments (not read by this function)
        :param options: parsed command line options
        """
        log.info("Options in use: %s", options)

        # (option attribute, error message) pairs, checked in this order.
        required_options = (
            ('api_port', 'Must specify --port'),
            ('mesos_master', 'Must specify --mesos_master'),
            ('framework_user', 'Must specify --framework_user'),
            ('executor_uri', 'Must specify --executor_uri'),
            ('executor_cmd', 'Must specify --executor_cmd'),
            ('zk_url', 'Must specify --zk_url'),
            ('admin_keypath', 'Must specify --admin_keypath'),
        )
        for option_name, missing_message in required_options:
            if not getattr(options, option_name):
                app.error(missing_message)

        try:
            election_timeout = parse_time(options.election_timeout)
            framework_failover_timeout = parse_time(
                options.framework_failover_timeout)
        except InvalidTime as time_error:
            app.error(time_error.message)

        try:
            _, zk_servers, zk_root = zookeeper.parse(options.zk_url)
        except Exception as zk_error:
            app.error("Invalid --zk_url: %s" % zk_error.message)

        web_assets_dir = os.path.join(options.work_dir, "web")
        pkgutil.unpack_assets(web_assets_dir, MYSOS_MODULE, ASSET_RELPATH)
        log.info("Extracted web assets into %s" % options.work_dir)

        fw_principal = fw_secret = None
        if options.framework_authentication_file:
            try:
                with open(options.framework_authentication_file, "r") as auth_file:
                    # NOTE(review): yaml.load without a safe loader can construct arbitrary
                    # Python objects; consider yaml.safe_load if this file is not fully
                    # trusted.
                    credentials = yaml.load(auth_file)
                fw_principal = credentials["principal"]
                fw_secret = credentials["secret"]
                log.info(
                    "Loaded credential (principal=%s) for framework authentication"
                    % fw_principal)
            except IOError as read_error:
                app.error(
                    "Unable to read the framework authentication key file: %s"
                    % read_error)
            except (KeyError, yaml.YAMLError) as format_error:
                app.error(
                    "Invalid framework authentication key file format %s"
                    % format_error)

        log.info("Starting Mysos scheduler")

        kazoo = KazooClient(zk_servers)
        kazoo.start()

        if options.state_storage != 'zk':
            log.info("Using local disk for state storage")
            state_provider = LocalStateProvider(options.work_dir)
        else:
            log.info("Using ZooKeeper (path: %s) for state storage" % zk_root)
            state_provider = ZooKeeperStateProvider(kazoo, zk_root)

        try:
            state = state_provider.load_scheduler_state()
        except StateProvider.Error as state_error:
            app.error(state_error.message)

        if not state:
            # Nothing persisted yet: build fresh framework info and persist the new state.
            log.info("No scheduler state to restore")
            framework_info = FrameworkInfo(
                user=options.framework_user,
                name=FRAMEWORK_NAME,
                checkpoint=True,
                failover_timeout=framework_failover_timeout.as_(Time.SECONDS),
                role=options.framework_role)
            if fw_principal:
                framework_info.principal = fw_principal
            state = Scheduler(framework_info)
            state_provider.dump_scheduler_state(state)
        else:
            log.info("Successfully restored scheduler state")
            framework_info = state.framework_info
            if framework_info.HasField('id'):
                log.info("Recovered scheduler's FrameworkID is %s" %
                         framework_info.id.value)

        scheduler = MysosScheduler(state,
                                   state_provider,
                                   options.framework_user,
                                   options.executor_uri,
                                   options.executor_cmd,
                                   kazoo,
                                   options.zk_url,
                                   election_timeout,
                                   options.admin_keypath,
                                   installer_args=options.installer_args,
                                   backup_store_args=options.backup_store_args,
                                   executor_environ=options.executor_environ,
                                   framework_role=options.framework_role)

        # The credential is passed as a fourth positional argument only when both the
        # principal and the secret were loaded.
        driver_args = [scheduler, framework_info, options.mesos_master]
        if fw_principal and fw_secret:
            driver_args.append(
                Credential(principal=fw_principal, secret=fw_secret))
        scheduler_driver = mesos.native.MesosSchedulerDriver(*driver_args)

        scheduler_driver.start()

        server = HttpServer()
        server.mount_routes(MysosServer(scheduler, web_assets_dir))

        http_thread = ExceptionalThread(
            target=server.run,
            args=('0.0.0.0', options.api_port, 'cherrypy'))
        http_thread.daemon = True
        http_thread.start()

        try:
            # Wait for the scheduler to stop.
            # The use of 'stopped' event instead of scheduler_driver.join() is necessary to stop the
            # process with SIGINT.
            while not scheduler.stopped.wait(timeout=0.5):
                pass
        except KeyboardInterrupt:
            log.info('Interrupted, exiting.')
        else:
            log.info('Scheduler exited.')

        # Mysos scheduler is supposed to be long-running thus the use of exit status 1.
        app.shutdown(1)
Exemplo n.º 21
0
 def __init__(self, user):
     """Store the user and the maximum upload delay parsed from MAX_UPLOAD_DELAY."""
     self._user = user
     max_upload_delay = parse_time(MAX_UPLOAD_DELAY)
     self._max_delay = max_upload_delay