def scheduler_backup_now(cluster):
    """usage: scheduler_backup_now cluster

    Immediately initiates a full storage backup.
    """
    opts = app.get_options()
    # Build the client for the named cluster, then pass the backup response
    # to check_and_log_response.
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    response = api.perform_backup()
    check_and_log_response(response)
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

    Increases the amount of production quota allocated to a user.
    """
    # Parse the requested increments before touching the scheduler.
    cpu_delta = float(cpu_str)
    ram_delta = parse_data(ram_str)
    disk_delta = parse_data(disk_str)

    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity == 'verbose')

    # Read the role's current quota; the increments are added on top of it.
    quota_resp = api.get_quota(role)
    current = quota_resp.result.getQuotaResult.quota
    log.info('Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
             (role, current.numCpus, current.ramMb, current.diskMb))

    # RAM and disk are summed as Amounts and converted back to whole MB.
    target_cpu = float(cpu_delta + current.numCpus)
    target_ram = int((ram_delta + Amount(current.ramMb, Data.MB)).as_(Data.MB))
    target_disk = int((disk_delta + Amount(current.diskMb, Data.MB)).as_(Data.MB))
    log.info('Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
             (role, target_cpu, target_ram, target_disk))

    update_resp = api.set_quota(role, target_cpu, target_ram, target_disk)
    check_and_log_response(update_resp)
def mock_api(cls):
    """Create an AuroraClientAPI for cluster "foo" backed by an autospec'd proxy.

    The mock is built from SchedulerProxyApiSpec with spec_set=True and is
    installed as the client's ``_scheduler_proxy``.

    Returns:
        A ``(api, mock_proxy)`` tuple.
    """
    client = AuroraClientAPI(Cluster(name="foo"), 'test-client')
    proxy = create_autospec(
        spec=SchedulerProxyApiSpec,
        spec_set=True,
        instance=True)
    client._scheduler_proxy = proxy
    return client, proxy
def scheduler_snapshot(cluster):
    """usage: scheduler_snapshot cluster

    Request that the scheduler perform a storage snapshot and block until complete.
    """
    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    # The snapshot response is passed straight to check_and_log_response.
    response = api.snapshot()
    check_and_log_response(response)
def scheduler_unload_recovery(cluster):
    """usage: scheduler_unload_recovery cluster

    Unloads a staged recovery.
    """
    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    response = api.unload_recovery()
    check_and_log_response(response)
def scheduler_stage_recovery(cluster, backup_id):
    """usage: scheduler_stage_recovery cluster backup_id

    Stages a backup for recovery.
    """
    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    # backup_id is forwarded unchanged to the client's stage_recovery call.
    response = api.stage_recovery(backup_id)
    check_and_log_response(response)
def make_admin_client(cluster):
    """Create an admin AuroraClientAPI for a cluster named in CLUSTERS.

    Calls ``die`` with the list of known clusters when the name is not found.
    The client is verbose only when the parsed options carry
    ``verbosity == 'verbose'``; a missing ``verbosity`` option is read as
    ``'normal'``.
    """
    if cluster not in CLUSTERS:
        known = ", ".join(CLUSTERS.keys())
        die('Unknown cluster: %s. Known clusters: %s' % (cluster, known))

    verbosity = getattr(app.get_options(), 'verbosity', 'normal')
    is_verbose = verbosity == 'verbose'
    return AuroraClientAPI(
        CLUSTERS[cluster],
        AURORA_ADMIN_USER_AGENT_NAME,
        verbose=is_verbose)
def scheduler_delete_recovery_tasks(cluster, task_ids):
    """usage: scheduler_delete_recovery_tasks cluster task_ids

    Deletes a comma-separated list of task IDs from a staged recovery.
    """
    # Duplicated IDs in the comma-separated argument collapse in the set.
    wanted_ids = set(task_ids.split(','))
    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    response = api.delete_recovery_tasks(TaskQuery(taskIds=wanted_ids))
    check_and_log_response(response)
def get_scheduler(cluster):
    """usage: get_scheduler CLUSTER

    Dumps the leading scheduler endpoint URL.
    """
    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    # The URL is read from the scheduler client held by the API's proxy.
    leader_url = api.scheduler_proxy.scheduler_client().raw_url
    print("Found leading scheduler at: %s" % leader_url)
def scheduler_list_backups(cluster):
    """usage: scheduler_list_backups cluster

    Lists backups available for recovery.
    """
    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    response = api.list_backups()
    check_and_log_response(response)

    # Print a count header followed by one backup per line.
    available = response.result.listBackupsResult.backups
    print('%s available backups:' % len(available))
    for entry in available:
        print(entry)
def get_locks(cluster):
    """usage: get_locks cluster

    Prints all context/operation locks in the scheduler.
    """
    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    response = api.get_locks()
    check_and_log_response(response)

    # Each lock is rendered from its attribute dict; the renderings are joined
    # into a single string, so print_results receives a one-element list.
    printer = pprint.PrettyPrinter(indent=2)
    rendered = [printer.pformat(vars(lock))
                for lock in response.result.getLocksResult.locks]
    print_results([',\n'.join(rendered)])
def __init__(self, cluster, role, env, jobs, ssh_user=None, log_fn=log.log):
    """Store the job coordinates and create an API client for the cluster.

    :param cluster: passed as ``cluster=`` to AuroraClientAPI and kept on the instance.
    :param role: stored as-is; also used as the SSH user when none is given.
    :param env: stored as-is.
    :param jobs: stored as-is.
    :param ssh_user: SSH user name; any falsy value falls back to ``role``.
    :param log_fn: logging callable kept on the instance (defaults to ``log.log``).
    """
    self._cluster = cluster
    self._api = AuroraClientAPI(cluster=cluster)
    self._role = role
    self._env = env
    self._jobs = jobs
    # A falsy ssh_user (None, '') means "use the role name".
    self._ssh_user = ssh_user or self._role
    self._log = log_fn
def __init__(self, cluster, role, env, jobs, ssh_user=None, log_fn=log.log):
    """Store the job coordinates and create a v2-user-agent API client.

    :param cluster: passed as ``cluster=`` to AuroraClientAPI and kept on the instance.
    :param role: stored as-is; also used as the SSH user when none is given.
    :param env: stored as-is.
    :param jobs: stored as-is.
    :param ssh_user: SSH user name; any falsy value falls back to ``role``.
    :param log_fn: logging callable kept on the instance (defaults to ``log.log``).
    """
    self._cluster = cluster
    self._api = AuroraClientAPI(
        cluster=cluster,
        user_agent=AURORA_V2_USER_AGENT_NAME)
    self._role = role
    self._env = env
    self._jobs = jobs
    # A falsy ssh_user (None, '') means "use the role name".
    self._ssh_user = ssh_user or self._role
    self._log = log_fn
def scheduler_print_recovery_tasks(cluster):
    """usage: scheduler_print_recovery_tasks cluster

    Prints all active tasks in a staged recovery.
    """
    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    response = api.query_recovery(TaskQuery(statuses=ACTIVE_STATES))
    check_and_log_response(response)

    # Tab-separated header, then one tab-separated row per task.
    log.info('Role\tJob\tShard\tStatus\tTask ID')
    for scheduled in response.result.queryRecoveryResult.tasks:
        assigned = scheduled.assignedTask
        config = assigned.task
        columns = (
            config.owner.role,
            config.jobName,
            str(assigned.instanceId),
            ScheduleStatus._VALUES_TO_NAMES[scheduled.status],
            assigned.taskId,
        )
        log.info('\t'.join(columns))
def test_handles_api_auth_error():
    """kill_job must turn a proxy AuthError into CommandError(EXIT_AUTH_ERROR)."""
    context = AuroraCommandContext()

    # Scheduler proxy whose killTasks always raises an auth failure.
    failing_proxy = mock.create_autospec(spec=SchedulerProxyApiSpec, instance=True)
    failing_proxy.killTasks.side_effect = SchedulerProxy.AuthError()

    # Real API object wired to the failing proxy and pre-registered with the context.
    client = AuroraClientAPI(TEST_CLUSTER, 'user-agent')
    client._scheduler_proxy = failing_proxy
    context.apis = {TEST_CLUSTER.name: client}

    api = context.get_api(
        TEST_CLUSTER.name,
        clusters={TEST_CLUSTER.name: TEST_CLUSTER})
    job_key = AuroraJobKey(TEST_CLUSTER.name, 'role', 'env', 'job')

    with pytest.raises(Context.CommandError) as e:
        api.kill_job(job_key)

    assert e.value.code == EXIT_AUTH_ERROR
    assert failing_proxy.killTasks.call_count == 1
def set_quota(cluster, role, cpu_str, ram, disk):
    """usage: set_quota cluster role cpu ram[MGT] disk[MGT]

    Alters the amount of production quota allocated to a user.
    """
    # Any ValueError raised while parsing or converting the arguments is
    # reported through die() with the exception text.
    try:
        ram_amount = parse_data(ram).as_(Data.MB)
        disk_amount = parse_data(disk).as_(Data.MB)
        cpu = float(cpu_str)
        ram_mb = int(ram_amount)
        disk_mb = int(disk_amount)
    except ValueError as e:
        die(str(e))

    opts = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    response = api.set_quota(role, cpu, ram_mb, disk_mb)
    check_and_log_response(response)
def make_admin_client(cluster, verbose=False, bypass_leader_redirect=False):
    """Creates an API client with the specified options for use in admin commands.

    A Cluster instance is used directly; any other value is treated as a key
    into CLUSTERS, and ``die`` is called when that key is unknown.

    :param cluster: The cluster to connect with.
    :type cluster: Either a string cluster name or a Cluster object.
    :param verbose: Should the client emit verbose output.
    :type verbose: bool
    :param bypass_leader_redirect: Should the client bypass the scheduler's leader redirect filter.
    :type bypass_leader_redirect: bool
    :rtype: an AuroraClientAPI instance.
    """
    if isinstance(cluster, Cluster):
        target = cluster
    else:
        if cluster not in CLUSTERS:
            known = ", ".join(CLUSTERS.keys())
            die('Unknown cluster: %s. Known clusters: %s' % (cluster, known))
        target = CLUSTERS[cluster]

    return AuroraClientAPI(
        target,
        AURORA_ADMIN_USER_AGENT_NAME,
        verbose=verbose,
        bypass_leader_redirect=bypass_leader_redirect)
def sla_probe_hosts(cluster, percentage, duration):
    """usage: sla_probe_hosts
              [--filename=FILENAME]
              [--grouping=GROUPING]
              [--hosts=HOSTS]
              [--min_job_instance_count=COUNT]
              cluster percentage duration

    Probes individual hosts with respect to their job SLA.
    Specifically, given a host, outputs all affected jobs with their projected SLAs if
    the host goes down. In addition, if a job's projected SLA does not clear
    the specified limits suggests the approximate time when that job reaches its SLA.

    Output format:
    HOST  JOB  PREDICTED_SLA  SAFE?  PREDICTED_SAFE_IN

    where:
    HOST - host being probed.
    JOB - job that has tasks running on the host being probed.
    PREDICTED_SLA - predicted effective percentage of up tasks if the host is shut down.
    SAFE? - PREDICTED_SLA >= percentage
    PREDICTED_SAFE_IN - expected wait time in seconds for the job to reach requested SLA threshold.
    """
    opts = app.get_options()

    # Parse every user-supplied argument before contacting the scheduler.
    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)
    hosts = parse_hostnames(opts.filename, opts.hosts)
    # Called for its effect only; the return value is not used here.
    get_grouping_or_die(opts.grouping)

    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    vector = api.sla_get_safe_domain_vector(opts.min_instance_count, hosts)
    duration_secs = sla_duration.as_(Time.SECONDS)
    groups = vector.probe_hosts(sla_percentage, duration_secs, opts.grouping)

    # Only the first element of the formatter's result pair is printed.
    output, _ = format_sla_results(groups)
    print_results(output)
def __init__(self, cluster, verbosity):
    """Create the underlying AuroraClientAPI for ``cluster``.

    :param cluster: forwarded as the first argument to AuroraClientAPI.
    :param verbosity: verbosity name; only the exact value ``'verbose'``
        yields a True second argument.
    """
    is_verbose = verbosity == 'verbose'
    self._client = AuroraClientAPI(cluster, is_verbose)
def __init__(self, cluster, verbosity, wait_event=None):
    """Create the underlying AuroraClientAPI and remember the wait event.

    :param cluster: forwarded as the first argument to AuroraClientAPI.
    :param verbosity: verbosity name; only the exact value ``'verbose'``
        yields a True second argument.
    :param wait_event: event object to keep; a new ``Event()`` is created
        when a falsy value is passed.
    """
    is_verbose = verbosity == 'verbose'
    self._client = AuroraClientAPI(cluster, is_verbose)
    if not wait_event:
        wait_event = Event()
    self._wait_event = wait_event
def mock_api(cls):
    """Create an AuroraClientAPI for cluster "foo" backed by a plain Mock proxy.

    Returns:
        A ``(api, mock_proxy)`` tuple; the mock is installed as the client's
        ``_scheduler_proxy``.
    """
    client = AuroraClientAPI(Cluster(name="foo"))
    proxy = Mock()
    client._scheduler_proxy = proxy
    return client, proxy
def query(args, options):
    """usage: query [--force]
                 [--listformat=FORMAT]
                 [--shards=N[,N,...]]
                 [--states=State[,State,...]]
                 cluster [role [job]]

    Query Mesos about jobs and tasks.
    """
    def _convert_fmt_string(fmtstr):
        # Rewrite %name% placeholders as %(name)s so the result can be applied
        # to a dict with the % operator.
        import re

        def convert(match):
            return "%%(%s)s" % match.group(1)
        return re.sub(r'%(\w+)%', convert, fmtstr)

    def flatten_task(t, d=None):
        # Recursively copy every leaf attribute of `t` into one flat dict.
        # A fresh dict is created per top-level call: a mutable default would
        # be shared by all calls and carry fields from one task into the next.
        if d is None:
            d = {}
        for key in t.__dict__.keys():
            val = getattr(t, key)
            try:
                val.__dict__.keys()
            except AttributeError:
                # No __dict__: treat as a leaf value.
                d[key] = val
            else:
                flatten_task(val, d)
        return d

    def map_values(d):
        # Translate selected raw values into display form; others pass through.
        default_value = lambda v: v
        mapping = {
            'status': lambda v: ScheduleStatus._VALUES_TO_NAMES[v],
        }
        return dict(
            (k, mapping.get(k, default_value)(v)) for (k, v) in d.items()
        )

    # Validate state names only when some were given; an empty/absent --states
    # falls through to the "all states" default below instead of failing here.
    if options.states:
        for state in options.states.split(','):
            if state not in ScheduleStatus._NAMES_TO_VALUES:
                msg = "Unknown state '%s' specified.  Valid states are:\n" % state
                msg += ','.join(ScheduleStatus._NAMES_TO_VALUES.keys())
                die(msg)

    # Role, Job, Instances, States, and the listformat
    if len(args) == 0:
        die('Must specify at least cluster.')

    cluster = args[0]
    role = args[1] if len(args) > 1 else None
    job = args[2] if len(args) > 2 else None
    instances = set(map(int, options.shards.split(','))) if options.shards else set()

    if options.states:
        states = set(map(ScheduleStatus._NAMES_TO_VALUES.get, options.states.split(',')))
    else:
        states = ACTIVE_STATES | TERMINAL_STATES
    listformat = _convert_fmt_string(options.listformat)

    # Figure out "expensive" queries here and bail if they do not have --force
    #  - Does not specify role
    if not role and not options.force:
        die('--force is required for expensive queries (no role specified)')

    #  - Does not specify job
    if not job and not options.force:
        die('--force is required for expensive queries (no job specified)')

    #  - Specifies status outside of ACTIVE_STATES
    if not (states <= ACTIVE_STATES) and not options.force:
        die('--force is required for expensive queries (states outside ACTIVE states)')

    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    query_info = api.query(api.build_query(role, job, instances=instances, statuses=states))
    if query_info.responseCode != ResponseCode.OK:
        die('Failed to query scheduler: %s' % query_info.messageDEPRECATED)

    tasks = query_info.result.scheduleStatusResult.tasks
    if tasks is None:
        return

    try:
        for task in tasks:
            d = flatten_task(task)
            print(listformat % map_values(d))
    except KeyError:
        # The format string named a field the flattened task does not have.
        msg = "Unknown key in format string.  Valid keys are:\n"
        msg += ','.join(d.keys())
        die(msg)
def sla_list_safe_domain(cluster, percentage, duration):
    """usage: sla_list_safe_domain
              [--exclude_file=FILENAME]
              [--exclude_hosts=HOSTS]
              [--grouping=GROUPING]
              [--include_file=FILENAME]
              [--include_hosts=HOSTS]
              [--list_jobs]
              [--min_job_instance_count=COUNT]
              [--override_jobs=FILENAME]
              cluster percentage duration

    Returns a list of relevant hosts where it would be safe to kill
    tasks without violating their job SLA. The SLA is defined as a pair of
    percentage and duration, where:

    percentage - Percentage of tasks required to be up within the duration.
    Applied to all jobs except those listed in --override_jobs file;

    duration - Time interval (now - value) for the percentage of up tasks.
    Applied to all jobs except those listed in --override_jobs file.
    Format: XdYhZmWs (each field is optional but must be in that order.)
    Examples: 5m, 1d3h45m.

    NOTE: if --grouping option is specified and is set to anything other than
          default (by_host) the results will be processed and filtered based
          on the grouping function on a all-or-nothing basis. In other words,
          the group is 'safe' IFF it is safe to kill tasks on all hosts in the
          group at the same time.
    """
    def parse_jobs_file(filename):
        # Each non-blank line must hold exactly three whitespace-separated
        # fields: job path, percentage, duration.
        limits = {}
        with open(filename, 'r') as overrides:
            for line in overrides:
                if not line.strip():
                    continue

                tokens = line.split()
                if len(tokens) != 3:
                    die('Invalid line in %s:%s' % (filename, line))
                job_key = AuroraJobKey.from_path(tokens[0])
                limits[job_key] = JobUpTimeLimit(
                    job=job_key,
                    percentage=parse_sla_percentage(tokens[1]),
                    duration_secs=parse_time(tokens[2]).as_(Time.SECONDS)
                )
        return limits

    opts = app.get_options()

    sla_percentage = parse_sla_percentage(percentage)
    sla_duration = parse_time(duration)

    exclude_hosts = parse_hostnames_optional(opts.exclude_hosts, opts.exclude_filename)
    include_hosts = parse_hostnames_optional(opts.include_hosts, opts.include_filename)
    if opts.override_filename:
        override_jobs = parse_jobs_file(opts.override_filename)
    else:
        override_jobs = {}
    # Called for its effect only; the return value is not used here.
    get_grouping_or_die(opts.grouping)

    api = AuroraClientAPI(CLUSTERS[cluster], opts.verbosity)
    vector = api.sla_get_safe_domain_vector(opts.min_instance_count, include_hosts)
    groups = vector.get_safe_hosts(
        sla_percentage,
        sla_duration.as_(Time.SECONDS),
        override_jobs,
        opts.grouping)

    results = []
    for group in groups:
        for host in sorted(group.keys()):
            if exclude_hosts and host in exclude_hosts:
                continue

            if not opts.list_jobs:
                results.append('%s' % host)
                continue

            # One tab-separated line per job entry for the host, joined into
            # a single result item.
            job_lines = [
                '%s\t%s\t%.2f\t%d' % (host, d.job.to_path(), d.percentage, d.duration_secs)
                for d in sorted(group[host])
            ]
            results.append('\n'.join(job_lines))

    print_results(results)