def run(args, options):
  """usage: run cluster/role/env/job cmd

  Runs a shell command on all machines currently hosting shards of a single job.

  This feature supports the same command line wildcards that are used to
  populate a job's commands.

  This means anything in the {{mesos.*}} and {{thermos.*}} namespaces.
  """
  # TODO(William Farner): Add support for invoking on individual shards.
  # TODO(Kevin Sweeney): Restore the ability to run across jobs with globs (See MESOS-3010).
  if not args:
    die('job path is required')
  job_path = args.pop(0)
  try:
    cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
  except AuroraJobKey.Error as e:
    die('Invalid job path "%s": %s' % (job_path, e))
  command = ' '.join(args)

  cluster = CLUSTERS[cluster_name]
  dcr = DistributedCommandRunner(cluster, role, env, [name], options.ssh_user)
  dcr.run(command, parallelism=options.num_threads, executor_sandbox=options.executor_sandbox)
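
# Hedged usage sketch for run(): the job path, command, and option values below
# are hypothetical stand-ins, not values from this codebase.
def _example_run_usage():
  from argparse import Namespace
  options = Namespace(ssh_user=None, num_threads=8, executor_sandbox=False)
  # Runs 'uptime' on every machine currently hosting a shard of the job.
  run(['west/www-data/prod/hello', 'uptime'], options)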
def disambiguate_args_or_die(cls, args, options, client_factory=AuroraClientAPI):
  """
  Returns a (AuroraClientAPI, AuroraJobKey, AuroraConfigFile:str) tuple if one
  can be found given the args, potentially querying the scheduler with the
  returned client. Calls die() with an appropriate error message otherwise.

  Arguments:
  args: args from app command invocation.
  options: options from app command invocation. must have env and cluster attributes.
  client_factory: a callable (cluster) -> AuroraClientAPI.
  """
  if not args:
    die('job path is required')
  try:
    job_key = AuroraJobKey.from_path(args[0])
    client = client_factory(job_key.cluster)
    config_file = args[1] if len(args) > 1 else None  # the config for hooks
    return client, job_key, config_file
  except AuroraJobKey.Error:
    log.warning("Failed to parse job path, falling back to compatibility mode")
    role = args[0] if len(args) > 0 else None
    name = args[1] if len(args) > 1 else None
    env = None
    config_file = None  # deprecated form does not support hooks functionality
    cluster = options.cluster
    if not cluster:
      die('cluster is required')
    client = client_factory(cluster)
    return client, cls._disambiguate_or_die(client, role, env, name), config_file
def perform_maintenance_hosts(cluster):
  """usage: perform_maintenance cluster [--filename=filename]
                                        [--hosts=hosts]
                                        [--batch_size=num]
                                        [--post_drain_script=path]
                                        [--grouping=function]

  Asks the scheduler to drain any running tasks from the given machines and take
  them out of service temporarily, optionally runs a script against each drained
  host, then returns the machines to service.
  """
  options = app.get_options()
  drainable_hosts = parse_hosts(options)

  if options.post_drain_script:
    if not os.path.exists(options.post_drain_script):
      die("No such file: %s" % options.post_drain_script)
    cmd = os.path.abspath(options.post_drain_script)
    drained_callback = lambda host: subprocess.Popen([cmd, host])
  else:
    drained_callback = None

  MesosMaintenance(CLUSTERS[cluster], options.verbosity).perform_maintenance(
      drainable_hosts,
      batch_size=int(options.batch_size),
      callback=drained_callback,
      grouping_function=options.grouping)
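
# The post-drain hook is any executable that receives the drained hostname as
# its single argument; perform_maintenance_hosts wraps it per host. Hedged
# stand-in (the script path is hypothetical):
def _example_drained_callback():
  import subprocess
  cmd = '/usr/local/bin/notify_drained'  # hypothetical post-drain script
  return lambda host: subprocess.Popen([cmd, host])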
def do_open(args, _):
  """usage: open cluster[/role[/env/job]]

  Opens the scheduler page for a cluster, role or job in the default web browser.
  """
  cluster_name = role = env = job = None
  args = args[0].split("/")
  if len(args) > 0:
    cluster_name = args[0]
    if len(args) > 1:
      role = args[1]
      if len(args) > 2:
        env = args[2]
        if len(args) > 3:
          job = args[3]
        else:
          # TODO(ksweeney): Remove this after MESOS-2945 is completed.
          die('env scheduler pages are not yet implemented, please specify job')

  if not cluster_name:
    die('cluster is required')

  api = make_client(cluster_name)

  import webbrowser
  webbrowser.open_new_tab(synthesize_url(api.scheduler.scheduler().url, role, env, job))
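
# do_open accepts progressively longer paths; each call below uses hypothetical
# cluster/role names (an env-only path dies until MESOS-2945 is completed):
def _example_open_paths():
  do_open(['west'], None)                      # cluster scheduler root
  do_open(['west/www-data'], None)             # role page
  do_open(['west/www-data/prod/hello'], None)  # job page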
def list_jobs(cluster_and_role):
  """usage: list_jobs [--show_cron_schedule] cluster/role/env/job"""
  def show_job_simple(job):
    if options.show_cron_schedule:
      print(('{0}/{1.key.role}/{1.key.environment}/{1.key.name}' +
          '\t\'{1.cronSchedule}\'\t{1.cronCollisionPolicy}').format(cluster, job))
    else:
      print('{0}/{1.key.role}/{1.key.environment}/{1.key.name}'.format(cluster, job))

  def show_job_pretty(job):
    print("Job %s/%s/%s/%s:" %
        (cluster, job.key.role, job.key.environment, job.key.name))
    print('\tcron schedule: %s' % job.cronSchedule)
    print('\tcron policy: %s' % job.cronCollisionPolicy)

  options = app.get_options()
  if options.show_cron_schedule and options.pretty:
    print_fn = show_job_pretty
  else:
    print_fn = show_job_simple

  # Take the cluster_and_role parameter, and split it into its two components.
  if cluster_and_role.count('/') != 1:
    die('list_jobs parameter must be in cluster/role format')
  cluster, role = cluster_and_role.split('/')

  api = make_client(cluster)
  resp = api.get_jobs(role)
  check_and_log_response(resp)
  for job in resp.result.getJobsResult.configs:
    print_fn(job)
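
# The show_job_* helpers rely on str.format's attribute access ('{1.key.role}').
# Self-contained illustration with stand-in types, not Aurora's thrift structs:
def _example_format_attribute_access():
  from collections import namedtuple
  Key = namedtuple('Key', ['role', 'environment', 'name'])
  Job = namedtuple('Job', ['key'])
  job = Job(Key('www-data', 'prod', 'hello'))
  line = '{0}/{1.key.role}/{1.key.environment}/{1.key.name}'.format('west', job)
  assert line == 'west/www-data/prod/hello'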
def parse_hosts(options):
  if not (options.filename or options.hosts):
    die('Please specify either --filename or --hosts')
  if options.filename:
    with open(options.filename, 'r') as hosts_file:
      hosts = [hostname.strip() for hostname in hosts_file]
  elif options.hosts:
    hosts = [hostname.strip() for hostname in options.hosts.split(',')]
  if not hosts:
    die('No valid hosts found.')
  return hosts
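
# parse_hosts() only reads the 'filename' and 'hosts' attributes, so a plain
# namespace suffices for illustration (hostnames hypothetical):
def _example_parse_hosts():
  from argparse import Namespace
  hosts = parse_hosts(Namespace(filename=None, hosts='host1.example.com, host2.example.com'))
  assert hosts == ['host1.example.com', 'host2.example.com']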
def diff(job_spec, config_file):
  """usage: diff cluster/role/env/job config

  Compares a job configuration against a running job.
  By default the diff will be displayed using 'diff', though you may choose an
  alternate diff program by specifying the DIFF_VIEWER environment variable.
  """
  options = app.get_options()
  config = get_job_config(job_spec, config_file, options)
  if options.rename_from:
    cluster, role, env, name = options.rename_from
  else:
    cluster = config.cluster()
    role = config.role()
    env = config.environment()
    name = config.name()
  api = make_client(cluster)
  resp = api.query(api.build_query(role, name, statuses=ACTIVE_STATES, env=env))
  if resp.responseCode != ResponseCode.OK:
    die('Request failed, server responded with "%s"' % resp.message)
  remote_tasks = [t.assignedTask.task for t in resp.result.scheduleStatusResult.tasks]
  resp = api.populate_job_config(config)
  if resp.responseCode != ResponseCode.OK:
    die('Request failed, server responded with "%s"' % resp.message)
  local_tasks = resp.result.populateJobResult.populated

  pp = pprint.PrettyPrinter(indent=2)
  def pretty_print_task(task):
    # The raw configuration is not interesting - we only care about what gets parsed.
    task.configuration = None
    task.executorConfig = ExecutorConfig(
        name=AURORA_EXECUTOR_NAME,
        data=json.loads(task.executorConfig.data))
    return pp.pformat(vars(task))

  def pretty_print_tasks(tasks):
    return ',\n'.join([pretty_print_task(t) for t in tasks])

  def dump_tasks(tasks, out_file):
    out_file.write(pretty_print_tasks(tasks))
    out_file.write('\n')
    out_file.flush()

  diff_program = os.environ.get('DIFF_VIEWER', 'diff')
  with NamedTemporaryFile() as local:
    dump_tasks(local_tasks, local)
    with NamedTemporaryFile() as remote:
      dump_tasks(remote_tasks, remote)
      result = subprocess.call([diff_program, remote.name, local.name])
      # Unlike most commands, diff doesn't return zero on success: it returns 0
      # when the inputs match, 1 when a successful diff is non-empty, and
      # greater than 1 on error.
      if result not in (0, 1):
        return result
      else:
        return 0
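
# diff's exit-code convention, which the command above relies on: 0 means the
# inputs match, 1 means a successful diff found differences, >1 means trouble.
# Standalone illustration (assumes a POSIX diff on PATH):
def _example_diff_exit_codes():
  import subprocess
  assert subprocess.call(['diff', '/dev/null', '/dev/null']) == 0  # identical inputs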
def _validate_update_config(config):
  job_size = config.instances()
  max_failures = config.update_config().max_total_failures().get()

  if max_failures >= job_size:
    die(UPDATE_CONFIG_MAX_FAILURES_ERROR % (job_size, job_size - 1))

  if config.is_dedicated():
    min_failure_threshold = int(math.floor(job_size * 0.02))
    if max_failures < min_failure_threshold:
      die(UPDATE_CONFIG_DEDICATED_THRESHOLD_ERROR % (job_size, min_failure_threshold))
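
# Worked example of the thresholds above: max_total_failures must be strictly
# less than the instance count, and dedicated jobs must additionally allow at
# least floor(2% of instances) failures. Standalone arithmetic only:
def _example_update_config_thresholds():
  import math
  assert int(math.floor(1000 * 0.02)) == 20  # a 1000-instance dedicated job needs >= 20
  assert int(math.floor(49 * 0.02)) == 0     # below 50 instances the 2% floor is a no-op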
def help(args):
  """usage: help [subcommand]

  Prints help for using the aurora client, or one of its specific subcommands.
  """
  if not args:
    print(generate_full_usage())
    sys.exit(0)

  if len(args) > 1:
    die('Please specify at most one subcommand.')

  subcmd = args[0]
  if subcmd in app.get_commands():
    app.command_parser(subcmd).print_help()
  else:
    print('Subcommand %s not found.' % subcmd)
    sys.exit(1)
def warn_if_dangerous_change(api, job_spec, config):
  # Get the current job status, so that we can check if there's anything
  # dangerous about this update.
  job_key = AuroraJobKey(config.cluster(), config.role(), config.environment(), config.name())
  resp = api.query(api.build_query(config.role(), config.name(),
      statuses=ACTIVE_STATES, env=config.environment()))
  if resp.responseCode != ResponseCode.OK:
    die('Could not get job status from server for comparison: %s' % resp.message)
  remote_tasks = [t.assignedTask.task for t in resp.result.scheduleStatusResult.tasks]
  resp = api.populate_job_config(config)
  if resp.responseCode != ResponseCode.OK:
    die('Server could not populate job config for comparison: %s' % resp.message)
  local_task_count = len(resp.result.populateJobResult.populated)
  remote_task_count = len(remote_tasks)
  # Flag a 4x size change in either direction, or a job shrinking to zero, as dangerous.
  if (local_task_count >= 4 * remote_task_count or
      local_task_count <= remote_task_count / 4 or
      local_task_count == 0):
    print('Warning: this update is a large change. Press ^c within 5 seconds to abort')
    time.sleep(5)
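
# The bounds of the heuristic above, as standalone arithmetic: with 10 remote
# tasks, local configs of >= 40 or <= 2 tasks (or 0 tasks) count as a large change.
def _is_large_change(local_count, remote_count):
  return (local_count >= 4 * remote_count or
      local_count <= remote_count / 4 or
      local_count == 0)
# _is_large_change(40, 10) and _is_large_change(2, 10) hold; _is_large_change(12, 10) does not.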
def _disambiguate_or_die(cls, client, role, env, name):
  # Returns a single AuroraJobKey if one can be found given the args, potentially
  # querying the scheduler. Calls die() with an appropriate error message otherwise.
  try:
    disambiguator = cls(client, role, env, name)
  except ValueError as e:
    die(e)

  if not disambiguator.ambiguous:
    return AuroraJobKey(client.cluster.name, role, env, name)

  deprecation_warning("Job ambiguously specified - querying the scheduler to disambiguate")
  matches = disambiguator.query_matches()
  if len(matches) == 1:
    (match,) = matches
    log.info("Found job %s" % match)
    return match
  elif len(matches) == 0:
    die("No jobs found")
  else:
    die("Multiple jobs match (%s) - disambiguate by using the CLUSTER/ROLE/ENV/NAME form"
        % ",".join(str(m) for m in matches))
def query(args, options):
  """usage: query [--shards=N[,N,...]]
                  [--states=State[,State,...]]
                  cluster [role [job]]

  Query Mesos about jobs and tasks.
  """
  def _convert_fmt_string(fmtstr):
    import re
    def convert(match):
      return "%%(%s)s" % match.group(1)
    return re.sub(r'%(\w+)%', convert, fmtstr)

  def flatten_task(t, d=None):
    # Use a fresh dict per top-level call; a mutable default argument would
    # leak keys between tasks.
    if d is None:
      d = {}
    for key in t.__dict__.keys():
      val = getattr(t, key)
      try:
        val.__dict__.keys()
      except AttributeError:
        d[key] = val
      else:
        flatten_task(val, d)
    return d

  def map_values(d):
    default_value = lambda v: v
    mapping = {
        'status': lambda v: ScheduleStatus._VALUES_TO_NAMES[v],
    }
    return dict((k, mapping.get(k, default_value)(v)) for (k, v) in d.items())

  for state in options.states.split(','):
    if state not in ScheduleStatus._NAMES_TO_VALUES:
      msg = "Unknown state '%s' specified. Valid states are:\n" % state
      msg += ','.join(ScheduleStatus._NAMES_TO_VALUES.keys())
      die(msg)

  # Role, Job, Instances, States, and the listformat
  if len(args) == 0:
    die('Must specify at least cluster.')
  cluster = args[0]
  role = args[1] if len(args) > 1 else None
  job = args[2] if len(args) > 2 else None
  instances = set(map(int, options.shards.split(','))) if options.shards else set()

  if options.states:
    states = set(map(ScheduleStatus._NAMES_TO_VALUES.get, options.states.split(',')))
  else:
    states = ACTIVE_STATES | TERMINAL_STATES
  listformat = _convert_fmt_string(options.listformat)

  # Figure out "expensive" queries here and bail if they do not have --force.
  # - Does not specify role
  if role is None and not options.force:
    die('--force is required for expensive queries (no role specified)')
  # - Does not specify job
  if job is None and not options.force:
    die('--force is required for expensive queries (no job specified)')
  # - Specifies status outside of ACTIVE_STATES
  if not (states <= ACTIVE_STATES) and not options.force:
    die('--force is required for expensive queries (states outside ACTIVE states)')

  api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
  query_info = api.query(api.build_query(role, job, instances=instances, statuses=states))
  if query_info.responseCode != ResponseCode.OK:
    die('Failed to query scheduler: %s' % query_info.message)
  tasks = query_info.result.scheduleStatusResult.tasks
  if tasks is None:
    return

  try:
    for task in tasks:
      d = flatten_task(task)
      print(listformat % map_values(d))
  except KeyError:
    msg = "Unknown key in format string. Valid keys are:\n"
    msg += ','.join(d.keys())
    die(msg)
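
# _convert_fmt_string turns the CLI's %field% placeholders into Python's
# %(field)s mapping-style format, so a --listformat such as '%taskId% %status%'
# can be applied directly to the flattened task dict. Standalone illustration:
def _example_listformat():
  import re
  converted = re.sub(r'%(\w+)%', lambda m: '%%(%s)s' % m.group(1), '%taskId% %status%')
  assert converted == '%(taskId)s %(status)s'
  assert converted % {'taskId': 'task-1234', 'status': 'RUNNING'} == 'task-1234 RUNNING'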
def _validate_health_check_config(config):
  # TODO(Sathya): Remove this check after the health_check_interval_secs
  # deprecation cycle is complete.
  if config.raw().has_health_check_interval_secs() and config.raw().has_health_check_config():
    die(HEALTH_CHECK_INTERVAL_SECS_ERROR)
def ssh(args, options):
  """usage: ssh cluster/role/env/job shard [args...]

  Initiate an SSH session on the machine that a shard is running on.
  """
  if not args:
    die('Job path is required')
  job_path = args.pop(0)
  try:
    cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
  except AuroraJobKey.Error as e:
    die('Invalid job path "%s": %s' % (job_path, e))
  if not args:
    die('Shard is required')
  try:
    shard = int(args.pop(0))
  except ValueError:
    die('Shard must be an integer')

  api = make_client(cluster_name)
  resp = api.query(api.build_query(role, name, set([shard]), env=env))
  check_and_log_response(resp)

  first_task = resp.result.scheduleStatusResult.tasks[0]
  remote_cmd = 'bash' if not args else ' '.join(args)
  command = DistributedCommandRunner.substitute(remote_cmd, first_task,
      api.cluster, executor_sandbox=options.executor_sandbox)

  ssh_command = ['ssh', '-t']
  role = first_task.assignedTask.task.owner.role
  slave_host = first_task.assignedTask.slaveHost

  for tunnel in options.tunnels:
    try:
      port, name = tunnel.split(':')
      port = int(port)
    except ValueError:
      die('Could not parse tunnel: %s. Must be of form PORT:NAME' % tunnel)
    if name not in first_task.assignedTask.assignedPorts:
      die('Task %s has no port named %s' % (first_task.assignedTask.taskId, name))
    ssh_command += [
        '-L', '%d:%s:%d' % (port, slave_host, first_task.assignedTask.assignedPorts[name])]

  ssh_command += ['%s@%s' % (options.ssh_user or role, slave_host), command]
  return subprocess.call(ssh_command)
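
# Each --tunnel entry is PORT:NAME and becomes an ssh -L forward from the local
# port to the task's assigned port on the slave host. Hedged sketch with
# hypothetical host and port values:
def _example_tunnel_forward():
  port, name = '8081:http'.split(':')
  assigned_ports = {'http': 31337}  # stand-in for assignedTask.assignedPorts
  forward = '%d:%s:%d' % (int(port), 'slave1.example.com', assigned_ports[name])
  assert forward == '8081:slave1.example.com:31337'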