def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

    Increases the amount of production quota allocated to a user.
    """
    # Parse all three increments first so malformed input fails before any RPC.
    cpu_delta = float(cpu_str)
    ram_delta = parse_data(ram_str)
    disk_delta = parse_data(disk_str)

    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')

    # Read the role's current quota; the new quota is current + increment.
    current = api.get_quota(role).result.getQuotaResult.quota
    current_summary = (role, current.numCpus, current.ramMb, current.diskMb)
    log.info('Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' % current_summary)

    # The stored RAM/disk values are wrapped as MB amounts before being added to
    # the parsed increments, and converted back to MB for logging and the RPC.
    target_cpu = cpu_delta + current.numCpus
    target_ram = ram_delta + Amount(current.ramMb, Data.MB)
    target_disk = disk_delta + Amount(current.diskMb, Data.MB)

    target_summary = (role, target_cpu, target_ram.as_(Data.MB), target_disk.as_(Data.MB))
    log.info(
        'Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB'
        % target_summary)

    update_resp = api.set_quota(
        role, target_cpu, target_ram.as_(Data.MB), target_disk.as_(Data.MB))
    check_and_log_response(update_resp)
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

    Increases the amount of production quota allocated to a user.
    """
    # Convert the command-line strings before talking to the scheduler.
    extra_cpu = float(cpu_str)
    extra_ram = parse_data(ram_str)
    extra_disk = parse_data(disk_str)

    options = app.get_options()
    scheduler = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == "verbose")

    # Fetch the existing allocation so the increments can be added on top of it.
    existing = scheduler.get_quota(role).result.getQuotaResult.quota
    log.info(
        "Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, existing.numCpus, existing.ramMb, existing.diskMb)
    )

    # RAM and disk are summed as MB amounts, then rendered back to MB.
    total_cpu = extra_cpu + existing.numCpus
    total_ram = extra_ram + Amount(existing.ramMb, Data.MB)
    total_disk = extra_disk + Amount(existing.diskMb, Data.MB)

    log.info(
        "Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"
        % (role, total_cpu, total_ram.as_(Data.MB), total_disk.as_(Data.MB))
    )

    outcome = scheduler.set_quota(
        role, total_cpu, total_ram.as_(Data.MB), total_disk.as_(Data.MB)
    )
    check_and_log_response(outcome)
def __init__(self, cluster, role, env, jobs, ssh_user=None):
    """Remember the job coordinates and build a scheduler client for ``cluster``.

    ``ssh_user`` falls back to ``role`` when it is not given (or is otherwise falsy).
    """
    self._cluster = cluster
    self._api = AuroraClientAPI(cluster=cluster)
    self._role, self._env, self._jobs = role, env, jobs
    self._ssh_user = ssh_user or self._role
def scheduler_backup_now(cluster):
    """usage: scheduler_backup_now cluster

    Immediately initiates a full storage backup.
    """
    options = app.get_options()
    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag; the raw
    # verbosity string would be truthy for any non-empty value.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    check_and_log_response(api.perform_backup())
def scheduler_snapshot(cluster):
    """usage: scheduler_snapshot cluster

    Request that the scheduler perform a storage snapshot and block until complete.
    """
    options = app.get_options()
    # Look up the cluster named by the argument; the previous code indexed CLUSTERS
    # with the literal string 'cluster' and ignored the caller's choice.
    # The second argument is presumably a boolean verbose flag (other callers in this
    # file pass ``verbosity == 'verbose'``), so compare instead of passing the string.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    check_and_log_response(api.snapshot())
def scheduler_unload_recovery(cluster):
    """usage: scheduler_unload_recovery cluster

    Unloads a staged recovery.
    """
    options = app.get_options()
    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag; the raw
    # verbosity string would be truthy for any non-empty value.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    check_and_log_response(api.unload_recovery())
def scheduler_stage_recovery(cluster, backup_id):
    """usage: scheduler_stage_recovery cluster backup_id

    Stages a backup for recovery.
    """
    options = app.get_options()
    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag; the raw
    # verbosity string would be truthy for any non-empty value.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    check_and_log_response(api.stage_recovery(backup_id))
def scheduler_delete_recovery_tasks(cluster, task_ids):
    """usage: scheduler_delete_recovery_tasks cluster task_ids

    Deletes a comma-separated list of task IDs from a staged recovery.
    """
    # Split the comma-separated argument; the set drops duplicate IDs.
    ids = set(task_ids.split(','))
    options = app.get_options()
    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag; the raw
    # verbosity string would be truthy for any non-empty value.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    check_and_log_response(api.delete_recovery_tasks(TaskQuery(taskIds=ids)))
def scheduler_list_backups(cluster):
    """usage: scheduler_list_backups cluster

    Lists backups available for recovery.
    """
    options = app.get_options()
    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag; the raw
    # verbosity string would be truthy for any non-empty value.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    resp = api.list_backups()
    check_and_log_response(resp)

    # Print a count header followed by one backup per line.
    backups = resp.result.listBackupsResult.backups
    print('%s available backups:' % len(backups))
    for backup in backups:
        print(backup)
def scheduler_list_job_updates(cluster):
    """usage: scheduler_list_job_updates cluster

    Lists in-flight job updates.
    """
    options = app.get_options()
    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag; the raw
    # verbosity string would be truthy for any non-empty value.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    resp = api.get_job_updates()
    check_and_log_response(resp)

    print('Role\tEnv\tJob')
    for update in resp.jobUpdates:
        job_key = update.jobKey
        if job_key:
            row = (job_key.role, job_key.environment, job_key.name)
        else:
            # Without a job key, fall back to the deprecated role/job fields; there is
            # no deprecated environment field to read, so that column prints None.
            row = (update.roleDeprecated, None, update.jobDeprecated)
        print('%s\t%s\t%s' % row)
def set_quota(cluster, role, cpu_str, ram_mb_str, disk_mb_str):
    """usage: set_quota cluster role cpu ramMb diskMb

    Alters the amount of production quota allocated to a user.
    """
    try:
        cpu = float(cpu_str)
        ram_mb = int(ram_mb_str)
        disk_mb = int(disk_mb_str)
    except ValueError:
        log.error('Invalid value')
        # Stop here: continuing would reference values that were never assigned and
        # crash with an UnboundLocalError instead of the intended error message.
        return

    options = app.get_options()
    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag; the raw
    # verbosity string would be truthy for any non-empty value.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    resp = api.set_quota(role, cpu, ram_mb, disk_mb)
    check_and_log_response(resp)
def scheduler_print_recovery_tasks(cluster):
    """usage: scheduler_print_recovery_tasks cluster

    Prints all active tasks in a staged recovery.
    """
    options = app.get_options()
    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag; the raw
    # verbosity string would be truthy for any non-empty value.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    resp = api.query_recovery(TaskQuery(statuses=ACTIVE_STATES))
    check_and_log_response(resp)

    # One tab-separated row per task, emitted through the logger.
    log.info('Role\tJob\tShard\tStatus\tTask ID')
    for task in resp.tasks:
        assigned = task.assignedTask
        conf = assigned.task
        log.info('\t'.join((
            conf.owner.role,
            conf.jobName,
            str(assigned.instanceId),
            ScheduleStatus._VALUES_TO_NAMES[task.status],
            assigned.taskId,
        )))
def __init__(self, cluster, verbosity):
    """Create the scheduler client, enabling verbose mode only for 'verbose'."""
    verbose = verbosity == 'verbose'
    self._client = AuroraClientAPI(cluster, verbose)
class MesosMaintenance(object):
    """Helpers for moving hosts of a mesos cluster into and out of maintenance.

    Hosts are processed in batches; batches are built from groups produced by a
    named grouping function.
    """

    DEFAULT_GROUPING = 'by_host'
    GROUPING_FUNCTIONS = {
        'by_host': group_by_host,
    }
    # Pause between successive maintenance-status polls while draining.
    START_MAINTENANCE_DELAY = Amount(30, Time.SECONDS)

    @classmethod
    def group_hosts(cls, hostnames, grouping_function=DEFAULT_GROUPING):
        """Bucket hostnames into sets keyed by the named grouping function's result.

        Raises ValueError when ``grouping_function`` is not a known name.
        """
        if grouping_function not in cls.GROUPING_FUNCTIONS:
            raise ValueError('Unknown grouping function %s!' % grouping_function)
        key_of = cls.GROUPING_FUNCTIONS[grouping_function]

        buckets = defaultdict(set)
        for name in hostnames:
            buckets[key_of(name)].add(name)
        return buckets

    @classmethod
    def iter_batches(cls, hostnames, batch_size, grouping_function=DEFAULT_GROUPING):
        """Yield Hosts objects, each the union of up to ``batch_size`` groups.

        Groups are visited in sorted key order. Raises ValueError (on first
        iteration) when ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError('Batch size must be > 0!')

        grouped = cls.group_hosts(hostnames, grouping_function)
        ordered = sorted(grouped.items(), key=lambda v: v[0])
        for start in range(0, len(ordered), batch_size):
            window = ordered[start:start + batch_size]
            yield Hosts(set.union(*[members for (_, members) in window]))

    def __init__(self, cluster, verbosity):
        """Create the scheduler client, enabling verbose mode only for 'verbose'."""
        verbose = verbosity == 'verbose'
        self._client = AuroraClientAPI(cluster, verbose)

    def _drain_hosts(self, drainable_hosts, clock=time):
        """This will actively turn down tasks running on hosts."""
        check_and_log_response(self._client.drain_hosts(drainable_hosts))

        # Poll until every host reports DRAINED (or the scheduler reports nothing).
        pending = list(drainable_hosts.hostNames)
        while pending:
            log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
            clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
            resp = self._client.maintenance_status(Hosts(pending))
            statuses = resp.result.maintenanceStatusResult.statuses

            # TODO(jsmith): Workaround until scheduler responds with unknown slaves in MESOS-3454
            if not statuses:
                pending = None

            for host_status in statuses:
                if host_status.mode == MaintenanceMode.DRAINED:
                    pending.remove(host_status.host)
                else:
                    log.warning('%s is currently in status %s' % (
                        host_status.host,
                        MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))

    def _complete_maintenance(self, drained_hosts):
        """End the maintenance status for a given set of hosts."""
        check_and_log_response(self._client.end_maintenance(drained_hosts))

        # Re-read the status and warn about any host that did not return to NONE.
        status_resp = self._client.maintenance_status(drained_hosts)
        for host_status in status_resp.result.maintenanceStatusResult.statuses:
            if host_status.mode == MaintenanceMode.NONE:
                continue
            log.warning('%s is DRAINING or in DRAINED' % host_status.host)

    def _operate_on_hosts(self, drained_hosts, callback):
        """Perform a given operation on a list of hosts that are ready for maintenance."""
        for hostname in drained_hosts.hostNames:
            callback(hostname)

    def end_maintenance(self, hosts):
        """Pull a list of hosts out of maintenance mode."""
        host_set = Hosts(set(hosts))
        self._complete_maintenance(host_set)

    def start_maintenance(self, hosts):
        """Put a list of hosts into maintenance mode, to de-prioritize scheduling."""
        host_set = Hosts(set(hosts))
        check_and_log_response(self._client.start_maintenance(host_set))

    def perform_maintenance(self, hosts, batch_size=1, grouping_function=DEFAULT_GROUPING,
                            callback=None):
        """Wrap a callback in between sending hosts into maintenance mode and back.

        Walk through the process of putting hosts into maintenance, draining them of
        tasks, performing an action on them once drained, then removing them from
        maintenance mode so tasks can schedule.
        """
        # Clear any existing maintenance state first, then mark all hosts at once.
        self._complete_maintenance(Hosts(set(hosts)))
        self.start_maintenance(hosts)

        for batch in self.iter_batches(hosts, batch_size, grouping_function):
            self._drain_hosts(batch)
            if callback:
                self._operate_on_hosts(batch, callback)
            self._complete_maintenance(batch)

    def check_status(self, hosts):
        """Return a list of (host, maintenance mode name) pairs for ``hosts``."""
        resp = self._client.maintenance_status(Hosts(set(hosts)))
        check_and_log_response(resp)
        return [
            (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode])
            for host_status in resp.result.maintenanceStatusResult.statuses
        ]
def query(args, options):
    """usage: query [--shards=N[,N,...]] [--states=State[,State,...]] cluster [role [job]]

    Query Mesos about jobs and tasks.
    """
    def _convert_fmt_string(fmtstr):
        # Rewrite %name% placeholders as %(name)s so the string can be applied to a dict.
        import re

        def convert(match):
            return "%%(%s)s" % match.group(1)
        return re.sub(r"%(\w+)%", convert, fmtstr)

    def flatten_task(t, d=None):
        # Recursively copy leaf attributes of nested objects into one flat dict.
        # A fresh dict is created per top-level call; a shared mutable default
        # would let values from one task leak into the next.
        if d is None:
            d = {}
        for key in t.__dict__.keys():
            val = getattr(t, key)
            try:
                val.__dict__.keys()
            except AttributeError:
                d[key] = val
            else:
                flatten_task(val, d)
        return d

    def map_values(d):
        # Translate raw values into display form (currently only the status enum).
        default_value = lambda v: v
        mapping = {"status": lambda v: ScheduleStatus._VALUES_TO_NAMES[v]}
        return dict((k, mapping.get(k, default_value)(v)) for (k, v) in d.items())

    # Only validate states when some were given; splitting an unset or empty option
    # used to fail before the default-states branch below could ever be reached.
    requested_states = options.states.split(",") if options.states else []
    for state in requested_states:
        if state not in ScheduleStatus._NAMES_TO_VALUES:
            msg = "Unknown state '%s' specified. Valid states are:\n" % state
            msg += ",".join(ScheduleStatus._NAMES_TO_VALUES.keys())
            die(msg)

    # Role, Job, Instances, States, and the listformat
    if len(args) == 0:
        die("Must specify at least cluster.")
    cluster = args[0]
    role = args[1] if len(args) > 1 else None
    job = args[2] if len(args) > 2 else None
    instances = set(map(int, options.shards.split(","))) if options.shards else set()

    if requested_states:
        states = set(map(ScheduleStatus._NAMES_TO_VALUES.get, requested_states))
    else:
        states = ACTIVE_STATES | TERMINAL_STATES
    listformat = _convert_fmt_string(options.listformat)

    # Figure out "expensive" queries here and bail out if they do not have --force
    #   - Does not specify role
    if role is None and not options.force:
        die("--force is required for expensive queries (no role specified)")

    #   - Does not specify job
    if job is None and not options.force:
        die("--force is required for expensive queries (no job specified)")

    #   - Specifies status outside of ACTIVE_STATES
    if not (states <= ACTIVE_STATES) and not options.force:
        die("--force is required for expensive queries (states outside ACTIVE states")

    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == "verbose")
    query_info = api.query(api.build_query(role, job, instances=instances, statuses=states))

    # Check the response code before touching the result payload, which may be
    # missing on a failed response.
    if query_info.responseCode != ResponseCode.OK:
        die("Failed to query scheduler: %s" % query_info.message)
    tasks = query_info.result.scheduleStatusResult.tasks
    if tasks is None:
        return

    for task in tasks:
        d = flatten_task(task)
        try:
            print(listformat % map_values(d))
        except KeyError:
            msg = "Unknown key in format string.  Valid keys are:\n"
            msg += ",".join(d.keys())
            die(msg)
class DistributedCommandRunner(object):
    """Runs a shell command over ssh on the hosts of every live task of the given jobs.

    The command is prefixed with a ``cd`` into the task sandbox and has per-task
    placeholders substituted before it is executed.
    """

    @staticmethod
    def execute(args):
        """Run one ``(hostname, role, command)`` tuple over ssh.

        Returns the combined stdout/stderr with each line prefixed by the hostname.
        """
        hostname, role, command = args
        ssh_command = ['ssh', '-n', '-q', '%s@%s' % (role, hostname), command]
        po = subprocess.Popen(ssh_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = po.communicate()
        return '\n'.join('%s:  %s' % (hostname, line) for line in output[0].splitlines())

    @classmethod
    def make_executor_path(cls, cluster, executor_name):
        """Build the run-directory path pattern for ``executor_name`` on ``cluster``."""
        parameters = cls.sandbox_args(cluster)
        parameters.update(executor_name=executor_name)
        return posixpath.join(
            '%(slave_root)s',
            'slaves/*/frameworks/*/executors/%(executor_name)s/runs',
            '%(slave_run_directory)s'
        ) % parameters

    @classmethod
    def thermos_sandbox(cls, cluster, executor_sandbox=False):
        """Return the thermos executor directory, or its 'sandbox' subdirectory by default."""
        sandbox = cls.make_executor_path(cluster, 'thermos-{{thermos.task_id}}')
        return sandbox if executor_sandbox else posixpath.join(sandbox, 'sandbox')

    @classmethod
    def sandbox_args(cls, cluster):
        """Return the slave path settings read from the cluster's CommandRunnerTrait."""
        cluster = cluster.with_trait(CommandRunnerTrait)
        return {
            'slave_root': cluster.slave_root,
            'slave_run_directory': cluster.slave_run_directory,
        }

    @classmethod
    def substitute_thermos(cls, command, task, cluster, **kw):
        """Prefix ``command`` with a cd into the thermos sandbox and bind its templates."""
        prefix_command = 'cd %s;' % cls.thermos_sandbox(cluster, **kw)
        thermos_namespace = ThermosContext(
            task_id=task.assignedTask.taskId,
            ports=task.assignedTask.assignedPorts)
        mesos_namespace = MesosContext(instance=task.assignedTask.instanceId)
        command = String(prefix_command + command) % Environment(
            thermos=thermos_namespace,
            mesos=mesos_namespace)
        return command.get()

    @classmethod
    def aurora_sandbox(cls, cluster, executor_sandbox=False):
        """Return the sandbox path pattern used for non-thermos tasks."""
        if executor_sandbox:
            return cls.make_executor_path(cluster, 'twitter')
        else:
            return '/var/run/nexus/%task_id%/sandbox'

    @classmethod
    def substitute_aurora(cls, command, task, cluster, **kw):
        """Prefix ``command`` with a cd into the sandbox and expand %...% placeholders.

        Replaces %shard_id%, %task_id% and %port:NAME% with the task's values.
        """
        command = ('cd %s;' % cls.aurora_sandbox(cluster, **kw)) + command
        command = command.replace('%shard_id%', str(task.assignedTask.instanceId))
        command = command.replace('%task_id%', task.assignedTask.taskId)
        for name, port in task.assignedTask.assignedPorts.items():
            command = command.replace('%port:' + name + '%', str(port))
        return command

    @classmethod
    def substitute(cls, command, task, cluster, **kw):
        """Dispatch to the thermos or aurora substitution based on the task's executorConfig."""
        if task.assignedTask.task.executorConfig:
            return cls.substitute_thermos(command, task, cluster, **kw)
        else:
            return cls.substitute_aurora(command, task, cluster, **kw)

    @classmethod
    def query_from(cls, role, env, job):
        """Build the TaskQuery selecting live tasks of ``role``/``env``/``job``."""
        return TaskQuery(statuses=LIVE_STATES, owner=Identity(role), jobName=job, environment=env)

    def __init__(self, cluster, role, env, jobs, ssh_user=None):
        self._cluster = cluster
        self._api = AuroraClientAPI(cluster=cluster)
        self._role = role
        self._env = env
        self._jobs = jobs
        # Log in as the role unless an explicit ssh user was supplied.
        self._ssh_user = ssh_user if ssh_user else self._role

    def resolve(self):
        """Yield every task returned for the configured jobs, skipping failed queries."""
        for job in self._jobs:
            resp = self._api.query(self.query_from(self._role, self._env, job))
            if resp.responseCode != ResponseCode.OK:
                log.error('Failed to query job: %s' % job)
                continue
            for task in resp.result.scheduleStatusResult.tasks:
                yield task

    def process_arguments(self, command, **kw):
        """Yield one ``(host, ssh_user, substituted command)`` tuple per resolved task."""
        for task in self.resolve():
            host = task.assignedTask.slaveHost
            yield (host, self._ssh_user, self.substitute(command, task, self._cluster, **kw))

    def run(self, command, parallelism=1, **kw):
        """Execute ``command`` for every task and print each result as it completes."""
        threadpool = ThreadPool(processes=parallelism)
        for result in threadpool.imap_unordered(
                self.execute, self.process_arguments(command, **kw)):
            # Function-call form works on both Python 2 and 3 and matches the
            # print(...) usage elsewhere in this file.
            print(result)
def query(args, options):
    """usage: query [--shards=N[,N,...]] [--states=State[,State,...]] cluster [role [job]]

    Query Mesos about jobs and tasks.
    """
    def _convert_fmt_string(fmtstr):
        # Rewrite %name% placeholders as %(name)s so the string can be applied to a dict.
        import re

        def convert(match):
            return "%%(%s)s" % match.group(1)
        return re.sub(r'%(\w+)%', convert, fmtstr)

    def flatten_task(t, d=None):
        # Recursively copy leaf attributes of nested objects into one flat dict.
        # A fresh dict is created per top-level call; a shared mutable default
        # would let values from one task leak into the next.
        if d is None:
            d = {}
        for key in t.__dict__.keys():
            val = getattr(t, key)
            try:
                val.__dict__.keys()
            except AttributeError:
                d[key] = val
            else:
                flatten_task(val, d)
        return d

    def map_values(d):
        # Translate raw values into display form (currently only the status enum).
        default_value = lambda v: v
        mapping = {
            'status': lambda v: ScheduleStatus._VALUES_TO_NAMES[v],
        }
        return dict((k, mapping.get(k, default_value)(v)) for (k, v) in d.items())

    # Only validate states when some were given; splitting an unset or empty option
    # used to fail before the default-states branch below could ever be reached.
    requested_states = options.states.split(',') if options.states else []
    for state in requested_states:
        if state not in ScheduleStatus._NAMES_TO_VALUES:
            msg = "Unknown state '%s' specified.  Valid states are:\n" % state
            msg += ','.join(ScheduleStatus._NAMES_TO_VALUES.keys())
            die(msg)

    # Role, Job, Instances, States, and the listformat
    if len(args) == 0:
        die('Must specify at least cluster.')
    cluster = args[0]
    role = args[1] if len(args) > 1 else None
    job = args[2] if len(args) > 2 else None
    instances = set(map(int, options.shards.split(','))) if options.shards else set()

    if requested_states:
        states = set(map(ScheduleStatus._NAMES_TO_VALUES.get, requested_states))
    else:
        states = ACTIVE_STATES | TERMINAL_STATES
    listformat = _convert_fmt_string(options.listformat)

    # Figure out "expensive" queries here and bail out if they do not have --force
    #   - Does not specify role
    if role is None and not options.force:
        die('--force is required for expensive queries (no role specified)')

    #   - Does not specify job
    if job is None and not options.force:
        die('--force is required for expensive queries (no job specified)')

    #   - Specifies status outside of ACTIVE_STATES
    if not (states <= ACTIVE_STATES) and not options.force:
        die('--force is required for expensive queries (states outside ACTIVE states')

    # Other callers in this file pass ``verbosity == 'verbose'`` as the second
    # AuroraClientAPI argument, so it is presumably a boolean verbose flag.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == 'verbose')
    query_info = api.query(api.build_query(role, job, instances=instances, statuses=states))

    # Check the response code before touching the result payload, which may be
    # missing on a failed response.
    if query_info.responseCode != ResponseCode.OK:
        die('Failed to query scheduler: %s' % query_info.message)
    tasks = query_info.result.scheduleStatusResult.tasks
    if tasks is None:
        return

    for task in tasks:
        d = flatten_task(task)
        try:
            print(listformat % map_values(d))
        except KeyError:
            msg = "Unknown key in format string.  Valid keys are:\n"
            msg += ','.join(d.keys())
            die(msg)
class MesosMaintenance(object):
    """Helpers for moving hosts of a mesos cluster into and out of maintenance.

    Hosts are processed in batches; batches are built from groups produced by a
    named grouping function.
    """

    DEFAULT_GROUPING = 'by_host'
    GROUPING_FUNCTIONS = {
        'by_host': group_by_host,
    }
    # Pause between successive maintenance-status polls while draining.
    START_MAINTENANCE_DELAY = Amount(30, Time.SECONDS)

    @classmethod
    def group_hosts(cls, hostnames, grouping_function=DEFAULT_GROUPING):
        """Bucket hostnames into sets keyed by the named grouping function's result.

        Raises ValueError when ``grouping_function`` is not a known name.
        """
        if grouping_function not in cls.GROUPING_FUNCTIONS:
            raise ValueError('Unknown grouping function %s!' % grouping_function)
        classify = cls.GROUPING_FUNCTIONS[grouping_function]

        by_group = defaultdict(set)
        for host in hostnames:
            by_group[classify(host)].add(host)
        return by_group

    @classmethod
    def iter_batches(cls, hostnames, batch_size, grouping_function=DEFAULT_GROUPING):
        """Yield Hosts objects, each the union of up to ``batch_size`` groups.

        Groups are visited in sorted key order. Raises ValueError (on first
        iteration) when ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError('Batch size must be > 0!')

        group_items = sorted(
            cls.group_hosts(hostnames, grouping_function).items(),
            key=lambda v: v[0])
        for offset in range(0, len(group_items), batch_size):
            chunk = group_items[offset:offset + batch_size]
            yield Hosts(set.union(*[hostset for (_, hostset) in chunk]))

    def __init__(self, cluster, verbosity):
        """Create the scheduler client, enabling verbose mode only for 'verbose'."""
        is_verbose = verbosity == 'verbose'
        self._client = AuroraClientAPI(cluster, is_verbose)

    def _drain_hosts(self, drainable_hosts, clock=time):
        """This will actively turn down tasks running on hosts."""
        check_and_log_response(self._client.drain_hosts(drainable_hosts))

        # Poll until every host reports DRAINED (or the scheduler reports nothing).
        remaining = list(drainable_hosts.hostNames)
        while remaining:
            log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
            clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
            resp = self._client.maintenance_status(Hosts(remaining))
            reported = resp.result.maintenanceStatusResult.statuses

            # TODO(jsmith): Workaround until scheduler responds with unknown slaves in MESOS-3454
            if not reported:
                remaining = None

            for host_status in reported:
                if host_status.mode == MaintenanceMode.DRAINED:
                    remaining.remove(host_status.host)
                else:
                    log.warning(
                        '%s is currently in status %s' %
                        (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))

    def _complete_maintenance(self, drained_hosts):
        """End the maintenance status for a given set of hosts."""
        check_and_log_response(self._client.end_maintenance(drained_hosts))

        # Re-read the status and warn about any host that did not return to NONE.
        after = self._client.maintenance_status(drained_hosts)
        for host_status in after.result.maintenanceStatusResult.statuses:
            if host_status.mode == MaintenanceMode.NONE:
                continue
            log.warning('%s is DRAINING or in DRAINED' % host_status.host)

    def _operate_on_hosts(self, drained_hosts, callback):
        """Perform a given operation on a list of hosts that are ready for maintenance."""
        for hostname in drained_hosts.hostNames:
            callback(hostname)

    def end_maintenance(self, hosts):
        """Pull a list of hosts out of maintenance mode."""
        unique_hosts = Hosts(set(hosts))
        self._complete_maintenance(unique_hosts)

    def start_maintenance(self, hosts):
        """Put a list of hosts into maintenance mode, to de-prioritize scheduling."""
        unique_hosts = Hosts(set(hosts))
        check_and_log_response(self._client.start_maintenance(unique_hosts))

    def perform_maintenance(self, hosts, batch_size=1, grouping_function=DEFAULT_GROUPING,
                            callback=None):
        """Wrap a callback in between sending hosts into maintenance mode and back.

        Walk through the process of putting hosts into maintenance, draining them of
        tasks, performing an action on them once drained, then removing them from
        maintenance mode so tasks can schedule.
        """
        # Clear any existing maintenance state first, then mark all hosts at once.
        self._complete_maintenance(Hosts(set(hosts)))
        self.start_maintenance(hosts)

        for host_batch in self.iter_batches(hosts, batch_size, grouping_function):
            self._drain_hosts(host_batch)
            if callback:
                self._operate_on_hosts(host_batch, callback)
            self._complete_maintenance(host_batch)

    def check_status(self, hosts):
        """Return a list of (host, maintenance mode name) pairs for ``hosts``."""
        resp = self._client.maintenance_status(Hosts(set(hosts)))
        check_and_log_response(resp)
        return [
            (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode])
            for host_status in resp.result.maintenanceStatusResult.statuses
        ]