def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

    Increases the amount of production quota allocated to a user.
    """
    # Parse the requested increments first so malformed input fails before
    # any scheduler call is made.
    cpu_delta = float(cpu_str)
    ram_delta = parse_data(ram_str)
    disk_delta = parse_data(disk_str)

    options = app.get_options()
    target = CLUSTERS[cluster]
    verbose = options.verbosity == 'verbose'
    client = AuroraClientAPI(target, verbose)

    # Read the role's current quota; the increments are added on top of it.
    current = client.get_quota(role).result.getQuotaResult.quota
    log.info('Current quota for %s:\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
             (role, current.numCpus, current.ramMb, current.diskMb))

    cpu_total = cpu_delta + current.numCpus
    ram_total = ram_delta + Amount(current.ramMb, Data.MB)
    disk_total = disk_delta + Amount(current.diskMb, Data.MB)

    # set_quota is given RAM and disk converted to megabytes.
    new_quota = (cpu_total, ram_total.as_(Data.MB), disk_total.as_(Data.MB))
    log.info('Attempting to update quota for %s to\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB' %
             ((role,) + new_quota))

    check_and_log_response(client.set_quota(role, *new_quota))
def increase_quota(cluster, role, cpu_str, ram_str, disk_str):
    """usage: increase_quota cluster role cpu ram[unit] disk[unit]

    Increases the amount of production quota allocated to a user.
    """
    extra_cpu = float(cpu_str)
    extra_ram = parse_data(ram_str)
    extra_disk = parse_data(disk_str)

    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity == "verbose")

    # Shared tail of both log messages: one line per resource.
    resource_lines = "\n\tCPU\t%s\n\tRAM\t%s MB\n\tDisk\t%s MB"

    existing = api.get_quota(role).result.getQuotaResult.quota
    log.info(
        ("Current quota for %s:" + resource_lines)
        % (role, existing.numCpus, existing.ramMb, existing.diskMb)
    )

    # New totals are the existing quota plus the requested increments.
    total_cpu = extra_cpu + existing.numCpus
    total_ram_mb = (extra_ram + Amount(existing.ramMb, Data.MB)).as_(Data.MB)
    total_disk_mb = (extra_disk + Amount(existing.diskMb, Data.MB)).as_(Data.MB)

    log.info(
        ("Attempting to update quota for %s to" + resource_lines)
        % (role, total_cpu, total_ram_mb, total_disk_mb)
    )
    response = api.set_quota(role, total_cpu, total_ram_mb, total_disk_mb)
    check_and_log_response(response)
def list_jobs(cluster_and_role):
    """usage: list_jobs [--show_cron_schedule] cluster/role

    Prints every job returned by the scheduler for the given role.
    """
    def show_job_simple(job):
        # One line per job; cron fields are appended only when requested.
        if options.show_cron_schedule:
            print(('{0}/{1.key.role}/{1.key.environment}/{1.key.name}' +
                   '\t\'{1.cronSchedule}\'\t{1.cronCollisionPolicy}').format(cluster, job))
        else:
            print('{0}/{1.key.role}/{1.key.environment}/{1.key.name}'.format(cluster, job))

    def show_job_pretty(job):
        # Multi-line form: job path followed by its cron settings.
        print("Job %s/%s/%s/%s:" %
              (cluster, job.key.role, job.key.environment, job.key.name))
        print('\tcron schedule: %s' % job.cronSchedule)
        print('\tcron policy: %s' % job.cronCollisionPolicy)

    options = app.get_options()
    # The pretty printer is only used when the cron schedule is also requested.
    if options.show_cron_schedule and options.pretty:
        print_fn = show_job_pretty
    else:
        print_fn = show_job_simple

    # The argument must contain exactly one '/', separating cluster from role.
    if cluster_and_role.count('/') != 1:
        die('list_jobs parameter must be in cluster/role format')
    cluster, role = cluster_and_role.split('/')

    api = make_client(cluster)
    resp = api.get_jobs(role)
    check_and_log_response(resp)
    for job in resp.result.getJobsResult.configs:
        print_fn(job)
def restart(args, options):
    """usage: restart cluster/role/env/job
                 [--shards=SHARDS]
                 [--batch_size=INT]
                 [--updater_health_check_interval_seconds=SECONDS]
                 [--max_per_shard_failures=INT]
                 [--max_total_failures=INT]
                 [--restart_threshold=INT]
                 [--watch_secs=SECONDS]

    Performs a rolling restart of shards within a job.

    Restarts are fully controlled client-side, so aborting halts the restart.
    """
    api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
        args, options, make_client_factory())

    # A job config is only loaded when a config file was supplied.
    if config_file:
        config = get_job_config(job_key.to_path(), config_file, options)
    else:
        config = None

    updater_config = UpdaterConfig(
        options.batch_size,
        options.restart_threshold,
        options.watch_secs,
        options.max_per_shard_failures,
        options.max_total_failures)

    resp = api.restart(
        job_key,
        options.shards,
        updater_config,
        options.health_check_interval_seconds,
        config=config)
    check_and_log_response(resp)

    scheduler_url = api.scheduler.scheduler().url
    handle_open(scheduler_url, job_key.role, job_key.env, job_key.name)
def scheduler_stage_recovery(cluster, backup_id):
    """usage: scheduler_stage_recovery cluster backup_id

    Stages a backup for recovery.
    """
    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    # Ask the scheduler to stage the named backup, then report the outcome.
    resp = client.stage_recovery(backup_id)
    check_and_log_response(resp)
def scheduler_snapshot(cluster):
    """usage: scheduler_snapshot cluster

    Request that the scheduler perform a storage snapshot and block until complete.
    """
    options = app.get_options()
    # Look up the cluster named by the argument (previously the literal
    # string "cluster" was used as the key, ignoring the argument).
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    check_and_log_response(client.snapshot())
def _complete_maintenance(self, drained_hosts):
    """End the maintenance status for a given set of hosts.

    After ending maintenance, the hosts' status is queried again and a warning
    is logged for each host whose mode is not NONE.
    """
    end_resp = self._client.end_maintenance(drained_hosts)
    check_and_log_response(end_resp)

    status_resp = self._client.maintenance_status(drained_hosts)
    statuses = status_resp.result.maintenanceStatusResult.statuses
    # Lazily pick out the hosts that have not returned to NONE.
    still_in_maintenance = (
        entry.host for entry in statuses if entry.mode != MaintenanceMode.NONE)
    for host in still_in_maintenance:
        log.warning('%s is DRAINING or in DRAINED' % host)
def check_status(self, hosts):
    """Return a list of (host, maintenance mode name) pairs for the given hosts."""
    resp = self._client.maintenance_status(Hosts(set(hosts)))
    check_and_log_response(resp)
    # Translate each numeric mode into its name via the enum's lookup table.
    mode_names = MaintenanceMode._VALUES_TO_NAMES
    return [
        (entry.host, mode_names[entry.mode])
        for entry in resp.result.maintenanceStatusResult.statuses
    ]
def scheduler_backup_now(cluster):
    """usage: scheduler_backup_now cluster

    Immediately initiates a full storage backup.
    """
    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    # Trigger the backup and report the scheduler's response.
    resp = client.perform_backup()
    check_and_log_response(resp)
def scheduler_unload_recovery(cluster):
    """usage: scheduler_unload_recovery cluster

    Unloads a staged recovery.
    """
    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    # Drop whatever recovery is currently staged and report the response.
    resp = client.unload_recovery()
    check_and_log_response(resp)
def scheduler_snapshot(cluster):
    """usage: scheduler_snapshot cluster

    Request that the scheduler perform a storage snapshot and block until complete.
    """
    options = app.get_options()
    # Index CLUSTERS with the caller's cluster argument; the previous code
    # used the literal key 'cluster' and so never honoured the argument.
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    resp = api.snapshot()
    check_and_log_response(resp)
def query_matches(self):
    """Return the set of job keys under this role whose job name equals self._name."""
    resp = self._client.get_jobs(self._role)
    check_and_log_response(resp)

    matches = set()
    for job in resp.result.getJobsResult.configs:
        key = job.key
        # Only jobs with exactly the requested name are kept.
        if key.name != self._name:
            continue
        matches.add(
            AuroraJobKey(self._client.cluster.name, key.role, key.environment, key.name))
    return matches
def scheduler_backup_now(cluster):
    """usage: scheduler_backup_now cluster

    Immediately initiates a full storage backup.
    """
    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    backup_resp = api.perform_backup()
    # Report the scheduler's response to the backup request.
    check_and_log_response(backup_resp)
def scheduler_stage_recovery(cluster, backup_id):
    """usage: scheduler_stage_recovery cluster backup_id

    Stages a backup for recovery.
    """
    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    stage_resp = api.stage_recovery(backup_id)
    # Report the scheduler's response to the staging request.
    check_and_log_response(stage_resp)
def scheduler_unload_recovery(cluster):
    """usage: scheduler_unload_recovery cluster

    Unloads a staged recovery.
    """
    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    unload_resp = api.unload_recovery()
    # Report the scheduler's response to the unload request.
    check_and_log_response(unload_resp)
def check_status(self, hosts):
    """Query maintenance status for hosts and return (host, mode name) tuples."""
    host_set = Hosts(set(hosts))
    resp = self._client.maintenance_status(host_set)
    check_and_log_response(resp)

    def describe(host_status):
        # Pair the host with the symbolic name of its maintenance mode.
        return (host_status.host, MaintenanceMode._VALUES_TO_NAMES[host_status.mode])

    return [describe(s) for s in resp.result.maintenanceStatusResult.statuses]
def scheduler_delete_recovery_tasks(cluster, task_ids):
    """usage: scheduler_delete_recovery_tasks cluster task_ids

    Deletes a comma-separated list of task IDs from a staged recovery.
    """
    # Duplicated IDs in the comma-separated argument collapse into one entry.
    ids = set(task_ids.split(","))
    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    query = TaskQuery(taskIds=ids)
    resp = client.delete_recovery_tasks(query)
    check_and_log_response(resp)
def scheduler_delete_recovery_tasks(cluster, task_ids):
    """usage: scheduler_delete_recovery_tasks cluster task_ids

    Deletes a comma-separated list of task IDs from a staged recovery.
    """
    unique_ids = set(task_ids.split(','))
    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    # The deletion is expressed as a task query selecting the given IDs.
    delete_resp = api.delete_recovery_tasks(TaskQuery(taskIds=unique_ids))
    check_and_log_response(delete_resp)
def scheduler_list_backups(cluster):
    """usage: scheduler_list_backups cluster

    Lists backups available for recovery.
    """
    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    resp = client.list_backups()
    check_and_log_response(resp)

    available = resp.result.listBackupsResult.backups
    # Header with the count, then one backup per line.
    print("%s available backups:" % len(available))
    for entry in available:
        print(entry)
def scheduler_list_backups(cluster):
    """usage: scheduler_list_backups cluster

    Lists backups available for recovery.
    """
    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    list_resp = api.list_backups()
    check_and_log_response(list_resp)

    backup_names = list_resp.result.listBackupsResult.backups
    count = len(backup_names)
    print('%s available backups:' % count)
    # Each backup is printed on its own line after the count header.
    for backup_name in backup_names:
        print(backup_name)
def start_cron(args, options):
    """usage: start_cron cluster/role/env/job

    Invokes a cron job immediately, out of its normal cron cycle.
    This does not affect the cron cycle in any way.
    """
    api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
        args, options, make_client_factory())

    # A job config is only loaded when a config file was supplied.
    config = None
    if config_file:
        config = get_job_config(job_key.to_path(), config_file, options)

    resp = api.start_cronjob(job_key, config=config)
    check_and_log_response(resp)

    scheduler_url = api.scheduler.scheduler().url
    handle_open(scheduler_url, job_key.role, job_key.env, job_key.name)
def cancel_update(args, options):
    """usage: cancel_update cluster/role/env/job

    Unlocks a job for updates.
    A job may be locked if a client's update session terminated abnormally,
    or if another user is actively updating the job.  This command should only
    be used when the user is confident that they are not conflicting with another user.
    """
    api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
        args, options, make_client_factory())

    # A job config is only loaded when a config file was supplied.
    if config_file:
        config = get_job_config(job_key.to_path(), config_file, options)
    else:
        config = None

    cancel_resp = api.cancel_update(job_key, config=config)
    check_and_log_response(cancel_resp)
def scheduler_list_job_updates(cluster):
    """usage: scheduler_list_job_updates cluster

    Lists in-flight job updates.
    """
    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    resp = client.get_job_updates()
    check_and_log_response(resp)

    print('Role\tEnv\tJob')
    for update in resp.jobUpdates:
        key = update.jobKey
        if key:
            row = (key.role, key.environment, key.name)
        else:
            # No job key: fall back to the deprecated role/job fields; the
            # environment column is printed as None.
            row = (update.roleDeprecated, None, update.jobDeprecated)
        print('%s\t%s\t%s' % row)
def kill(args, options):
    """usage: kill cluster/role/env/job

    Kills a running job, blocking until all tasks have terminated.

    Default behaviour is to kill all shards in the job, but the kill
    can be limited to specific shards with the --shards option
    """
    api, job_key, config_file = LiveJobDisambiguator.disambiguate_args_or_die(
        args, options, make_client_factory())

    # From here on the application-level options replace the ones passed in.
    options = app.get_options()

    config = None
    if config_file:
        config = get_job_config(job_key.to_path(), config_file, options)

    kill_resp = api.kill_job(job_key, options.shards, config=config)
    check_and_log_response(kill_resp)

    scheduler_url = api.scheduler.scheduler().url
    handle_open(scheduler_url, job_key.role, job_key.env, job_key.name)
def set_quota(cluster, role, cpu_str, ram_mb_str, disk_mb_str):
    """usage: set_quota cluster role cpu ramMb diskMb

    Alters the amount of production quota allocated to a user.
    """
    import sys  # local import: only needed for the early exit below

    try:
        cpu = float(cpu_str)
        ram_mb = int(ram_mb_str)
        disk_mb = int(disk_mb_str)
    except ValueError:
        log.error("Invalid value")
        # Stop here: falling through would reach set_quota with cpu/ram_mb/
        # disk_mb unbound and fail with a NameError instead of a clean exit.
        sys.exit(1)

    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    resp = client.set_quota(role, cpu, ram_mb, disk_mb)
    check_and_log_response(resp)
def set_quota(cluster, role, cpu_str, ram_mb_str, disk_mb_str):
    """usage: set_quota cluster role cpu ramMb diskMb

    Alters the amount of production quota allocated to a user.
    """
    import sys  # local import: only needed for the early exit below

    try:
        cpu = float(cpu_str)
        ram_mb = int(ram_mb_str)
        disk_mb = int(disk_mb_str)
    except ValueError:
        log.error('Invalid value')
        # Abort after logging; previously execution continued and the
        # set_quota call below failed on the unbound cpu/ram_mb/disk_mb names.
        sys.exit(1)

    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    resp = api.set_quota(role, cpu, ram_mb, disk_mb)
    check_and_log_response(resp)
def _drain_hosts(self, drainable_hosts, clock=time):
    """This will actively turn down tasks running on hosts.

    After requesting the drain, polls maintenance status (sleeping
    START_MAINTENANCE_DELAY between polls) until every host reports DRAINED.
    """
    check_and_log_response(self._client.drain_hosts(drainable_hosts))

    pending = list(drainable_hosts.hostNames)
    while pending:
        log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
        clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))

        resp = self._client.maintenance_status(Hosts(pending))
        statuses = resp.result.maintenanceStatusResult.statuses

        #TODO(jsmith): Workaround until scheduler responds with unknown slaves in MESOS-3454
        if not statuses:
            # Nothing reported: clear the pending list so the loop ends.
            pending = None

        for host_status in statuses:
            if host_status.mode == MaintenanceMode.DRAINED:
                pending.remove(host_status.host)
                continue
            log.warning('%s is currently in status %s' %
                        (host_status.host,
                         MaintenanceMode._VALUES_TO_NAMES[host_status.mode]))
def scheduler_print_recovery_tasks(cluster):
    """usage: scheduler_print_recovery_tasks cluster

    Prints all active tasks in a staged recovery.
    """
    options = app.get_options()
    client = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    resp = client.query_recovery(TaskQuery(statuses=ACTIVE_STATES))
    check_and_log_response(resp)

    # Tab-separated table: header first, then one row per task.
    log.info('Role\tJob\tShard\tStatus\tTask ID')
    for task in resp.tasks:
        assigned = task.assignedTask
        conf = assigned.task
        columns = [
            conf.owner.role,
            conf.jobName,
            str(assigned.instanceId),
            ScheduleStatus._VALUES_TO_NAMES[task.status],
            assigned.taskId,
        ]
        log.info('\t'.join(columns))
def ssh(args, options):
    """usage: ssh cluster/role/env/job shard [args...]

    Initiate an SSH session on the machine that a shard is running on.
    """
    if not args:
        die('Job path is required')
    job_path = args.pop(0)
    try:
        cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
    except AuroraJobKey.Error as e:
        die('Invalid job path "%s": %s' % (job_path, e))
    if not args:
        die('Shard is required')
    try:
        shard = int(args.pop(0))
    except ValueError:
        die('Shard must be an integer')

    api = make_client(cluster_name)
    # shard is already an int here, so it is passed to the query as-is.
    resp = api.query(api.build_query(role, name, set([shard]), env=env))
    check_and_log_response(resp)

    # Guard against an empty result instead of failing on tasks[0].
    tasks = resp.result.scheduleStatusResult.tasks
    if not tasks:
        die('No tasks found for %s shard %s' % (job_path, shard))
    first_task = tasks[0]
    assigned = first_task.assignedTask

    # Any remaining arguments form the remote command; default to a shell.
    remote_cmd = 'bash' if not args else ' '.join(args)
    command = DistributedCommandRunner.substitute(
        remote_cmd, first_task, api.cluster, executor_sandbox=options.executor_sandbox)

    ssh_command = ['ssh', '-t']
    task_role = assigned.task.owner.role
    slave_host = assigned.slaveHost

    # Each tunnel is LOCAL_PORT:PORT_NAME, forwarded to the task's assigned
    # port of that name on the slave host.
    for tunnel in options.tunnels:
        try:
            port, port_name = tunnel.split(':')
            port = int(port)
        except ValueError:
            die('Could not parse tunnel: %s.  Must be of form PORT:NAME' % tunnel)
        if port_name not in assigned.assignedPorts:
            die('Task %s has no port named %s' % (assigned.taskId, port_name))
        ssh_command += [
            '-L', '%d:%s:%d' % (port, slave_host, assigned.assignedPorts[port_name])]

    # Log in as the explicitly requested ssh user, else as the task's role.
    ssh_command += ['%s@%s' % (options.ssh_user or task_role, slave_host), command]
    return subprocess.call(ssh_command)
def scheduler_list_job_updates(cluster):
    """usage: scheduler_list_job_updates cluster

    Lists in-flight job updates.
    """
    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    updates_resp = api.get_job_updates()
    check_and_log_response(updates_resp)

    def row_for(update):
        # Prefer the job key; otherwise use the deprecated role/job fields
        # and leave the environment as None.
        job_key = update.jobKey
        if not job_key:
            return (update.roleDeprecated, None, update.jobDeprecated)
        return (job_key.role, job_key.environment, job_key.name)

    print("Role\tEnv\tJob")
    for update in updates_resp.jobUpdates:
        print("%s\t%s\t%s" % row_for(update))
def _drain_hosts(self, drainable_hosts, clock=time):
    """This will actively turn down tasks running on hosts.

    Issues the drain request, then repeatedly sleeps and re-checks maintenance
    status, dropping each host from the waiting list once it reports DRAINED.
    """
    drain_resp = self._client.drain_hosts(drainable_hosts)
    check_and_log_response(drain_resp)

    waiting_on = list(drainable_hosts.hostNames)
    while waiting_on:
        log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
        clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))

        status_resp = self._client.maintenance_status(Hosts(waiting_on))
        reported = status_resp.result.maintenanceStatusResult.statuses

        #TODO(jsmith): Workaround until scheduler responds with unknown slaves in MESOS-3454
        if not reported:
            # An empty report ends the wait: the loop condition sees None.
            waiting_on = None

        for entry in reported:
            drained = entry.mode == MaintenanceMode.DRAINED
            if drained:
                waiting_on.remove(entry.host)
            else:
                mode_name = MaintenanceMode._VALUES_TO_NAMES[entry.mode]
                log.warning('%s is currently in status %s' % (entry.host, mode_name))
def update(job_spec, config_file):
    """usage: update cluster/role/env/job config

    Performs a rolling upgrade on a running job, using the update configuration
    within the config file as a control for update velocity and failure tolerance.

    Updates are fully controlled client-side, so aborting an update halts the
    update and leaves the job in a 'locked' state on the scheduler.
    Subsequent update attempts will fail until the update is 'unlocked' using the
    'cancel_update' command.

    The updater only takes action on shards in a job that have changed, meaning
    that changing a single shard will only induce a restart on the changed shard.

    You may want to consider using the 'diff' subcommand before updating,
    to preview what changes will take effect.
    """
    def warn_if_dangerous_change(api, job_spec, config):
        # Get the current job status, so that we can check if there's anything
        # dangerous about this update.
        # NOTE(review): job_key is never read below; kept in case constructing
        # the key is relied on for validation -- confirm before removing.
        job_key = AuroraJobKey(config.cluster(), config.role(), config.environment(), config.name())
        resp = api.query(api.build_query(config.role(), config.name(),
                                         statuses=ACTIVE_STATES, env=config.environment()))
        if resp.responseCode != ResponseCode.OK:
            die('Could not get job status from server for comparison: %s' % resp.message)
        remote_tasks = [t.assignedTask.task for t in resp.result.scheduleStatusResult.tasks]

        resp = api.populate_job_config(config)
        if resp.responseCode != ResponseCode.OK:
            die('Server could not populate job config for comparison: %s' % resp.message)
        local_task_count = len(resp.result.populateJobResult.populated)
        remote_task_count = len(remote_tasks)

        # Warn when the task count grows or shrinks by a factor of four or more,
        # or drops to zero.  The shrink test used to be
        # `local <= 4 * remote`, which together with the growth test was true
        # for every pair of counts, so the warning fired on every update.
        if (local_task_count >= 4 * remote_task_count
                or 4 * local_task_count <= remote_task_count
                or local_task_count == 0):
            print('Warning: this update is a large change. Press ^c within 5 seconds to abort')
            time.sleep(5)

    options = app.get_options()
    config = get_job_config(job_spec, config_file, options)
    api = make_client(config.cluster())

    # --force skips the large-change sanity check.
    if not options.force:
        warn_if_dangerous_change(api, job_spec, config)

    resp = api.update_job(config, options.health_check_interval_seconds, options.shards)
    check_and_log_response(resp)
def create(job_spec, config_file):
    """usage: create cluster/role/env/job config

    Creates a job based on a configuration file.
    """
    options = app.get_options()
    try:
        config = get_job_config(job_spec, config_file, options)
    except ValueError as err:
        # A ValueError while loading the config is reported and exits with 1.
        print("Error: %s" % err)
        sys.exit(1)

    api = make_client(config.cluster())
    # The monitor is built before the job is created.
    monitor = JobMonitor(api, config.role(), config.environment(), config.name())

    create_resp = api.create_job(config)
    check_and_log_response(create_resp)

    scheduler_url = api.scheduler.scheduler().url
    handle_open(scheduler_url, config.role(), config.environment(), config.name())

    # Optionally block until the job reaches the requested state.
    if options.wait_until == 'RUNNING':
        monitor.wait_until(monitor.running_or_finished)
    elif options.wait_until == 'FINISHED':
        monitor.wait_until(monitor.terminal)
def scheduler_print_recovery_tasks(cluster):
    """usage: scheduler_print_recovery_tasks cluster

    Prints all active tasks in a staged recovery.
    """
    options = app.get_options()
    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    # Only tasks in an active state are requested from the staged recovery.
    active_query = TaskQuery(statuses=ACTIVE_STATES)
    recovery_resp = api.query_recovery(active_query)
    check_and_log_response(recovery_resp)

    log.info("Role\tJob\tShard\tStatus\tTask ID")
    for task in recovery_resp.tasks:
        assigned = task.assignedTask
        task_conf = assigned.task
        status_name = ScheduleStatus._VALUES_TO_NAMES[task.status]
        row = (
            task_conf.owner.role,
            task_conf.jobName,
            str(assigned.instanceId),
            status_name,
            assigned.taskId,
        )
        log.info("\t".join(row))
def start_maintenance(self, hosts):
    """Put a list of hosts into maintenance mode, to de-prioritize scheduling."""
    # Duplicate host names are collapsed before the request is sent.
    unique_hosts = Hosts(set(hosts))
    resp = self._client.start_maintenance(unique_hosts)
    check_and_log_response(resp)
def status(args, options):
    """usage: status cluster/role/env/job

    Fetches and prints information about the active tasks in a job.
    """
    def is_active(task):
        return task.status in ACTIVE_STATES

    def print_task(scheduled_task):
        # Builds the multi-line detail string for one task; despite the name,
        # nothing is printed here -- the caller logs the returned text.
        assigned_task = scheduled_task.assignedTask
        task_info = assigned_task.task
        parts = []
        if task_info:
            parts.append('''cpus: %s, ram: %s MB, disk: %s MB''' %
                         (task_info.numCpus, task_info.ramMb, task_info.diskMb))
        if assigned_task.assignedPorts:
            parts.append('\n\tports: %s' % assigned_task.assignedPorts)
        parts.append('\n\tfailure count: %s (max %s)' %
                     (scheduled_task.failureCount, task_info.maxTaskFailures))
        parts.append('\n\tevents:')
        for event in scheduled_task.taskEvents:
            # The timestamp is divided by 1000 before conversion
            # (presumably milliseconds -> seconds; confirm against the schema).
            parts.append('\n\t\t %s %s: %s' %
                         (datetime.fromtimestamp(event.timestamp / 1000),
                          ScheduleStatus._VALUES_TO_NAMES[event.status],
                          event.message))
        parts.append('\n\tpackages:')
        for pkg in assigned_task.task.packages:
            parts.append('\n\t\trole: %s, package: %s, version: %s' %
                         (pkg.role, pkg.name, pkg.version))
        return ''.join(parts)

    def print_tasks(tasks):
        # Logs a summary line plus details for each task, then its packages.
        for task in tasks:
            task_string = print_task(task)
            assigned = task.assignedTask
            log.info('role: %s, env: %s, name: %s, shard: %s, status: %s on %s\n%s' %
                     (assigned.task.owner.role,
                      assigned.task.environment,
                      assigned.task.jobName,
                      assigned.instanceId,
                      ScheduleStatus._VALUES_TO_NAMES[task.status],
                      assigned.slaveHost,
                      task_string))
            for pkg in assigned.task.packages:
                log.info('\tpackage %s/%s/%s' % (pkg.role, pkg.name, pkg.version))

    api, job_key, _ = LiveJobDisambiguator.disambiguate_args_or_die(
        args, options, make_client_factory())
    resp = api.check_status(job_key)
    check_and_log_response(resp)

    tasks = resp.result.scheduleStatusResult.tasks
    if tasks:
        # Real lists are built (rather than filter()) because len() is taken
        # below; filter() returns an iterator without len() on Python 3.
        active_tasks = [task for task in tasks if is_active(task)]
        log.info('Active Tasks (%s)' % len(active_tasks))
        print_tasks(active_tasks)

        inactive_tasks = [task for task in tasks if not is_active(task)]
        log.info('Inactive Tasks (%s)' % len(inactive_tasks))
        print_tasks(inactive_tasks)
    else:
        log.info('No tasks found.')
def start_maintenance(self, hosts):
    """Put a list of hosts into maintenance mode, to de-prioritize scheduling."""
    host_names = set(hosts)
    # The de-duplicated names are wrapped in a Hosts struct for the request.
    start_resp = self._client.start_maintenance(Hosts(host_names))
    check_and_log_response(start_resp)
def query_matches(self):
    """Look up the role's jobs and return keys for those named self._name."""
    jobs_resp = self._client.get_jobs(self._role)
    check_and_log_response(jobs_resp)
    configs = jobs_resp.result.getJobsResult.configs
    # Set comprehension: one AuroraJobKey per config whose name matches.
    return {
        AuroraJobKey(
            self._client.cluster.name,
            cfg.key.role,
            cfg.key.environment,
            cfg.key.name)
        for cfg in configs
        if cfg.key.name == self._name
    }