Пример #1
0
def read_commands_from_stdin():
    """Prompt for commands on stdin, one per line, and return them as a list.

    Raises if the user submits nothing at all.
    """
    print_info('Enter the commands, one per line (press Ctrl+D on a blank line to submit)')
    entered = read_lines()
    if not entered:
        raise Exception('You must specify at least one command.')
    return entered
Пример #2
0
def submit_federated(clusters, jobs, group, pool):
    """
    Try each cluster in turn until one accepts the submission (HTTP 201).

    The request body always carries the jobs; the optional group and pool
    are included only when provided. A read timeout aborts immediately
    (the remote submission may still have gone through), while connection
    errors fall through to the next cluster. If every cluster fails, the
    accumulated failure messages are printed and an exception is raised.
    """
    failure_messages = []
    for cluster in clusters:
        name = cluster['name']
        url = cluster['url']
        try:
            print_info('Attempting to submit on %s cluster...' % terminal.bold(name))

            body = {'jobs': jobs}
            if group:
                body['groups'] = [group]
            if pool:
                body['pool'] = pool

            response = http.post(cluster, 'jobs', body)
            print_submit_result(cluster, response)
            if response.status_code == 201:
                metrics.inc('command.submit.jobs', len(jobs))
                return 0
        except requests.exceptions.ReadTimeout as rt:
            logging.exception(rt)
            print_info(terminal.failed(
                f'Encountered read timeout with {name} ({url}). Your submission may have completed.'))
            return 1
        except IOError as ioe:
            logging.exception(ioe)
            # Connection failure: remember the message and try the next cluster.
            failure_messages.append(
                submit_failed_message(name, f'Cannot connect to {name} ({url})'))
    print_error(''.join(failure_messages))
    raise Exception(terminal.failed('Job submission failed on all of your configured clusters.'))
Пример #3
0
def print_submit_result(cluster, response):
    """
    Print a human-readable summary of a submission response from a cluster.

    Cook Scheduler returns plain text when the submission succeeded
    (HTTP 201) and JSON when it failed; the failure payload can take
    several shapes, all of which are handled here.
    """
    cluster_name = cluster['name']
    if response.status_code != 201:
        try:
            payload = response.json()
            if 'errors' in payload:
                reason = json.dumps(payload['errors'])
            elif 'error' in payload:
                reason = payload['error']
            else:
                reason = json.dumps(payload)
        except json.decoder.JSONDecodeError:
            # Not JSON after all; fall back to the raw response body.
            reason = '%s\n' % response.text
        print_info(submit_failed_message(cluster_name, reason))
    else:
        text = response.text.strip('"')
        if ' submitted groups' in text:
            # Drop the trailing group listing; only job UUIDs are wanted here.
            text = text[:text.index(' submitted groups')]
        uuids = [token for token in text.split() if is_valid_uuid(token)]
        print_info(submit_succeeded_message(cluster_name, uuids),
                   '\n'.join(uuids))
Пример #4
0
def ssh_to_instance(job,
                    instance,
                    sandbox_dir_fn,
                    cluster,
                    command_to_run=None):
    """
    Open an interactive session on the host running the given job instance.

    On Kubernetes compute clusters this delegates to kubectl exec
    (possibly via a plugin override); otherwise it replaces the current
    process with an ssh into the agent's sandbox (os.execlp never returns).
    """
    print_info(
        f'Attempting ssh for job instance {terminal.bold(instance["task_id"])}...'
    )
    compute_cluster = instance["compute-cluster"]
    if compute_cluster["type"] == "kubernetes":
        exec_fn = plugins.get_fn('kubectl-exec-to-instance',
                                 kubectl_exec_to_instance)
        config = get_compute_cluster_config(cluster, compute_cluster["name"])
        exec_fn(job["user"], instance["task_id"], config, command_to_run)
    else:
        remote_command = command_to_run or ['bash']
        sandbox_dir = sandbox_dir_fn()
        # CS_SSH lets users substitute their own ssh binary.
        ssh_binary = os.environ.get('CS_SSH', 'ssh')
        logging.info(f'using ssh command: {ssh_binary}')
        hostname = instance['hostname']
        print_info(f'Executing ssh to {terminal.bold(hostname)}.')
        argv = ['ssh', '-t', hostname, 'cd', sandbox_dir, ';'] + remote_command
        os.execlp(ssh_binary, *argv)
Пример #5
0
def query_with_stdin_support(clusters, entity_refs, pred_jobs=None, pred_instances=None,
                             pred_groups=None, timeout=None, interval=None):
    """
    Queries for UUIDs across clusters, supporting input being passed via stdin, e.g.:

      $ cs jobs --user sally --running --waiting -1 | cs wait

    The above example would wait for all of sally's running and waiting jobs to complete. Returns a pair where the
    first element is the query result map, and the second element is the subset of clusters that are of interest.

    Raises if entity references are supplied both as arguments and on stdin,
    or if none are supplied at all.
    """
    is_stdin_from_pipe = not sys.stdin.isatty()
    text_read_from_pipe = sys.stdin.read() if is_stdin_from_pipe else None

    if entity_refs and text_read_from_pipe:
        # Ambiguous input source; refuse rather than guess which to use.
        # (Fixed: this was an f-string with no placeholders.)
        raise Exception('You cannot supply entity references both as arguments and from stdin.')

    clusters_of_interest = clusters
    if not entity_refs:
        if is_stdin_from_pipe:
            text = text_read_from_pipe
        else:
            # Interactive mode: prompt the user to type/paste the refs.
            print_info('Enter the UUIDs or URLs, one per line (press Ctrl+D on a blank line to submit)')
            text = sys.stdin.read()

        if not text:
            raise Exception('You must specify at least one UUID or URL.')

        ref_strings = text.splitlines()
        entity_refs, clusters_of_interest = parse_entity_refs(clusters, ref_strings)

    query_result = query(clusters_of_interest, entity_refs, pred_jobs, pred_instances, pred_groups, timeout, interval)
    return query_result, clusters_of_interest
Пример #6
0
Файл: ssh.py Проект: yueri/Cook
def ssh_to_instance(instance, sandbox_dir):
    """Replace the current process with an ssh session into the Mesos agent hosting the given instance."""
    print_info(f'Attempting ssh for job instance {terminal.bold(instance["task_id"])}...')
    # CS_SSH lets users substitute their own ssh binary.
    ssh_binary = os.environ.get('CS_SSH', 'ssh')
    logging.info(f'using ssh command: {ssh_binary}')
    hostname = instance['hostname']
    print_info(f'Executing ssh to {terminal.bold(hostname)}.')
    # -t forces a tty so the remote bash is interactive; cd into the sandbox first.
    os.execlp(ssh_binary, 'ssh', '-t', hostname, f'cd "{sandbox_dir}" ; bash')
Пример #7
0
def __print_state(lines_to_move_up):
    """
    Redraw the [item, status] pairs held in the module-level data list,
    first moving the cursor up lines_to_move_up rows so the previous
    render is overwritten in place.
    """
    print_info(terminal.MOVE_UP * lines_to_move_up, end='')
    rendered = [f'{item} ... {state}' for item, state in data]
    print_info('\n'.join(rendered))
Пример #8
0
def print_state(lines_to_erase):
    """
    Redraw the [item, status] pairs held in the module-level data list by
    positioning the cursor lines_to_erase rows above the bottom of the
    terminal and printing the current state over the previous render.
    """
    # term.height can be None, for example, when running in a subprocess
    if not term.height:
        return
    with term.location(0, term.height - lines_to_erase - 1):
        rendered = ['%s ... %s' % (item, status) for item, status in data]
        print_info('\n'.join(rendered))
Пример #9
0
def set_config_value(config_map, keys, value, config_path):
    """Coerce value to a native type, set it at the keys path in config_map, and persist the config file.

    Returns 0 (shell-style success).
    """
    # Strings that look like ints, floats, or booleans are stored as such;
    # anything else stays a string. Check order matters: numbers first.
    if is_int(value):
        value = int(value)
    elif is_float(value):
        value = float(value)
    elif value.lower() == 'true':
        value = True
    elif value.lower() == 'false':
        value = False

    set_in(config_map, keys, value)
    print_info(f'Updating configuration in {terminal.bold(config_path)}.')
    configuration.save_config(config_path, config_map)
    return 0
Пример #10
0
Файл: wait.py Проект: m4ce/Cook
def wait(clusters, args):
    """Block until the jobs / instances / groups with the given UUIDs complete.

    Returns 0 when at least one matching entity was found, 1 otherwise.
    """
    timeout = args.get('timeout')
    interval = args.get('interval')
    uuids = strip_all(args.get('uuid'))
    if timeout:
        timeout_text = 'up to %s' % seconds_to_timedelta(timeout)
    else:
        timeout_text = 'indefinitely'
    print_info('Will wait %s.' % timeout_text)
    query_result = query(clusters, uuids, all_jobs_completed,
                         all_instances_completed, all_groups_completed,
                         timeout, interval)
    if query_result['count'] > 0:
        return 0
    print_no_data(clusters)
    return 1
Пример #11
0
def print_as_table(query_result):
    """Render the most relevant fields of each (cluster, job) pair in query_result as a plain-text table."""

    def to_row(cluster, job):
        # The insertion order of this OrderedDict is the column order.
        return collections.OrderedDict([
            ("Cluster", cluster),
            ("UUID", job['uuid']),
            ("Name", job['name']),
            ("Memory", format_job_memory(job)),
            ("CPUs", job['cpus']),
            ("Priority", job['priority']),
            ("Attempts", format_job_attempts(job)),
            ("Submitted", millis_to_date_string(job['submit_time'])),
            ("Command", format_job_command(job)),
            ("Job Status", format_job_status(job)),
        ])

    pairs = query_result_to_cluster_job_pairs(query_result)
    rows = [to_row(cluster, job) for cluster, job in pairs]
    print_info(tabulate(rows, headers='keys', tablefmt='plain'))
Пример #12
0
def wait(clusters, args, _):
    """Block until the referenced jobs / instances / groups complete.

    Entity references may come from the command line or stdin. Returns 0
    when at least one matching entity was found, 1 otherwise.
    """
    guard_no_cluster(clusters)
    timeout = args.get('timeout')
    interval = args.get('interval')
    entity_refs, _ = parse_entity_refs(clusters, args.get('uuid'))
    if timeout:
        timeout_text = 'up to %s' % seconds_to_timedelta(timeout)
    else:
        timeout_text = 'indefinitely'
    print_info('Will wait %s.' % timeout_text)
    query_result, clusters_of_interest = query_with_stdin_support(
        clusters, entity_refs, all_jobs_completed, all_instances_completed,
        all_groups_completed, timeout, interval)
    if query_result['count'] > 0:
        return 0
    print_no_data(clusters_of_interest)
    return 1
Пример #13
0
def kill_entities(query_result, clusters):
    """
    Kill the jobs / instances / groups found in query_result, batching the
    kill requests per cluster, then print a per-entity report and a summary.
    Returns the number of entities that could not be killed.
    """
    batch_size = 100
    failed = []
    succeeded = []
    clusters_by_name = {c['name']: c for c in clusters}

    def __kill(cluster, uuids, kill_fn, entity_type):
        # Issue kill requests in batches and record each UUID's outcome.
        if not uuids:
            return
        for uuid_batch in partition(uuids, batch_size):
            bucket = succeeded if kill_fn(cluster, uuid_batch) else failed
            bucket.extend({'cluster': cluster, 'type': entity_type, 'uuid': u}
                          for u in uuid_batch)

    for cluster_name, entities in query_result['clusters'].items():
        cluster = clusters_by_name[cluster_name]
        job_uuids = [j['uuid'] for j in entities.get('jobs', [])]
        instance_uuids = [i['task_id'] for i, _ in entities.get('instances', [])]
        group_uuids = [g['uuid'] for g in entities.get('groups', [])]
        __kill(cluster, job_uuids, kill_jobs, 'job')
        __kill(cluster, instance_uuids, kill_instances, 'job instance')
        __kill(cluster, group_uuids, kill_groups, 'job group')

    for item in succeeded:
        print_info(
            f'Killed {item["type"]} {colors.bold(item["uuid"])} on {colors.bold(item["cluster"]["name"])}.'
        )
    for item in failed:
        print(
            colors.failed(
                f'Failed to kill {item["type"]} {item["uuid"]} on {item["cluster"]["name"]}.'
            ))
    print_info(f'Successful: {len(succeeded)}, Failed: {len(failed)}')
    return len(failed)
Пример #14
0
def submit_federated(clusters, jobs):
    """
    Try each cluster in turn until one accepts the submission (HTTP 201).
    Connection failures are reported and the next cluster is tried; if
    every cluster fails, an exception is raised.
    """
    for cluster in clusters:
        cluster_name = cluster['name']
        try:
            print_info('Attempting to submit on %s cluster...' %
                       colors.bold(cluster_name))
            response = http.post(cluster, 'rawscheduler', {'jobs': jobs})
            print_submit_result(cluster, response)
            if response.status_code == 201:
                metrics.inc('command.submit.jobs', len(jobs))
                return 0
        except IOError as ioe:
            # Could not reach this cluster; report and fall through to the next.
            logging.info(ioe)
            reason = 'Cannot connect to %s (%s)' % (cluster_name, cluster['url'])
            print_info('%s\n' % submit_failed_message(cluster_name, reason))
    raise Exception(
        colors.failed('Job submission failed on all of your configured clusters.'))
Пример #15
0
def ssh_to_instance(job,
                    instance,
                    sandbox_dir_fn,
                    cluster,
                    command_to_run=None):
    """
    When using Mesos, attempts to ssh (using os.execlp) to the Mesos agent corresponding to the given instance.
    When using Kubernetes, calls the exec command of the kubectl cli.

    For Kubernetes instances that are not yet running ('unknown') or have
    already finished ('success' / 'failed'), prints guidance instead of
    attempting to connect, since there is no live container to exec into.
    """
    compute_cluster = instance['compute-cluster']
    compute_cluster_type = compute_cluster['type']
    instance_status = instance['status']
    instance_uuid = instance['task_id']

    if compute_cluster_type == 'kubernetes':
        if instance_status == 'unknown':
            # Pod hasn't started; nothing to exec into yet.
            print_info(
                f'Job instance {terminal.bold(instance_uuid)} is not yet running.'
            )
            return
        elif instance_status == 'success' or instance_status == 'failed':
            # Instance finished: the container is gone, so suggest the
            # file-inspection commands that still work, then show an ls.
            cs_command = 'cs'
            print_info(
                f'Job instance {terminal.bold(instance_uuid)} already completed, so you cannot ssh to it.'
            )
            print_info('')
            print_info(
                'To inspect individual files, e.g. stdout, try one of these:')
            print_info('')
            print_info(f'{cs_command} cat {instance_uuid} stdout')
            print_info(f'{cs_command} tail {instance_uuid} stdout')
            print_info('')
            print_info('To retrieve the entire output directory, try:')
            print_info('')
            print_info(f'{cs_command} download {instance_uuid}')
            print_info('')
            print_info(f'Here are the results of running {cs_command} ls:')
            print_info('')
            print_info(f'{cs_command} ls -l {instance_uuid}')
            args = {
                'json': False,
                'literal': False,
                'long_format': True,
                'path': None,
                'uuid': [instance_uuid]
            }
            ls([cluster], args, _=None)
            return

    print_info(
        f'Attempting ssh for job instance {terminal.bold(instance_uuid)}...')
    compute_cluster_name = compute_cluster['name']
    if compute_cluster_type == 'kubernetes':
        # A plugin may override the default kubectl-exec implementation.
        kubectl_exec_to_instance_fn = plugins.get_fn(
            'kubectl-exec-to-instance', kubectl_exec_to_instance)
        compute_cluster_config = get_compute_cluster_config(
            cluster, compute_cluster_name)
        kubectl_exec_to_instance_fn(job['user'], instance_uuid,
                                    compute_cluster_config, command_to_run)
    else:
        command_to_run = command_to_run or ['bash']
        sandbox_dir = sandbox_dir_fn()
        # CS_SSH lets users substitute their own ssh binary.
        command = os.environ.get('CS_SSH', 'ssh')
        logging.info(f'using ssh command: {command}')
        hostname = instance['hostname']
        print_info(f'Executing ssh to {terminal.bold(hostname)}.')
        # -t forces a tty; cd into the sandbox before running the command.
        args = ['ssh', '-t', hostname, 'cd', sandbox_dir, ';'] + command_to_run
        # os.execlp replaces the current process and does not return.
        os.execlp(command, *args)
Пример #16
0
def print_formatted_cluster_or_pool_usage(cluster_or_pool,
                                          cluster_or_pool_usage):
    """Print usage, share, and quota for a cluster or pool as a table, followed by per-application / per-group bullets."""
    usage_map = cluster_or_pool_usage['usage']
    share_map = cluster_or_pool_usage['share']
    quota_map = cluster_or_pool_usage['quota']
    print_info(terminal.bold(cluster_or_pool))

    def format_limit(limit, formatter=(lambda x: x)):
        # sys.float_info.max is the sentinel meaning "no limit".
        return 'Unlimited' if limit == sys.float_info.max else formatter(limit)

    # 2**31 - 1 is the sentinel for an unlimited job count.
    count_limit = quota_map['count']
    quota_count = 'Unlimited' if count_limit == (2 ** 31 - 1) else count_limit

    rows = [
        ['Max Quota',
         format_limit(quota_map['cpus']),
         format_limit(quota_map['mem'], format_memory_amount),
         format_limit(quota_map['gpus']),
         quota_count],
        ['Non-preemptible Share',
         format_limit(share_map['cpus']),
         format_limit(share_map['mem'], format_memory_amount),
         format_limit(share_map['gpus']),
         'N/A'],
        ['Current Usage',
         usage_map['cpus'],
         format_job_memory(usage_map),
         usage_map['gpus'],
         usage_map['jobs']],
    ]
    print_info(
        tabulate(rows,
                 headers=['', 'CPUs', 'Memory', 'GPUs', 'Jobs'],
                 tablefmt='plain'))

    applications = cluster_or_pool_usage['applications']
    if applications:
        print_info('Applications:')
    for application, application_usage in applications.items():
        usage_map = application_usage['usage']
        app_label = application if application else "[no application defined]"
        print_info(f'- {terminal.running(app_label)}')
        print_info(f'  {format_usage(usage_map)}')
        print_info('  Job Groups:')
        for group, group_usage in application_usage['groups'].items():
            usage_map = group_usage['usage']
            jobs = group_usage['jobs']
            group_label = group if group else "[ungrouped]"
            print_info(f'\t- {terminal.bold(group_label)}')
            print_info(f'\t  {format_usage(usage_map)}')
            print_info(f'\t  Jobs: {len(jobs)}')
            print_info('')
    print_info('')
Пример #17
0
def print_no_data(clusters):
    """Report that none of the given clusters returned any matching data."""
    names = ' / '.join(c['name'] for c in clusters)
    print(colors.failed('No matching data found in %s.' % names))
    print_info('Do you need to add another cluster to your configuration?')
Пример #18
0
def print_formatted_cluster_or_pool_usage(cluster_or_pool,
                                          cluster_or_pool_usage):
    """Print share and usage for a cluster or pool, then a per-application / per-group breakdown."""
    usage_map = cluster_or_pool_usage['usage']
    share_map = cluster_or_pool_usage['share']
    print_info(colors.bold(cluster_or_pool))
    print_info(format_share(share_map))
    print_info(format_usage(usage_map))
    applications = cluster_or_pool_usage['applications']
    if not applications:
        print_info(colors.waiting('Nothing Running'))
    else:
        print_info('Applications:')
    for application, application_usage in applications.items():
        usage_map = application_usage['usage']
        app_label = application if application else "[no application defined]"
        print_info(f'- {colors.running(app_label)}')
        print_info(f'  {format_usage(usage_map)}')
        print_info('  Job Groups:')
        for group, group_usage in application_usage['groups'].items():
            usage_map = group_usage['usage']
            jobs = group_usage['jobs']
            group_label = group if group else "[ungrouped]"
            print_info(f'\t- {colors.bold(group_label)}')
            print_info(f'\t  {format_usage(usage_map)}')
            print_info(f'\t  Jobs: {len(jobs)}')
            print_info('')
    print_info('')