Example #1
def main(arg_nodes, arg_job_id):
    log.debug(f"deleting nodes:{arg_nodes} job_id:{arg_job_id}")
    compute = googleapiclient.discovery.build('compute',
                                              'v1',
                                              cache_discovery=False)

    # Get node list
    nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}",
                         check=True,
                         get_stdout=True).stdout
    node_list = nodes_str.splitlines()

    pid = util.get_pid(node_list[0])
    if (arg_job_id and not cfg.instance_defs[pid].exclusive):
        # Don't delete from calls by EpilogSlurmctld
        return

    if arg_job_id:
        # Mark nodes as off limits to new jobs while powering down.
        # Have to use "down" because it's currently the only way to remove the
        # power_up flag from the node -- followed by a power_down -- if the
        # PrologSlurmctld fails with a non-zero exit code.
        util.run(
            f"{SCONTROL} update node={arg_nodes} state=down reason='{arg_job_id} finishing'"
        )
        # Power down nodes in slurm, so that they will become available again.
        util.run(f"{SCONTROL} update node={arg_nodes} state=power_down")

    while True:
        delete_instances(compute, node_list, arg_job_id)
        if not len(retry_list):
            break

        log.debug("got {} nodes to retry ({})".format(len(retry_list),
                                                      ','.join(retry_list)))
        node_list = list(retry_list)
        del retry_list[:]

    if arg_job_id:
        for operation in operations.values():
            try:
                util.wait_for_operation(compute, cfg.project, operation)
                # now that the instance is gone, resume to put back in service
                util.run(f"{SCONTROL} update node={arg_nodes} state=resume")
            except Exception:
                log.exception(
                    f"Error in deleting {operation['name']} to slurm")

    log.debug("done deleting instances")

    if (arg_job_id and cfg.instance_defs[pid].enable_placement
            and cfg.instance_defs[pid].machine_type.split('-')[0] == "c2"
            and len(node_list) > 1):
        delete_placement_groups(compute, node_list, arg_job_id)

    log.info(f"done deleting nodes:{arg_nodes} job_id:{arg_job_id}")
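
Example #1 depends on module-level state (operations, retry_list, SCONTROL, cfg, util, log) and on a delete_instances helper that is not shown. The sketch below only illustrates the assumed contract -- one instances().delete() per node, recording the pending operation and queueing transient failures for retry -- and uses hypothetical details such as the zone field; it is not the actual implementation.

# Minimal sketch (assumption, not the real delete_instances): one delete per
# node, recording the pending operation and queueing transient failures.
operations = {}   # node name -> pending GCE delete operation
retry_list = []   # nodes to retry on the next pass of the while-loop above

def delete_instances(compute, node_list, arg_job_id):
    pid = util.get_pid(node_list[0])
    for node in node_list:
        try:
            op = compute.instances().delete(
                project=cfg.project,
                zone=cfg.instance_defs[pid].zone,   # hypothetical field name
                instance=node).execute()
            operations[node] = op
        except googleapiclient.errors.HttpError as e:
            if e.resp.status in (403, 429):
                # quota or rate-limit error: retry on the next pass
                retry_list.append(node)
            else:
                log.error(f"failed to delete {node}: {e}")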
Example #2
def delete_placement_groups(compute, node_list, arg_job_id):
    PLACEMENT_MAX_CNT = 22
    pg_ops = []
    pg_index = 0
    pid = util.get_pid(node_list[0])

    for i in range(len(node_list)):
        if i % PLACEMENT_MAX_CNT:
            continue
        pg_index += 1
        pg_name = f'{cfg.cluster_name}-{arg_job_id}-{pg_index}'
        pg_ops.append(compute.resourcePolicies().delete(
            project=cfg.project, region=cfg.instance_defs[pid].region,
            resourcePolicy=pg_name).execute())
    for operation in pg_ops:
        util.wait_for_operation(compute, cfg.project, operation)
    log.debug("done deleting pg")
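
In delete_placement_groups, the i % PLACEMENT_MAX_CNT guard simply walks the node list in blocks of 22, issuing one resourcePolicies().delete() per block; the names mirror the ones built by create_placement_groups (Example #4). A standalone illustration with hypothetical cluster and job names:

# Which placement groups get deleted for a 50-node job "myjob" on cluster "cl"?
PLACEMENT_MAX_CNT = 22
names = [f"cl-myjob-{i // PLACEMENT_MAX_CNT + 1}"
         for i in range(50) if i % PLACEMENT_MAX_CNT == 0]
print(names)  # ['cl-myjob-1', 'cl-myjob-2', 'cl-myjob-3']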
Example #3
def add_instances(node_chunk):

    node_list = node_chunk['nodes']
    pg_name = None
    if 'pg' in node_chunk:
        pg_name = node_chunk['pg']
    log.debug(f"node_list:{node_list} pg:{pg_name}")

    auth_http = None
    if not cfg.google_app_cred_path:
        http = set_user_agent(httplib2.Http(),
                              "Slurm_GCP_Scripts/1.2 (GPN:SchedMD)")
        creds = compute_engine.Credentials()
        auth_http = google_auth_httplib2.AuthorizedHttp(creds, http=http)
    compute = googleapiclient.discovery.build('compute',
                                              'v1',
                                              http=auth_http,
                                              cache_discovery=False)
    pid = util.get_pid(node_list[0])
    instance_def = cfg.instance_defs[pid]

    try:
        operation = create_instance(compute, instance_def, node_list, pg_name)
    except googleapiclient.errors.HttpError as e:
        log.error(
            f"failed to add {node_list[0]}*{len(node_list)} to slurm, {e}")
        if instance_def.exclusive:
            os._exit(1)
        down_nodes(node_list, e)
        return

    result = util.wait_for_operation(compute, cfg.project, operation)
    if not result or 'error' in result:
        grp_err_msg = result['error']['errors'][0]['message']
        log.error(f"group operation failed: {grp_err_msg}")
        if instance_def.exclusive:
            os._exit(1)

        group_ops = util.get_group_operations(compute, cfg.project, result)
        failed_nodes = {}
        for op in group_ops['items']:
            if op['operationType'] != 'insert':
                continue
            if 'error' in op:
                err_msg = op['error']['errors'][0]['message']
                failed_node = op['targetLink'].split('/')[-1]
                if err_msg not in failed_nodes:
                    failed_nodes[err_msg] = [failed_node]
                else:
                    failed_nodes[err_msg].append(failed_node)
        if failed_nodes:
            log.error(f"insert requests failed: {failed_nodes}")
            for msg, nodes in failed_nodes.items():
                down_nodes(nodes, msg)
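
down_nodes is called above with a node list and a reason (an error message or exception) but is not shown. Based on the scontrol usage in the other examples, a plausible sketch, assuming the two-argument util.to_hostlist seen in Example #6, would be:

# Sketch of the assumed down_nodes helper; not the actual implementation.
def down_nodes(node_list, reason):
    """Mark the given nodes down in Slurm with the supplied reason."""
    if isinstance(node_list, list):
        node_list = util.to_hostlist(SCONTROL, node_list)
    util.run(
        f"{SCONTROL} update node={node_list} state=down reason='{reason}'")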
Example #4
def create_placement_groups(arg_job_id, vm_count, region):
    log.debug(f"Creating PG: {arg_job_id} vm_count:{vm_count} region:{region}")

    pg_names = []
    pg_ops = []
    pg_index = 0

    auth_http = None
    if not cfg.google_app_cred_path:
        http = set_user_agent(httplib2.Http(),
                              "Slurm_GCP_Scripts/1.2 (GPN:SchedMD)")
        creds = compute_engine.Credentials()
        auth_http = google_auth_httplib2.AuthorizedHttp(creds, http=http)
    compute = googleapiclient.discovery.build('compute',
                                              'v1',
                                              http=auth_http,
                                              cache_discovery=False)

    for i in range(vm_count):
        if i % PLACEMENT_MAX_CNT:
            continue
        pg_index += 1
        pg_name = f'{cfg.cluster_name}-{arg_job_id}-{pg_index}'
        pg_names.append(pg_name)

        config = {
            'name': pg_name,
            'region': region,
            'groupPlacementPolicy': {
                "collocation": "COLLOCATED",
                "vmCount": min(vm_count - i, PLACEMENT_MAX_CNT)
            }
        }

        pg_ops.append(
            util.ensure_execute(compute.resourcePolicies().insert(
                project=cfg.project, region=region, body=config)))

    for operation in pg_ops:
        result = util.wait_for_operation(compute, cfg.project, operation)
        if result and 'error' in result:
            err_msg = result['error']['errors'][0]['message']
            log.error(f"placement group operation failed: {err_msg}")
            os._exit(1)

    return pg_names
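
The min(vm_count - i, PLACEMENT_MAX_CNT) term sizes each COLLOCATED group, with the last group taking whatever remains. Continuing the hypothetical 50-VM request from the note after Example #2:

# vmCount assigned to each placement group created above, for vm_count = 50.
PLACEMENT_MAX_CNT = 22
vm_count = 50
sizes = [min(vm_count - i, PLACEMENT_MAX_CNT)
         for i in range(vm_count) if i % PLACEMENT_MAX_CNT == 0]
print(sizes)  # [22, 22, 6]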
Example #5
def resume_nodes(nodelist, placement_groups=None, exclusive=False):
    """resume nodes in nodelist"""

    def ident_key(n):
        # ident here will refer to the combination of partition and group
        return "-".join(
            (
                lkp.node_partition_name(n),
                lkp.node_group_name(n),
            )
        )

    # support already expanded list
    nodes = nodelist
    if isinstance(nodes, str):
        nodelist = expand_nodelist(nodelist)

    nodes = sorted(nodelist, key=ident_key)
    if len(nodes) == 0:
        return
    grouped_nodes = {
        ident: chunk
        for ident, nodes in groupby(nodes, ident_key)
        for chunk in chunked(nodes, n=BULK_INSERT_LIMIT)
    }
    log.debug(f"grouped_nodes: {grouped_nodes}")

    # make all bulkInsert requests and execute with batch
    inserts = {
        ident: create_instances_request(nodes, placement_groups, exclusive)
        for ident, nodes in grouped_nodes.items()
    }
    started, failed = batch_execute(inserts)
    if failed:
        failed_reqs = [f"{e}" for _, (_, e) in failed.items()]
        log.error("bulkInsert API failures: {}".format("\n".join(failed_reqs)))
        for ident, (_, exc) in failed.items():
            down_nodes(grouped_nodes[ident], exc._get_reason())

    # wait for all bulkInserts to complete and log any errors
    bulk_operations = [wait_for_operation(op) for op in started.values()]
    for bulk_op in bulk_operations:
        if "error" in bulk_op:
            error = bulk_op["error"]["errors"][0]
            log.error(
                f"bulkInsert operation error: {error['code']} operationName:'{bulk_op['name']}'"
            )

    # Fetch all insert operations from all bulkInserts. Group by error code and log
    successful_inserts, failed_inserts = separate(
        lambda op: "error" in op, get_insert_operations(bulk_operations)
    )
    # Apparently multiple errors are possible... so join with +.
    # grouped_inserts could be made into a dict, but it's not really necessary. Save some memory.
    grouped_inserts = util.groupby_unsorted(
        failed_inserts,
        lambda op: "+".join(err["code"] for err in op["error"]["errors"]),
    )
    for code, failed_ops in grouped_inserts:
        # at least one insert failure
        # materialize, since groupby_unsorted may yield one-shot iterators
        failed_ops = list(failed_ops)
        failed_nodes = [parse_self_link(op["targetLink"]).instance for op in failed_ops]
        hostlist = util.to_hostlist(failed_nodes)
        count = len(failed_nodes)
        log.error(
            f"{count} instances failed to start due to insert operation error: {code} ({hostlist})"
        )
        down_nodes(hostlist, code)
        if log.isEnabledFor(logging.DEBUG):
            msg = "\n".join(
                err["message"] for err in failed_ops[0]["error"]["errors"]
            )
            log.debug(f"{code} message from first node: {msg}")

    # If reconfigure enabled, create subscriptions for successfully started instances
    if lkp.cfg.enable_reconfigure and len(successful_inserts):
        started_nodes = [
            parse_self_link(op["targetLink"]).instance for op in successful_inserts
        ]
        count = len(started_nodes)
        hostlist = util.to_hostlist(started_nodes)
        log.info("create {} subscriptions ({})".format(count, hostlist))
        execute_with_futures(subscription_create, started_nodes)
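
Example #5 relies on helpers that are not shown: chunked (as in more_itertools, splitting an iterable into lists of at most n items) and util.groupby_unsorted, which, unlike itertools.groupby, should not require pre-sorted input. The real helper may yield one-shot iterators per group, which is why the loop above materializes failed_ops with list() before reusing it. A stand-in sketch under those assumptions:

# Stand-in sketch for util.groupby_unsorted (assumed helper, not the real one).
from collections import defaultdict

def groupby_unsorted(seq, key):
    """Group items by key(item) without requiring seq to be pre-sorted."""
    groups = defaultdict(list)
    for item in seq:
        groups[key(item)].append(item)
    return groups.items()  # iterable of (key, list of grouped items)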
Example #6
def main(arg_nodes, arg_job_id):
    log.debug(f"deleting nodes:{arg_nodes} job_id:{arg_job_id}")
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              cache_discovery=False)

    # Get node list
    nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}",
                         check=True, get_stdout=True).stdout
    node_list = nodes_str.splitlines()

    # Get static node list
    exc_nodes_hostlist = util.run(
        f"{SCONTROL} show config | "
        "awk '/SuspendExcNodes.*=/{print $3}'", shell=True,
        get_stdout=True).stdout
    nodes_exc_str = util.run(f"{SCONTROL} show hostnames {exc_nodes_hostlist}",
                             check=True, get_stdout=True).stdout
    node_exc_list = sorted(nodes_exc_str.splitlines(), key=util.get_pid)

    # Generate new arg_nodes without static nodes
    dynamic_nodes = list((set(node_exc_list) ^ set(node_list)) & set(node_list))
    node_list = dynamic_nodes
    arg_nodes = util.to_hostlist(SCONTROL, dynamic_nodes)

    if len(node_list) == 0:
        log.debug("Static nodes removed from request. No nodes remain in request.")
        return

    pid = util.get_pid(node_list[0])
    if (arg_job_id and not cfg.instance_defs[pid].exclusive):
        # Don't delete from calls by EpilogSlurmctld
        return

    if arg_job_id:
        # Mark nodes as off limits to new jobs while powering down.
        # Note: If PrologSlurmctld fails with a non-zero exit code,
        # "powering_up" flag would get stuck on the node. In 20.11 and prior:
        # state=down followed by state=power_down could clear it. In 21.08,
        # state=power_down_force can clear it.
        util.run(
            f"{SCONTROL} update node={arg_nodes} state=power_down_force")

    while True:
        delete_instances(compute, node_list, arg_job_id)
        if not len(retry_list):
            break

        log.debug("got {} nodes to retry ({})"
                  .format(len(retry_list), ','.join(retry_list)))
        node_list = list(retry_list)
        del retry_list[:]

    if arg_job_id:
        for operation in operations.values():
            try:
                util.wait_for_operation(compute, cfg.project, operation)
            except Exception:
                log.exception(f"Error in deleting {operation['name']} to slurm")
        # now that the instances are gone, resume to put back in service
        util.run(f"{SCONTROL} update node={arg_nodes} state=resume")

    log.debug("done deleting instances")

    if (arg_job_id and
            cfg.instance_defs[pid].enable_placement and
            cfg.instance_defs[pid].machine_type.split('-')[0] == "c2" and
            len(node_list) > 1):
        delete_placement_groups(compute, node_list, arg_job_id)

    log.info(f"done deleting nodes:{arg_nodes} job_id:{arg_job_id}")
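
The expression used in Example #6 to drop static nodes, (set(node_exc_list) ^ set(node_list)) & set(node_list), is a set difference written via symmetric difference: for sets A and B, (A ^ B) & B == B - A. A quick standalone check with hypothetical node names:

# Quick check of the set arithmetic above.
node_exc_list = ['cluster-static-0', 'cluster-static-1']                 # SuspendExcNodes
node_list = ['cluster-static-0', 'cluster-debug-0', 'cluster-debug-1']   # requested nodes
dynamic_nodes = (set(node_exc_list) ^ set(node_list)) & set(node_list)
assert dynamic_nodes == set(node_list) - set(node_exc_list)
print(sorted(dynamic_nodes))  # ['cluster-debug-0', 'cluster-debug-1']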