def main(arg_nodes, arg_job_id):
    log.debug(f"deleting nodes:{arg_nodes} job_id:{arg_job_id}")
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              cache_discovery=False)

    # Get node list
    nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}",
                         check=True, get_stdout=True).stdout
    node_list = nodes_str.splitlines()

    pid = util.get_pid(node_list[0])
    if (arg_job_id and not cfg.instance_defs[pid].exclusive):
        # Don't delete from calls by EpilogSlurmctld
        return

    if arg_job_id:
        # Mark nodes as off limits to new jobs while powering down.
        # Have to use "down" because it's the only, current, way to remove the
        # power_up flag from the node -- followed by a power_down -- if the
        # PrologSlurmctld fails with a non-zero exit code.
        util.run(
            f"{SCONTROL} update node={arg_nodes} state=down "
            f"reason='{arg_job_id} finishing'")
        # Power down nodes in slurm, so that they will become available again.
        util.run(f"{SCONTROL} update node={arg_nodes} state=power_down")

    while True:
        delete_instances(compute, node_list, arg_job_id)
        if not len(retry_list):
            break

        log.debug("got {} nodes to retry ({})"
                  .format(len(retry_list), ','.join(retry_list)))
        node_list = list(retry_list)
        del retry_list[:]

    if arg_job_id:
        for operation in operations.values():
            try:
                util.wait_for_operation(compute, cfg.project, operation)
                # now that the instance is gone, resume to put back in service
                util.run(f"{SCONTROL} update node={arg_nodes} state=resume")
            except Exception:
                log.exception(
                    f"Error in deleting {operation['name']} to slurm")

    log.debug("done deleting instances")

    if (arg_job_id and
            cfg.instance_defs[pid].enable_placement and
            cfg.instance_defs[pid].machine_type.split('-')[0] == "c2" and
            len(node_list) > 1):
        delete_placement_groups(compute, node_list, arg_job_id)

    log.info(f"done deleting nodes:{arg_nodes} job_id:{arg_job_id}")
def delete_placement_groups(compute, node_list, arg_job_id):
    PLACEMENT_MAX_CNT = 22
    pg_ops = []
    pg_index = 0
    pid = util.get_pid(node_list[0])

    for i in range(len(node_list)):
        if i % PLACEMENT_MAX_CNT:
            continue
        pg_index += 1
        pg_name = f'{cfg.cluster_name}-{arg_job_id}-{pg_index}'
        pg_ops.append(compute.resourcePolicies().delete(
            project=cfg.project, region=cfg.instance_defs[pid].region,
            resourcePolicy=pg_name).execute())
    for operation in pg_ops:
        util.wait_for_operation(compute, cfg.project, operation)
    log.debug("done deleting pg")
def add_instances(node_chunk):
    node_list = node_chunk['nodes']
    pg_name = None
    if 'pg' in node_chunk:
        pg_name = node_chunk['pg']
    log.debug(f"node_list:{node_list} pg:{pg_name}")

    auth_http = None
    if not cfg.google_app_cred_path:
        http = set_user_agent(httplib2.Http(),
                              "Slurm_GCP_Scripts/1.2 (GPN:SchedMD)")
        creds = compute_engine.Credentials()
        auth_http = google_auth_httplib2.AuthorizedHttp(creds, http=http)
    compute = googleapiclient.discovery.build('compute', 'v1', http=auth_http,
                                              cache_discovery=False)
    pid = util.get_pid(node_list[0])
    instance_def = cfg.instance_defs[pid]

    try:
        operation = create_instance(compute, instance_def, node_list, pg_name)
    except googleapiclient.errors.HttpError as e:
        log.error(
            f"failed to add {node_list[0]}*{len(node_list)} to slurm, {e}")
        if instance_def.exclusive:
            os._exit(1)
        down_nodes(node_list, e)
        return

    result = util.wait_for_operation(compute, cfg.project, operation)
    if not result or 'error' in result:
        grp_err_msg = result['error']['errors'][0]['message']
        log.error(f"group operation failed: {grp_err_msg}")
        if instance_def.exclusive:
            os._exit(1)

        group_ops = util.get_group_operations(compute, cfg.project, result)
        failed_nodes = {}
        for op in group_ops['items']:
            if op['operationType'] != 'insert':
                continue
            if 'error' in op:
                err_msg = op['error']['errors'][0]['message']
                failed_node = op['targetLink'].split('/')[-1]
                if err_msg not in failed_nodes:
                    failed_nodes[err_msg] = [failed_node]
                else:
                    failed_nodes[err_msg].append(failed_node)
        if failed_nodes:
            log.error(f"insert requests failed: {failed_nodes}")
            for msg, nodes in failed_nodes.items():
                down_nodes(nodes, msg)
def create_placement_groups(arg_job_id, vm_count, region):
    log.debug(f"Creating PG: {arg_job_id} vm_count:{vm_count} region:{region}")
    pg_names = []
    pg_ops = []
    pg_index = 0

    auth_http = None
    if not cfg.google_app_cred_path:
        http = set_user_agent(httplib2.Http(),
                              "Slurm_GCP_Scripts/1.2 (GPN:SchedMD)")
        creds = compute_engine.Credentials()
        auth_http = google_auth_httplib2.AuthorizedHttp(creds, http=http)
    compute = googleapiclient.discovery.build('compute', 'v1', http=auth_http,
                                              cache_discovery=False)

    for i in range(vm_count):
        if i % PLACEMENT_MAX_CNT:
            continue
        pg_index += 1
        pg_name = f'{cfg.cluster_name}-{arg_job_id}-{pg_index}'
        pg_names.append(pg_name)
        config = {
            'name': pg_name,
            'region': region,
            'groupPlacementPolicy': {
                "collocation": "COLLOCATED",
                "vmCount": min(vm_count - i, PLACEMENT_MAX_CNT)
            }
        }
        pg_ops.append(
            util.ensure_execute(compute.resourcePolicies().insert(
                project=cfg.project, region=region, body=config)))

    for operation in pg_ops:
        result = util.wait_for_operation(compute, cfg.project, operation)
        if result and 'error' in result:
            err_msg = result['error']['errors'][0]['message']
            log.error(f"placement group operation failed: {err_msg}")
            os._exit(1)

    return pg_names
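
# Illustration only (not part of the original scripts): a minimal sketch of how
# the `i % PLACEMENT_MAX_CNT` stride in create_placement_groups() (and in
# delete_placement_groups() above) maps a VM count onto placement-group names
# and sizes. The cluster name, job id, and VM count below are made up;
# PLACEMENT_MAX_CNT mirrors the 22-node limit used locally in
# delete_placement_groups().
def _demo_placement_layout(cluster_name="cluster", arg_job_id=123, vm_count=50):
    PLACEMENT_MAX_CNT = 22
    layout = {}
    pg_index = 0
    for i in range(vm_count):
        if i % PLACEMENT_MAX_CNT:
            continue
        # one group per block of up to PLACEMENT_MAX_CNT VMs
        pg_index += 1
        pg_name = f"{cluster_name}-{arg_job_id}-{pg_index}"
        layout[pg_name] = min(vm_count - i, PLACEMENT_MAX_CNT)
    # vm_count=50 -> {'cluster-123-1': 22, 'cluster-123-2': 22, 'cluster-123-3': 6}
    return layout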
def resume_nodes(nodelist, placement_groups=None, exclusive=False):
    """resume nodes in nodelist"""

    def ident_key(n):
        # ident here will refer to the combination of partition and group
        return "-".join(
            (
                lkp.node_partition_name(n),
                lkp.node_group_name(n),
            )
        )

    # support already expanded list
    nodes = nodelist
    if isinstance(nodes, str):
        nodelist = expand_nodelist(nodelist)
    nodes = sorted(nodelist, key=ident_key)

    if len(nodes) == 0:
        return

    grouped_nodes = {
        ident: chunk
        for ident, nodes in groupby(nodes, ident_key)
        for chunk in chunked(nodes, n=BULK_INSERT_LIMIT)
    }
    log.debug(f"grouped_nodes: {grouped_nodes}")

    # make all bulkInsert requests and execute with batch
    inserts = {
        ident: create_instances_request(nodes, placement_groups, exclusive)
        for ident, nodes in grouped_nodes.items()
    }
    started, failed = batch_execute(inserts)
    if failed:
        failed_reqs = [f"{e}" for _, (_, e) in failed.items()]
        log.error("bulkInsert API failures: {}".format("\n".join(failed_reqs)))
        for ident, (_, exc) in failed.items():
            down_nodes(grouped_nodes[ident], exc._get_reason())

    # wait for all bulkInserts to complete and log any errors
    bulk_operations = [wait_for_operation(op) for op in started.values()]
    for bulk_op in bulk_operations:
        if "error" in bulk_op:
            error = bulk_op["error"]["errors"][0]
            log.error(
                f"bulkInsert operation error: {error['code']} operationName:'{bulk_op['name']}'"
            )

    # Fetch all insert operations from all bulkInserts. Group by error code and log
    successful_inserts, failed_inserts = separate(
        lambda op: "error" in op, get_insert_operations(bulk_operations)
    )
    # Apparently multiple errors are possible... so join with +.
    # grouped_inserts could be made into a dict, but it's not really necessary. Save some memory.
    grouped_inserts = util.groupby_unsorted(
        failed_inserts,
        lambda op: "+".join(err["code"] for err in op["error"]["errors"]),
    )
    for code, failed_ops in grouped_inserts:
        # at least one insert failure
        # materialize the group so it can be reused after the comprehension below
        failed_ops = list(failed_ops)
        failed_nodes = [parse_self_link(op["targetLink"]).instance for op in failed_ops]
        hostlist = util.to_hostlist(failed_nodes)
        count = len(failed_nodes)
        log.error(
            f"{count} instances failed to start due to insert operation error: {code} ({hostlist})"
        )
        down_nodes(hostlist, code)
        if log.isEnabledFor(logging.DEBUG):
            msg = "\n".join(
                err["message"] for err in failed_ops[0]["error"]["errors"]
            )
            log.debug(f"{code} message from first node: {msg}")

    # If reconfigure enabled, create subscriptions for successfully started instances
    if lkp.cfg.enable_reconfigure and len(successful_inserts):
        started_nodes = [
            parse_self_link(op["targetLink"]).instance for op in successful_inserts
        ]
        count = len(started_nodes)
        hostlist = util.to_hostlist(started_nodes)
        log.info("create {} subscriptions ({})".format(count, hostlist))
        execute_with_futures(subscription_create, nodes)
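
# Illustration only (not part of the original scripts): a minimal sketch of how
# the grouping in resume_nodes() buckets nodes by partition/group identity and
# then splits each bucket into chunks no larger than BULK_INSERT_LIMIT before
# per-chunk bulkInsert requests are built. The node names, ident function,
# local _chunked() helper, and limit value are made-up stand-ins for the
# lkp/util helpers the real code uses.
from itertools import groupby as _groupby


def _chunked(iterable, n):
    """Yield successive lists of at most n items (stand-in for chunked())."""
    buf = list(iterable)
    for i in range(0, len(buf), n):
        yield buf[i:i + n]


def _demo_grouping():
    BULK_INSERT_LIMIT = 10  # illustrative value only
    nodes = ["debug-ghpc-0", "debug-ghpc-1", "gpu-ghpc-0"]

    def ident_key(n):
        # stand-in for the partition + node-group lookup
        return n.split("-")[0]

    grouped_nodes = {
        ident: chunk
        for ident, grp in _groupby(sorted(nodes, key=ident_key), ident_key)
        for chunk in _chunked(grp, BULK_INSERT_LIMIT)
    }
    # -> {'debug': ['debug-ghpc-0', 'debug-ghpc-1'], 'gpu': ['gpu-ghpc-0']}
    return grouped_nodes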
def main(arg_nodes, arg_job_id):
    log.debug(f"deleting nodes:{arg_nodes} job_id:{arg_job_id}")
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              cache_discovery=False)

    # Get node list
    nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}",
                         check=True, get_stdout=True).stdout
    node_list = nodes_str.splitlines()

    # Get static node list
    exc_nodes_hostlist = util.run(
        f"{SCONTROL} show config | "
        "awk '/SuspendExcNodes.*=/{print $3}'",
        shell=True, get_stdout=True).stdout
    nodes_exc_str = util.run(f"{SCONTROL} show hostnames {exc_nodes_hostlist}",
                             check=True, get_stdout=True).stdout
    node_exc_list = sorted(nodes_exc_str.splitlines(), key=util.get_pid)

    # Generate new arg_nodes without static nodes
    dynamic_nodes = list((set(node_exc_list) ^ set(node_list)) & set(node_list))
    node_list = dynamic_nodes
    arg_nodes = util.to_hostlist(SCONTROL, dynamic_nodes)

    if len(node_list) == 0:
        log.debug("Static nodes removed from request. No nodes remain in request.")
        return

    pid = util.get_pid(node_list[0])
    if (arg_job_id and not cfg.instance_defs[pid].exclusive):
        # Don't delete from calls by EpilogSlurmctld
        return

    if arg_job_id:
        # Mark nodes as off limits to new jobs while powering down.
        # Note: If PrologSlurmctld fails with a non-zero exit code,
        # "powering_up" flag would get stuck on the node. In 20.11 and prior:
        # state=down followed by state=power_down could clear it. In 21.08,
        # state=power_down_force can clear it.
        util.run(
            f"{SCONTROL} update node={arg_nodes} state=power_down_force")

    while True:
        delete_instances(compute, node_list, arg_job_id)
        if not len(retry_list):
            break

        log.debug("got {} nodes to retry ({})"
                  .format(len(retry_list), ','.join(retry_list)))
        node_list = list(retry_list)
        del retry_list[:]

    if arg_job_id:
        for operation in operations.values():
            try:
                util.wait_for_operation(compute, cfg.project, operation)
            except Exception:
                log.exception(
                    f"Error in deleting {operation['name']} to slurm")
        # now that the instances are gone, resume to put back in service
        util.run(f"{SCONTROL} update node={arg_nodes} state=resume")

    log.debug("done deleting instances")

    if (arg_job_id and
            cfg.instance_defs[pid].enable_placement and
            cfg.instance_defs[pid].machine_type.split('-')[0] == "c2" and
            len(node_list) > 1):
        delete_placement_groups(compute, node_list, arg_job_id)

    log.info(f"done deleting nodes:{arg_nodes} job_id:{arg_job_id}")
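
# Illustration only (not part of the original scripts): the static-node filter
# in the main() variant above, (set(node_exc_list) ^ set(node_list)) &
# set(node_list), is equivalent to a plain set difference -- it keeps only the
# requested nodes that are not in the SuspendExcNodes list. Node names below
# are made up.
def _demo_static_filter():
    node_list = ["compute-0", "compute-1", "static-0"]   # nodes to suspend
    node_exc_list = ["static-0", "static-1"]             # excluded static nodes
    via_xor = (set(node_exc_list) ^ set(node_list)) & set(node_list)
    via_difference = set(node_list) - set(node_exc_list)
    assert via_xor == via_difference == {"compute-0", "compute-1"}
    return sorted(via_xor)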