def start_instances(compute, node_list):
    req_cnt = 0
    curr_batch = 0
    batch_list = []
    batch_list.insert(
        curr_batch,
        compute.new_batch_http_request(callback=start_instances_cb))

    for node in node_list:
        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=start_instances_cb))
        pid = util.get_pid(node)
        batch_list[curr_batch].add(
            compute.instances().start(project=cfg.project,
                                      zone=cfg.partitions[pid].zone,
                                      instance=node),
            request_id=node)
        req_cnt += 1
    try:
        for i, batch in enumerate(batch_list):
            batch.execute()
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in start batch: ")
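
# start_instances_cb is referenced above but not defined here. googleapiclient
# batch callbacks take (request_id, response, exception); below is a minimal
# sketch, assuming the module-level retry_list that the retry loops elsewhere
# in these scripts drain -- the real script's error handling may differ:
def start_instances_cb(request_id, response, exception):
    # request_id is the node name passed to batch.add(..., request_id=node)
    if exception is not None:
        log.error(f"start exception for node {request_id}: {exception}")
        # queue rate-limited requests for another pass
        if "Rate Limit Exceeded" in str(exception):
            retry_list.append(request_id)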
def main(arg_nodes, arg_job_id): log.debug(f"Bursting out: {arg_nodes} {arg_job_id}") # Get node list nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}", check=True, get_stdout=True).stdout node_list = sorted(nodes_str.splitlines(), key=util.get_pid) placement_groups = None pid = util.get_pid(node_list[0]) if (arg_job_id and not cfg.instance_defs[pid].exclusive): # Don't create from calls by PrologSlurmctld return nodes_by_pid = { k: tuple(nodes) for k, nodes in groupby(node_list, util.get_pid) } if not arg_job_id: for pid in [ pid for pid in nodes_by_pid if cfg.instance_defs[pid].exclusive ]: # Node was created by PrologSlurmctld, skip for ResumeProgram. del nodes_by_pid[pid] if (arg_job_id and cfg.instance_defs[pid].enable_placement): if cfg.instance_defs[pid].machine_type.split('-')[0] != "c2": msg = "Unsupported placement policy configuration. Please utilize c2 machine type." log.error(msg) hold_job(arg_job_id, msg) os._exit(1) elif len(node_list) > 1: log.debug(f"creating placement group for {arg_job_id}") placement_groups = create_placement_groups( arg_job_id, len(node_list), cfg.instance_defs[pid].region) def chunks(lst, pg_names): """ group list into chunks of max size n """ n = 1000 if pg_names: n = PLACEMENT_MAX_CNT pg_index = 0 for i in range(0, len(lst), n): chunk = dict(nodes=lst[i:i + n]) if pg_names: chunk['pg'] = pg_names[pg_index] pg_index += 1 yield chunk # concurrently add nodes grouped by instance_def (pid), max 1000 with ThreadPoolExecutor() as exe: node_chunks = chain.from_iterable( map(partial(chunks, pg_names=placement_groups), nodes_by_pid.values())) exe.map(add_instances, node_chunks) log.info(f"done adding instances: {arg_nodes} {arg_job_id}")
def main(arg_nodes, arg_job_id): log.debug(f"deleting nodes:{arg_nodes} job_id:{job_id}") compute = googleapiclient.discovery.build('compute', 'v1', cache_discovery=False) # Get node list nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}", check=True, get_stdout=True).stdout node_list = nodes_str.splitlines() pid = util.get_pid(node_list[0]) if (arg_job_id and not cfg.instance_defs[pid].exclusive): # Don't delete from calls by EpilogSlurmctld return if arg_job_id: # Mark nodes as off limits to new jobs while powering down. # Have to use "down" because it's the only, current, way to remove the # power_up flag from the node -- followed by a power_down -- if the # PrologSlurmctld fails with a non-zero exit code. util.run( f"{SCONTROL} update node={arg_nodes} state=down reason='{arg_job_id} finishing'" ) # Power down nodes in slurm, so that they will become available again. util.run(f"{SCONTROL} update node={arg_nodes} state=power_down") while True: delete_instances(compute, node_list, arg_job_id) if not len(retry_list): break log.debug("got {} nodes to retry ({})".format(len(retry_list), ','.join(retry_list))) node_list = list(retry_list) del retry_list[:] if arg_job_id: for operation in operations.values(): try: util.wait_for_operation(compute, cfg.project, operation) # now that the instance is gone, resume to put back in service util.run(f"{SCONTROL} update node={arg_nodes} state=resume") except Exception: log.exception( f"Error in deleting {operation['name']} to slurm") log.debug("done deleting instances") if (arg_job_id and cfg.instance_defs[pid].enable_placement and cfg.instance_defs[pid].machine_type.split('-')[0] == "c2" and len(node_list) > 1): delete_placement_groups(compute, node_list, arg_job_id) log.info(f"done deleting nodes:{arg_nodes} job_id:{job_id}")
def add_instances(node_chunk):
    node_list = node_chunk['nodes']
    pg_name = None
    if 'pg' in node_chunk:
        pg_name = node_chunk['pg']
    log.debug(f"node_list:{node_list} pg:{pg_name}")

    auth_http = None
    if not cfg.google_app_cred_path:
        http = set_user_agent(httplib2.Http(),
                              "Slurm_GCP_Scripts/1.2 (GPN:SchedMD)")
        creds = compute_engine.Credentials()
        auth_http = google_auth_httplib2.AuthorizedHttp(creds, http=http)
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              http=auth_http,
                                              cache_discovery=False)
    pid = util.get_pid(node_list[0])
    instance_def = cfg.instance_defs[pid]

    try:
        operation = create_instance(compute, instance_def, node_list, pg_name)
    except googleapiclient.errors.HttpError as e:
        log.error(
            f"failed to add {node_list[0]}*{len(node_list)} to slurm, {e}")
        if instance_def.exclusive:
            os._exit(1)
        down_nodes(node_list, e)
        return

    result = util.wait_for_operation(compute, cfg.project, operation)
    if not result or 'error' in result:
        # result can be None if the operation never completed; guard before
        # dereferencing it
        grp_err_msg = (result['error']['errors'][0]['message'] if result
                       else "operation did not complete")
        log.error(f"group operation failed: {grp_err_msg}")
        if instance_def.exclusive:
            os._exit(1)
        if not result:
            return

        group_ops = util.get_group_operations(compute, cfg.project, result)
        failed_nodes = {}
        for op in group_ops['items']:
            if op['operationType'] != 'insert':
                continue
            if 'error' in op:
                err_msg = op['error']['errors'][0]['message']
                failed_node = op['targetLink'].split('/')[-1]
                if err_msg not in failed_nodes:
                    failed_nodes[err_msg] = [failed_node]
                else:
                    failed_nodes[err_msg].append(failed_node)
        if failed_nodes:
            log.error(f"insert requests failed: {failed_nodes}")
            for msg, nodes in failed_nodes.items():
                down_nodes(nodes, msg)
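
# down_nodes() is referenced above but not shown. A hedged sketch of such a
# helper, modeled on the scontrol calls used elsewhere in these scripts --
# the real implementation may compress the list into a hostlist first:
def down_nodes(node_list, reason):
    """Mark the given nodes down in slurm so no new jobs land on them."""
    util.run(f"{SCONTROL} update nodename={','.join(node_list)} state=down "
             f"reason='{reason}'")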
def delete_instances(compute, node_list, arg_job_id):
    batch_list = []
    curr_batch = 0
    req_cnt = 0
    batch_list.insert(
        curr_batch,
        compute.new_batch_http_request(callback=delete_instances_cb))

    for node_name in node_list:
        pid = util.get_pid(node_name)
        if (not arg_job_id and cfg.instance_defs[pid].exclusive):
            # Node was deleted by EpilogSlurmctld, skip for SuspendProgram
            continue

        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=delete_instances_cb))

        zone = None
        if cfg.instance_defs[pid].regional_capacity:
            node_find = util.ensure_execute(
                compute.instances().aggregatedList(
                    project=cfg.project, filter=f'name={node_name}'))
            for key, zone_value in node_find['items'].items():
                if 'instances' in zone_value:
                    zone = zone_value['instances'][0]['zone'].split('/')[-1]
                    break
            if zone is None:
                log.error(
                    f"failed to find regional node '{node_name}' to delete")
                continue
        else:
            zone = cfg.instance_defs[pid].zone

        batch_list[curr_batch].add(
            compute.instances().delete(project=cfg.project,
                                       zone=zone,
                                       instance=node_name),
            request_id=node_name)
        req_cnt += 1
    try:
        for i, batch in enumerate(batch_list):
            util.ensure_execute(batch)
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in batch:")
def delete_instances(compute, node_list, arg_job_id):
    batch_list = []
    curr_batch = 0
    req_cnt = 0
    batch_list.insert(
        curr_batch,
        compute.new_batch_http_request(callback=delete_instances_cb))

    def_list = {pid: cfg.instance_defs[pid]
                for pid, nodes in groupby(node_list, util.get_pid)}
    regional_instances = util.get_regional_instances(compute, cfg.project,
                                                     def_list)

    for node_name in node_list:
        pid = util.get_pid(node_name)
        if (not arg_job_id and cfg.instance_defs[pid].exclusive):
            # Node was deleted by EpilogSlurmctld, skip for SuspendProgram
            continue

        zone = None
        if cfg.instance_defs[pid].regional_capacity:
            instance = regional_instances.get(node_name, None)
            if instance is None:
                log.debug("Regional node not found. Already deleted?")
                continue
            zone = instance['zone'].split('/')[-1]
        else:
            zone = cfg.instance_defs[pid].zone

        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=delete_instances_cb))

        batch_list[curr_batch].add(
            compute.instances().delete(project=cfg.project,
                                       zone=zone,
                                       instance=node_name),
            request_id=node_name)
        req_cnt += 1
    try:
        for i, batch in enumerate(batch_list):
            util.ensure_execute(batch)
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in batch:")
def delete_placement_groups(compute, node_list, arg_job_id):
    PLACEMENT_MAX_CNT = 22
    pg_ops = []
    pid = util.get_pid(node_list[0])

    # one placement group was created per PLACEMENT_MAX_CNT nodes
    pg_count = (len(node_list) + PLACEMENT_MAX_CNT - 1) // PLACEMENT_MAX_CNT
    for pg_index in range(1, pg_count + 1):
        pg_name = f'{cfg.cluster_name}-{arg_job_id}-{pg_index}'
        pg_ops.append(compute.resourcePolicies().delete(
            project=cfg.project,
            region=cfg.instance_defs[pid].region,
            resourcePolicy=pg_name).execute())
    for operation in pg_ops:
        util.wait_for_operation(compute, cfg.project, operation)
    log.debug("done deleting pg")
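
# Worked example: for len(node_list) == 50 and PLACEMENT_MAX_CNT == 22,
# pg_count == (50 + 21) // 22 == 3, so pg_index takes 1, 2, 3 and the
# policies deleted are {cluster_name}-{arg_job_id}-1 through -3 --
# mirroring the names assigned when the groups were created.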
def update_slurm_node_addrs(compute):
    for node_name, operation in operations.items():
        try:
            # Wait for the insert operation for this instance to finish,
            # then look up its internal IP and update the node's addr in
            # slurm.
            wait_for_operation(compute, cfg.project, operation)

            pid = util.get_pid(node_name)
            my_fields = 'networkInterfaces(name,network,networkIP,subnetwork)'
            instance_networks = compute.instances().get(
                project=cfg.project, zone=cfg.partitions[pid].zone,
                instance=node_name, fields=my_fields).execute()
            instance_ip = (
                instance_networks['networkInterfaces'][0]['networkIP'])

            util.run(
                f"{SCONTROL} update node={node_name} nodeaddr={instance_ip}")

            log.info(f"Instance {node_name} is now up")
        except Exception:
            log.exception(f"Error in adding {node_name} to slurm")
def get_source_image(compute, node_name):
    images = get_source_image.images
    pid = util.get_pid(node_name)
    if pid not in images:
        image_name = f"{cfg.compute_node_prefix}-{pid}-image"
        family = (cfg.partitions[pid].compute_image_family
                  or f"{image_name}-family")
        try:
            image_response = compute.images().getFromFamily(
                project=cfg.project, family=family).execute()
            if image_response['status'] != 'READY':
                raise Exception("Image not ready")
            source_disk_image = image_response['selfLink']
        except Exception as e:
            log.error(f"Image {family} unavailable: {e}")
            sys.exit()
        images[pid] = source_disk_image
    return images[pid]
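
# get_source_image caches one image per pid on the function object itself,
# so the attribute must exist before the first call -- presumably a
# module-level line like this accompanies the function:
get_source_image.images = {}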
def start_instances(compute, node_list, gcp_nodes):
    req_cnt = 0
    curr_batch = 0
    batch_list = []
    batch_list.insert(
        curr_batch,
        compute.new_batch_http_request(callback=start_instances_cb))

    for node in node_list:
        pid = util.get_pid(node)
        zone = cfg.instance_defs[pid].zone
        if cfg.instance_defs[pid].regional_capacity:
            g_node = gcp_nodes.get(node, None)
            if not g_node:
                log.error(f"Didn't find regional GCP record for '{node}'")
                continue
            zone = g_node['zone'].split('/')[-1]

        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=start_instances_cb))

        batch_list[curr_batch].add(
            compute.instances().start(project=cfg.project,
                                      zone=zone,
                                      instance=node),
            request_id=node)
        req_cnt += 1
    try:
        for i, batch in enumerate(batch_list):
            util.ensure_execute(batch)
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in start batch: ")
def add_instances(compute, node_list):
    batch_list = []
    curr_batch = 0
    req_cnt = 0
    batch_list.insert(
        curr_batch,
        compute.new_batch_http_request(callback=added_instances_cb))

    for node_name in node_list:
        if req_cnt >= TOT_REQ_CNT:
            req_cnt = 0
            curr_batch += 1
            batch_list.insert(
                curr_batch,
                compute.new_batch_http_request(callback=added_instances_cb))

        source_disk_image = get_source_image(compute, node_name)

        pid = util.get_pid(node_name)
        batch_list[curr_batch].add(
            create_instance(compute, cfg.partitions[pid].zone,
                            cfg.partitions[pid].machine_type, node_name,
                            source_disk_image),
            request_id=node_name)
        req_cnt += 1
    try:
        for i, batch in enumerate(batch_list):
            # http: authorized httplib2 object, assumed set up elsewhere in
            # this script
            batch.execute(http=http)
            if i < (len(batch_list) - 1):
                time.sleep(30)
    except Exception:
        log.exception("error in add batch")

    if cfg.update_node_addrs:
        update_slurm_node_addrs(compute)
def main():
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              cache_discovery=False)

    try:
        s_nodes = dict()
        cmd = (f"{SCONTROL} show nodes | "
               r"grep -oP '^NodeName=\K(\S+)|State=\K(\S+)' | "
               "paste -sd',\n'")
        nodes = util.run(cmd, shell=True, check=True, get_stdout=True).stdout
        if nodes:
            # result is a list of tuples like:
            # (nodename, (base='base_state', flags=<set of state flags>))
            # from 'nodename,base_state+flag1+flag2'
            # state flags include: CLOUD, COMPLETING, DRAIN, FAIL, POWER,
            #   POWERING_DOWN
            # Modifiers on base state still include: @ (reboot), $ (maint),
            #   * (nonresponsive), # (powering up)
            StateTuple = collections.namedtuple('StateTuple', 'base,flags')

            def make_state_tuple(state):
                return StateTuple(state[0], set(state[1:]))
            s_nodes = [(node, make_state_tuple(args.split('+')))
                       for node, args in
                       map(lambda x: x.split(','),
                           nodes.rstrip().splitlines())
                       if 'CLOUD' in args]

        g_nodes = []
        for i, part in enumerate(cfg.partitions):
            page_token = ""
            while True:
                resp = compute.instances().list(
                    project=cfg.project, zone=part.zone,
                    pageToken=page_token,
                    filter=f"name={cfg.compute_node_prefix}-{i}-*").execute()
                if "items" in resp:
                    g_nodes.extend(resp['items'])
                if "nextPageToken" in resp:
                    page_token = resp['nextPageToken']
                    continue
                break

        to_down = []
        to_idle = []
        to_start = []
        for s_node, s_state in s_nodes:
            g_node = next((item for item in g_nodes
                           if item["name"] == s_node), None)
            pid = util.get_pid(s_node)

            if (('POWER' not in s_state.flags) and
                    ('POWERING_DOWN' not in s_state.flags)):
                # slurm nodes that aren't in power_save and are stopped in
                # GCP: mark down in slurm; start them in gcp
                if g_node and (g_node['status'] == "TERMINATED"):
                    if not s_state.base.startswith('DOWN'):
                        to_down.append(s_node)
                    if cfg.partitions[pid].preemptible_bursting:
                        to_start.append(s_node)

                # can't check if the node doesn't exist in GCP while the
                # node is booting because it might not have been created
                # yet by the resume script. This should catch the
                # completing states as well.
                if (g_node is None and "#" not in s_state.base and
                        not s_state.base.startswith('DOWN')):
                    to_down.append(s_node)

            elif g_node is None:
                # find nodes that are down~ in slurm and don't exist in
                # gcp: mark idle~
                if (s_state.base.startswith('DOWN') and
                        'POWER' in s_state.flags):
                    to_idle.append(s_node)
                elif 'POWERING_DOWN' in s_state.flags:
                    to_idle.append(s_node)
                elif s_state.base.startswith('COMPLETING'):
                    to_down.append(s_node)

        if len(to_down):
            log.info("{} stopped/deleted instances ({})".format(
                len(to_down), ",".join(to_down)))
            log.info("{} instances to start ({})".format(
                len(to_start), ",".join(to_start)))

            # write hosts to a file that can be given to scontrol to get a
            # slurm hostlist, since the number of hosts could be large.
            tmp_file = tempfile.NamedTemporaryFile(mode='w+t', delete=False)
            tmp_file.writelines("\n".join(to_down))
            tmp_file.close()
            log.debug("tmp_file = {}".format(tmp_file.name))

            hostlist = util.run(f"{SCONTROL} show hostlist {tmp_file.name}",
                                check=True, get_stdout=True).stdout
            log.debug("hostlist = {}".format(hostlist))
            os.remove(tmp_file.name)

            util.run(f"{SCONTROL} update nodename={hostlist} state=down "
                     "reason='Instance stopped/deleted'")

            while True:
                start_instances(compute, to_start)
                if not len(retry_list):
                    break
                log.debug("got {} nodes to retry ({})".format(
                    len(retry_list), ','.join(retry_list)))
                to_start = list(retry_list)
                del retry_list[:]

        if len(to_idle):
            log.info("{} instances to resume ({})".format(
                len(to_idle), ','.join(to_idle)))

            # write hosts to a file that can be given to scontrol to get a
            # slurm hostlist, since the number of hosts could be large.
            tmp_file = tempfile.NamedTemporaryFile(mode='w+t', delete=False)
            tmp_file.writelines("\n".join(to_idle))
            tmp_file.close()
            log.debug("tmp_file = {}".format(tmp_file.name))

            hostlist = util.run(f"{SCONTROL} show hostlist {tmp_file.name}",
                                check=True, get_stdout=True).stdout
            log.debug("hostlist = {}".format(hostlist))
            os.remove(tmp_file.name)

            util.run(f"{SCONTROL} update nodename={hostlist} state=resume")

    except Exception:
        log.exception("failed to sync instances")
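
# The later sync code calls a to_hostlist() helper instead of repeating the
# temp-file dance above. A plausible sketch following exactly that pattern
# (the project's real helper may differ):
def to_hostlist(nodes):
    """Compress a (possibly large) node list into a slurm hostlist string."""
    tmp_file = tempfile.NamedTemporaryFile(mode='w+t', delete=False)
    tmp_file.write("\n".join(nodes))
    tmp_file.close()
    hostlist = util.run(f"{SCONTROL} show hostlist {tmp_file.name}",
                        check=True, get_stdout=True).stdout.rstrip()
    os.remove(tmp_file.name)
    return hostlist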
def create_instance(compute, zone, machine_type, instance_name,
                    source_disk_image):
    pid = util.get_pid(instance_name)

    # Configure the machine
    machine_type_path = f'zones/{zone}/machineTypes/{machine_type}'
    disk_type = 'projects/{}/zones/{}/diskTypes/{}'.format(
        cfg.project, zone, cfg.partitions[pid].compute_disk_type)

    config = {
        'name': instance_name,
        'machineType': machine_type_path,

        # Specify the boot disk and the image to use as a source.
        'disks': [{
            'boot': True,
            'autoDelete': True,
            'initializeParams': {
                'sourceImage': source_disk_image,
                'diskType': disk_type,
                'diskSizeGb': cfg.partitions[pid].compute_disk_size_gb
            }
        }],

        # Specify a network interface
        'networkInterfaces': [{
            'subnetwork': (
                "projects/{}/regions/{}/subnetworks/{}".format(
                    cfg.shared_vpc_host_project or cfg.project,
                    cfg.partitions[pid].region,
                    (cfg.partitions[pid].vpc_subnet or
                     f'{cfg.cluster_name}-{cfg.partitions[pid].region}'))
            ),
        }],

        # Allow the instance to access cloud storage and logging.
        'serviceAccounts': [{
            'email': cfg.compute_node_service_account,
            'scopes': cfg.compute_node_scopes
        }],

        'tags': {'items': ['compute']},

        'metadata': {
            'items': [
                {'key': 'enable-oslogin', 'value': 'TRUE'},
                {'key': 'VmDnsSetting', 'value': 'GlobalOnly'}
            ]
        }
    }

    shutdown_script_path = Path('/apps/slurm/scripts/compute-shutdown')
    if shutdown_script_path.exists():
        config['metadata']['items'].append({
            'key': 'shutdown-script',
            'value': shutdown_script_path.read_text()
        })

    if cfg.partitions[pid].gpu_type:
        accel_type = ('https://www.googleapis.com/compute/v1/projects/'
                      '{}/zones/{}/acceleratorTypes/{}'.format(
                          cfg.project, zone, cfg.partitions[pid].gpu_type))
        config['guestAccelerators'] = [{
            'acceleratorCount': cfg.partitions[pid].gpu_count,
            'acceleratorType': accel_type
        }]
        # GPU instances must terminate, not live-migrate, on maintenance
        config['scheduling'] = {'onHostMaintenance': 'TERMINATE'}

    # Note: the original had stray trailing commas on the next three
    # assignments, which turned each value into a one-element tuple.
    if cfg.partitions[pid].preemptible_bursting:
        config['scheduling'] = {
            'preemptible': True,
            'onHostMaintenance': 'TERMINATE',
            'automaticRestart': False
        }

    if cfg.partitions[pid].compute_labels:
        config['labels'] = cfg.partitions[pid].compute_labels

    if cfg.partitions[pid].cpu_platform:
        config['minCpuPlatform'] = cfg.partitions[pid].cpu_platform

    if cfg.external_compute_ips:
        config['networkInterfaces'][0]['accessConfigs'] = [
            {'type': 'ONE_TO_ONE_NAT', 'name': 'External NAT'}
        ]

    return compute.instances().insert(project=cfg.project,
                                      zone=zone,
                                      body=config)
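
# create_instance() only builds the request; nothing is sent until it is
# executed, either via the batches above or directly. A minimal direct-call
# sketch (variable names hypothetical):
#
#   request = create_instance(compute, zone, machine_type, node_name, image)
#   operation = request.execute()  # returns a zone operation resource
#   util.wait_for_operation(compute, cfg.project, operation)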
def main():
    compute = googleapiclient.discovery.build('compute', 'v1',
                                              cache_discovery=False)

    try:
        s_nodes = dict()
        cmd = (f"{SCONTROL} show nodes | "
               r"grep -oP '^NodeName=\K(\S+)|State=\K(\S+)' | "
               "paste -sd',\n'")
        nodes = util.run(cmd, shell=True, check=True, get_stdout=True).stdout
        if nodes:
            # result is a dict of entries like:
            # {nodename: (base='base_state', flags=<set of state flags>)}
            # from 'nodename,base_state+flag1+flag2'
            # state flags include: CLOUD, COMPLETING, DRAIN, FAIL,
            #   POWERED_DOWN, POWERING_DOWN
            # Modifiers on base state still include: @ (reboot), $ (maint),
            #   * (nonresponsive), # (powering up)
            StateTuple = collections.namedtuple('StateTuple', 'base,flags')

            def make_state_tuple(state):
                return StateTuple(state[0], set(state[1:]))
            s_nodes = {node: make_state_tuple(args.split('+'))
                       for node, args in
                       map(lambda x: x.split(','),
                           nodes.rstrip().splitlines())
                       if 'CLOUD' in args}

        g_nodes = util.get_regional_instances(compute, cfg.project,
                                              cfg.instance_defs)
        for pid, part in cfg.instance_defs.items():
            page_token = ""
            while True:
                if not part.regional_capacity:
                    resp = util.ensure_execute(
                        compute.instances().list(
                            project=cfg.project, zone=part.zone,
                            fields='items(name,zone,status),nextPageToken',
                            pageToken=page_token, filter=f"name={pid}-*"))
                    if "items" in resp:
                        g_nodes.update(
                            {instance['name']: instance
                             for instance in resp['items']})
                    if "nextPageToken" in resp:
                        page_token = resp['nextPageToken']
                        continue
                break

        to_down = []
        to_idle = []
        to_start = []
        for s_node, s_state in s_nodes.items():
            g_node = g_nodes.get(s_node, None)
            pid = util.get_pid(s_node)

            if (('POWERED_DOWN' not in s_state.flags) and
                    ('POWERING_DOWN' not in s_state.flags)):
                # slurm nodes that aren't powered down and are stopped in
                # GCP: mark down in slurm; start them in gcp
                if g_node and (g_node['status'] == "TERMINATED"):
                    if not s_state.base.startswith('DOWN'):
                        to_down.append(s_node)
                    if cfg.instance_defs[pid].preemptible_bursting != 'false':
                        to_start.append(s_node)

                # can't check if the node doesn't exist in GCP while the
                # node is booting because it might not have been created
                # yet by the resume script. This should catch the
                # completing states as well.
                if (g_node is None and
                        "POWERING_UP" not in s_state.flags and
                        not s_state.base.startswith('DOWN')):
                    to_down.append(s_node)

            elif g_node is None:
                # find nodes that are down~ in slurm and don't exist in
                # gcp: mark idle~
                if (s_state.base.startswith('DOWN') and
                        'POWERED_DOWN' in s_state.flags):
                    to_idle.append(s_node)
                elif 'POWERING_DOWN' in s_state.flags:
                    to_idle.append(s_node)
                elif s_state.base.startswith('COMPLETING'):
                    to_down.append(s_node)

        if len(to_down):
            log.info("{} stopped/deleted instances ({})".format(
                len(to_down), ",".join(to_down)))
            log.info("{} instances to start ({})".format(
                len(to_start), ",".join(to_start)))

            hostlist = to_hostlist(to_down)
            util.run(f"{SCONTROL} update nodename={hostlist} state=down "
                     "reason='Instance stopped/deleted'")

            while True:
                start_instances(compute, to_start, g_nodes)
                if not len(retry_list):
                    break
                log.debug("got {} nodes to retry ({})".format(
                    len(retry_list), ','.join(retry_list)))
                to_start = list(retry_list)
                del retry_list[:]

        if len(to_idle):
            log.info("{} instances to resume ({})".format(
                len(to_idle), ','.join(to_idle)))

            hostlist = to_hostlist(to_idle)
            util.run(f"{SCONTROL} update nodename={hostlist} state=resume")

        orphans = [
            inst for inst, info in g_nodes.items()
            if info['status'] == 'RUNNING' and
            (inst not in s_nodes or 'POWERED_DOWN' in s_nodes[inst].flags)
        ]
        if orphans:
            if args.debug:
                for orphan in orphans:
                    info = g_nodes.get(orphan)
                    state = s_nodes.get(orphan, None)
                    log.debug(f"orphan {orphan}: status={info['status']} "
                              f"state={state}")
            hostlist = to_hostlist(orphans)
            log.info(f"{len(orphans)} orphan instances found to terminate: "
                     f"{hostlist}")
            util.run(f"{SCRIPTS_DIR}/suspend.py {hostlist}")

    except Exception:
        log.exception("failed to sync instances")
def find_webviews(self, dev):
    if self.appname is None:
        raise Exception("WebGrabber.find_webviews without appname")
    app_pid = util.get_pid(dev, self.appname)
    if app_pid is None:
        return []

    # fwd_port is assumed to be defined at module scope
    logger.info(
        "forwarding tcp:%d to localabstract:webview_devtools_remote_%s"
        % (fwd_port, app_pid))
    dev.run_adb_cmd(
        "forward",
        "tcp:%d localabstract:webview_devtools_remote_%s"
        % (fwd_port, app_pid))

    count = 0
    rets = []
    try:
        req = urllib.request.urlopen("http://127.0.0.1:%d/json" % fwd_port)
        ret = json.loads(req.read().decode('utf-8'))
    except Exception:
        logger.warning("failed to connect to webview")
        return []

    for item in ret:
        try:
            desc = json.loads(item['description'])
        except Exception:
            logger.exception("failed to parse description %s",
                             item['description'])
            raise
        if not desc['attached']:
            continue
        count += 1

        base_x = desc['screenX']
        base_y = desc['screenY']
        if 'width' in desc:
            base_w = desc['width']
            base_h = desc['height']
        else:
            base_w = base_h = 0
        empty = desc['empty'] if 'empty' in desc else True

        logger.info("found %s %s", item['url'].split('?')[0], item['title'])
        rets.append({
            'base_x': base_x,
            'base_y': base_y,
            'base_w': base_w,
            'base_h': base_h,
            'url': item['url'],
            'ws_url': item['webSocketDebuggerUrl'],
            'title': item['title'],
            'empty': empty
        })
    logger.info("captured %d webviews", count)
    return rets
def main(arg_nodes, arg_job_id): log.debug(f"deleting nodes:{arg_nodes} job_id:{job_id}") compute = googleapiclient.discovery.build('compute', 'v1', cache_discovery=False) # Get node list nodes_str = util.run(f"{SCONTROL} show hostnames {arg_nodes}", check=True, get_stdout=True).stdout node_list = nodes_str.splitlines() # Get static node list exc_nodes_hostlist = util.run( f"{SCONTROL} show config | " "awk '/SuspendExcNodes.*=/{print $3}'", shell=True, get_stdout=True).stdout nodes_exc_str = util.run(f"{SCONTROL} show hostnames {exc_nodes_hostlist}", check=True, get_stdout=True).stdout node_exc_list = sorted(nodes_exc_str.splitlines(), key=util.get_pid) # Generate new arg_nodes without static nodes dynamic_nodes = list((set(node_exc_list) ^ set(node_list)) & set(node_list)) node_list = dynamic_nodes arg_nodes = util.to_hostlist(SCONTROL, dynamic_nodes) if len(node_list) == 0: log.debug(f"Static nodes removed from request. No nodes remain in request.") return pid = util.get_pid(node_list[0]) if (arg_job_id and not cfg.instance_defs[pid].exclusive): # Don't delete from calls by EpilogSlurmctld return if arg_job_id: # Mark nodes as off limits to new jobs while powering down. # Note: If PrologSlurmctld fails with a non-zero exit code, # "powering_up" flag would get stuck on the node. In 20.11 and prior: # state=down followed by state=power_down could clear it. In 21.08, # state=power_down_force can clear it. util.run( f"{SCONTROL} update node={arg_nodes} state=power_down_force") while True: delete_instances(compute, node_list, arg_job_id) if not len(retry_list): break log.debug("got {} nodes to retry ({})" .format(len(retry_list), ','.join(retry_list))) node_list = list(retry_list) del retry_list[:] if arg_job_id: for operation in operations.values(): try: util.wait_for_operation(compute, cfg.project, operation) except Exception: log.exception(f"Error in deleting {operation['name']} to slurm") # now that the instances are gone, resume to put back in service util.run(f"{SCONTROL} update node={arg_nodes} state=resume") log.debug("done deleting instances") if (arg_job_id and cfg.instance_defs[pid].enable_placement and cfg.instance_defs[pid].machine_type.split('-')[0] == "c2" and len(node_list) > 1): delete_placement_groups(compute, node_list, arg_job_id) log.info(f"done deleting nodes:{arg_nodes} job_id:{job_id}")