def analyze(config: Dict, job_id: str, wide: bool = False) -> None:
    if not wide:
        try:
            _, columns_str = os.popen("stty size", "r").read().split()
        except Exception:
            columns_str = "120"
        columns = int(columns_str)
    else:
        columns = 120

    ctx = DefaultContextHandler("[demand-cli]")
    register_result_handler(ctx)

    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)

    autoscaler.calculate_demand(config, ge_env, ge_driver, ctx)

    key = "[job {}]".format(job_id)
    results = ctx.by_context[key]
    for result in results:
        if isinstance(result, (EarlyBailoutResult, MatchResult)) and result:
            continue

        if isinstance(result, HostgroupConstraint) and not result:
            continue

        if wide:
            print(result.message)
        else:
            print(result.message[:columns])
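
def _example_analyze(config: Dict) -> None:
    # Illustrative sketch, not part of the original module: explain why a
    # job is or is not matched. The job id "42" is made up; wide=True skips
    # truncating messages to the terminal width.
    analyze(config, job_id="42", wide=True)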
def resources(config: Dict, constraint_expr: str) -> None:
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    node_mgr = new_node_manager(config, existing_nodes=ge_driver)
    filtered = _query_with_constraints(
        config, constraint_expr, node_mgr.get_buckets()
    )

    columns = set()
    for node in filtered:
        columns.update(set(node.resources.keys()))

    # The original ended with a bare `config["output_columns"]`, which is a
    # no-op; printing the collected resource names is an assumed completion.
    for column in sorted(columns):
        print(column)
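
def _example_resources(config: Dict) -> None:
    # Illustrative sketch, not part of the original module: list every
    # resource name reported across all buckets. An empty constraint
    # expression is assumed here to match everything.
    resources(config, constraint_expr="")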
def nodes(
    config: Dict,
    constraint_expr: str,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Query nodes"""
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    dcalc = autoscaler.new_demand_calculator(config, ge_env, ge_driver)
    filtered = _query_with_constraints(
        config, constraint_expr, dcalc.node_mgr.get_nodes()
    )

    demand_result = DemandResult([], filtered, [], [])
    # output_format was accepted but never used; pass it through
    autoscaler.print_demand(config, demand_result, output_columns, output_format)
def complexes(config: Dict, include_irrelevant: bool = False) -> None:
    """Prints out, by default, only relevant complexes"""
    relevant: typing.Optional[typing.Set[str]]
    if include_irrelevant:
        ge_config = config.get("gridengine", {})
        if "relevant_complexes" in ge_config:
            ge_config.pop("relevant_complexes")

    relevant = set(config.get("gridengine", {}).get("relevant_complexes", []))

    ge_env = from_qconf(config)
    already_printed: typing.Set[str] = set()
    for complex in ge_env.complexes.values():
        # parenthesized so the already_printed dedup check also applies when
        # include_irrelevant is set; the original relied on `or` binding
        # looser than `and`, which skipped the check and printed duplicates
        if (
            include_irrelevant or complex.name in relevant
        ) and complex.name not in already_printed:
            print(repr(complex))
            already_printed.add(complex.name)
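
def _example_relevant_complexes_config() -> Dict:
    # Illustrative sketch, not part of the original module: the
    # "gridengine.relevant_complexes" setting that complexes() consults.
    # The complex names listed are made up.
    return {"gridengine": {"relevant_complexes": ["slots", "mem_free", "exclusive"]}}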
def buckets(
    config: Dict,
    constraint_expr: str,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Prints out autoscale bucket information, like limits etc"""
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)
    node_mgr = new_node_manager(config)

    specified_output_columns = output_columns
    output_columns = output_columns or [
        "nodearray",
        "placement_group",
        "vm_size",
        "vcpu_count",
        "pcpu_count",
        "memory",
        "available_count",
    ]

    if specified_output_columns is None:
        for bucket in node_mgr.get_buckets():
            for resource_name in bucket.resources:
                if resource_name not in output_columns:
                    output_columns.append(resource_name)

            # surface integer *count* limits (e.g. available_count) as
            # resources so they can be printed; this loop was previously
            # outside the bucket loop and only applied to the last bucket
            for attr in dir(bucket.limits):
                if attr[0].isalpha() and "count" in attr:
                    value = getattr(bucket.limits, attr)
                    if isinstance(value, int):
                        bucket.resources[attr] = value
                        bucket.example_node._resources[attr] = value

    filtered = _query_with_constraints(config, constraint_expr, node_mgr.get_buckets())

    demand_result = DemandResult([], [f.example_node for f in filtered], [], [])

    if "all" in output_columns:
        output_columns = ["all"]
    config["output_columns"] = output_columns

    autoscaler.print_demand(config, demand_result, output_columns, output_format)
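
def _example_buckets(config: Dict) -> None:
    # Illustrative sketch, not part of the original module: restrict the
    # bucket report to a few of the default columns. The empty constraint
    # expression is assumed to match everything.
    buckets(
        config,
        constraint_expr="",
        output_columns=["nodearray", "vm_size", "available_count"],
    )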
def demand(
    config: Dict,
    jobs: Optional[str] = None,
    scheduler_nodes: Optional[str] = None,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Runs autoscale in dry run mode to see the demand for new nodes"""
    logging.debug("Begin demand")
    ctx = DefaultContextHandler("[demand-cli]")
    register_result_handler(ctx)

    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)

    demand_calc = autoscaler.calculate_demand(config, ge_env, ge_driver, ctx)
    demand_result = demand_calc.finish()

    autoscaler.print_demand(config, demand_result, output_columns, output_format)
    logging.debug("End demand")
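
def _example_demand(config: Dict) -> None:
    # Illustrative sketch, not part of the original module: a dry-run demand
    # report with hand-picked columns. "table" mirrors the output format
    # used by queues() below.
    demand(config, output_columns=["nodearray", "vm_size"], output_format="table")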
def _find_nodes(
    config: Dict, hostnames: List[str], node_names: List[str]
) -> Tuple[GridEngineDriver, DemandCalculator, List[Node]]:
    hostnames = hostnames or []
    node_names = node_names or []

    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)

    demand_calc = autoscaler.calculate_demand(config, ge_env, ge_driver)
    demand_result = demand_calc.finish()

    by_hostname = partition_single(
        demand_result.compute_nodes, lambda n: n.hostname_or_uuid.lower()
    )
    by_node_name = partition_single(
        demand_result.compute_nodes, lambda n: n.name.lower()
    )

    found_nodes = []
    for hostname in hostnames:
        if not hostname:
            error("Please specify a hostname")
        if hostname.lower() not in by_hostname:
            # it doesn't exist in CC, but we still want to delete it
            # from the cluster
            by_hostname[hostname.lower()] = SchedulerNode(hostname, {})

        found_nodes.append(by_hostname[hostname.lower()])

    for node_name in node_names:
        if not node_name:
            error("Please specify a node_name")
        if node_name.lower() not in by_node_name:
            error(
                "Could not find a CycleCloud node that has node_name %s."
                + " Run 'nodes' to see available nodes.",
                node_name,
            )
        found_nodes.append(by_node_name[node_name.lower()])

    return ge_driver, demand_calc, found_nodes
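
def _example_find_nodes(config: Dict) -> None:
    # Illustrative sketch, not part of the original module: resolve nodes by
    # hostname and by CycleCloud node name. Both names here are made up.
    ge_driver, demand_calc, found = _find_nodes(
        config, hostnames=["ip-0A000004"], node_names=["execute-1"]
    )
    for node in found:
        print(node.name, node.hostname_or_uuid)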
def queues(config: Dict) -> None:
    ge_env = environment.from_qconf(config)
    schedulers = ge_env.qbin.qconf(["-sss"]).split()
    rows: List[List[str]] = []

    for qname, ge_queue in ge_env.queues.items():
        for hgrp in ge_queue.hostlist_groups:
            fqdns = ge_env.qbin.qconf(["-shgrp", hgrp]).splitlines()
            for line in fqdns:
                line = line.strip()
                if not line:
                    continue
                if line.startswith("group_name"):
                    continue
                # trim this out
                if line.startswith("hostlist "):
                    line = line[len("hostlist ") :]  # noqa: E203

                for fqdn_expr in line.split():
                    fqdn_expr = fqdn_expr.strip()
                    if not fqdn_expr or fqdn_expr == "\\":
                        continue
                    host = fqdn_expr.split(".")[0]
                    if host in schedulers:
                        continue
                    rows.append([qname, hgrp, host])

    demandprinter.print_rows(
        columns=["QNAME", "HOSTGROUP", "HOSTNAME"],
        rows=rows,
        stream=sys.stdout,
        output_format="table",
    )
def validate_func(config: Dict) -> None:
    ge_env = environment.from_qconf(config)
    dcalc = autoscaler.new_demand_calculator(config, ge_env=ge_env)
    queue: GridEngineQueue
    success = True

    success = (
        validate.validate_hg_intersections(ge_env, dcalc.node_mgr, warn) and success
    )
    success = validate.validate_nodes(config, dcalc, warn) and success

    for qname, queue in ge_env.queues.items():
        success = (
            validate.validate_queue_has_hosts(queue, ge_env.qbin, warn) and success
        )
        success = validate.validate_ht_hostgroup(queue, ge_env, warn) and success
        success = validate.validate_pe_hostgroups(queue, warn) and success

    success = validate.validate_default_hostgroups(config, ge_env, warn) and success
    success = validate.validate_scheduler_has_no_slots(config, ge_env, warn) and success

    if not success:
        sys.exit(1)
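
def _example_validate(config: Dict) -> None:
    # Illustrative sketch, not part of the original module: validate_func
    # exits the process with status 1 on failure, so wrap it in a SystemExit
    # handler if the caller needs to continue afterwards.
    try:
        validate_func(config)
        print("validation passed")
    except SystemExit:
        print("validation failed")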
def new_demand_calculator(
    config: Dict,
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    singleton_lock: Optional[SingletonLock] = None,
) -> DemandCalculator:
    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    if ge_driver is None:
        ge_driver = new_driver(config, ge_env)

    if node_history is None:
        db_path = config.get("nodehistorydb")
        if not db_path:
            db_dir = "/opt/cycle/jetpack/system/bootstrap/gridengine"
            if not os.path.exists(db_dir):
                db_dir = os.getcwd()
            db_path = os.path.join(db_dir, "nodehistory.db")

        read_only = config.get("read_only", False)
        node_history = SQLiteNodeHistory(db_path, read_only)

        node_history.create_timeout = config.get("boot_timeout", 3600)
        node_history.last_match_timeout = config.get("idle_timeout", 300)

    demand_calculator = dcalclib.new_demand_calculator(
        config,
        existing_nodes=ge_env.nodes,
        node_history=node_history,
        node_queue=ge_driver.new_node_queue(),
        singleton_lock=singleton_lock,  # it will handle the None case
    )

    for name, default_complex in ge_env.complexes.items():
        if name == "slots":
            continue

        if default_complex.default is None:
            continue

        if not default_complex.requestable:
            continue

        logging.trace("Adding default resource %s=%s", name, default_complex.default)
        demand_calculator.node_mgr.add_default_resource(
            {}, name, default_complex.default
        )

    ccnode_id_added = False
    slots_added: Set[str] = set()

    for bucket in demand_calculator.node_mgr.get_buckets():
        if "slots" not in bucket.resources and bucket.nodearray not in slots_added:
            default = (
                '"default_resources": [{"select": {"node.nodearray": "%s"}, '
                '"name": "slots", "value": "node.vcpu_count"}]' % bucket.nodearray
            )
            demand_calculator.node_mgr.add_default_resource(
                selection={"node.nodearray": bucket.nodearray},
                resource_name="slots",
                default_value="node.vcpu_count",
            )

            logging.warning(
                """slots is not defined for bucket {}. Using the default, which you can add to your config: {}""".format(
                    bucket, default
                )
            )
            slots_added.add(bucket.nodearray)

        # ccnodeid will almost certainly not be defined. It just needs
        # to be defined once, so we will add a default for all nodes
        # the first time we see it is missing.
        if "ccnodeid" not in bucket.resources and not ccnode_id_added:
            demand_calculator.node_mgr.add_default_resource(
                selection={},  # applies to all nodes
                resource_name="ccnodeid",
                default_value=lambda n: n.delayed_node_id.node_id,
            )
            ccnode_id_added = True

    return demand_calculator
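
def _example_default_resources_config() -> Dict:
    # Illustrative sketch, not part of the original module: the
    # "default_resources" entry that the slots warning above suggests adding
    # to the config. The nodearray name "execute" is made up.
    return {
        "default_resources": [
            {
                "select": {"node.nodearray": "execute"},
                "name": "slots",
                "value": "node.vcpu_count",
            }
        ]
    }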
def autoscale_grid_engine(
    config: Dict[str, Any],
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    global _exit_code

    assert not config.get("read_only", False)
    if dry_run:
        logging.warning("Running gridengine autoscaler in dry run mode")
        # allow multiple instances
        config["lock_file"] = None
        # put in read only mode
        config["read_only"] = True

    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    # interface to GE, generally by cli
    if ge_driver is None:
        # allow tests to pass in a mock
        ge_driver = new_driver(config, ge_env)

    ge_driver.initialize_environment()

    config = ge_driver.preprocess_config(config)

    logging.fine("Driver = %s", ge_driver)

    invalid_nodes = []

    # we need an instance without any scheduler nodes, so don't
    # pass in the existing nodes.
    tmp_node_mgr = new_node_manager(config)

    by_hostname = partition_single(
        tmp_node_mgr.get_nodes(), lambda n: n.hostname_or_uuid
    )

    for node in ge_env.nodes:
        # many combinations of a/u and other states are possible; however,
        # as long as both a and u are present, the node is down
        state = node.metadata.get("state", "")
        cc_node = by_hostname.get(node.hostname)
        ccnodeid = node.resources.get("ccnodeid")
        if cc_node:
            if not ccnodeid or ccnodeid == cc_node.delayed_node_id.node_id:
                if cc_node.state in ["Preparing", "Acquiring"]:
                    continue
        if "a" in state and "u" in state:
            invalid_nodes.append(node)

    # nodes in error state must also be deleted
    nodes_to_delete = ge_driver.clean_hosts(invalid_nodes)
    for node in nodes_to_delete:
        ge_env.delete_node(node)

    demand_calculator = calculate_demand(
        config, ge_env, ge_driver, ctx_handler, node_history
    )

    ge_driver.handle_failed_nodes(demand_calculator.node_mgr.get_failed_nodes())

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # details here are that we pass in nodes that matter (matched) and the
    # driver figures out which ones are new and need to be added via qconf
    joined = ge_driver.handle_join_cluster(
        [x for x in demand_result.compute_nodes if x.exists]
    )
    ge_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # boot up all nodes. Optionally pass in a filtered list
    if demand_result.new_nodes:
        if not dry_run:
            demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # we also tell the driver about nodes that are unmatched. It filters them
    # out and returns a list of ones we can delete.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    unmatched_for_5_mins = demand_calculator.find_unmatched_for(at_least=idle_timeout)
    timed_out_booting = demand_calculator.find_booting(at_least=boot_timeout)

    # nodes that have keep_alive=true should never be deleted
    timed_out_booting = [n for n in timed_out_booting if not n.keep_alive]

    timed_out_to_deleted = []
    unmatched_nodes_to_delete = []

    if timed_out_booting:
        logging.info(
            "The following nodes have timed out while booting: %s", timed_out_booting
        )
        timed_out_to_deleted = ge_driver.handle_boot_timeout(timed_out_booting) or []

    if unmatched_for_5_mins:
        node_expr = ", ".join([str(x) for x in unmatched_for_5_mins])
        logging.info("Unmatched for at least %s seconds: %s", idle_timeout, node_expr)
        unmatched_nodes_to_delete = (
            ge_driver.handle_draining(unmatched_for_5_mins) or []
        )

    nodes_to_delete = []
    for node in timed_out_to_deleted + unmatched_nodes_to_delete:
        if node.assignments:
            logging.warning("%s has jobs assigned to it, so we will take no action.", node)
            continue
        nodes_to_delete.append(node)

    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)

            if delete_result:
                # in case it has anything to do after a node is deleted
                # (usually just remove it from the cluster)
                ge_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            _exit_code = 1
            logging.warning("Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result
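
def _example_autoscale_config() -> Dict:
    # Illustrative sketch, not part of the original module: only config keys
    # that autoscale_grid_engine() and its helpers actually read are shown;
    # the values are the defaults the code falls back to.
    return {
        "idle_timeout": 300,  # seconds before an unmatched node is drained
        "boot_timeout": 3600,  # seconds before a booting node is timed out
        "read_only": False,  # forced to True in dry-run mode
        "lock_file": None,  # cleared in dry-run mode to allow multiple instances
    }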
def shell(config: Dict) -> None:
    """
    Provides a read-only interactive shell. Type gehelp()
    in the shell for more information.
    """
    ctx = DefaultContextHandler("[interactive-readonly]")

    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)
    demand_calc = autoscaler.new_demand_calculator(config, ge_env, ge_driver, ctx)

    queues = ge_env.queues

    def gehelp() -> None:
        print("config      - dict representing autoscale configuration.")
        print("dbconn      - Read-only SQLite conn to node history")
        print("demand_calc - DemandCalculator")
        print("ge_driver   - GEDriver object.")
        print("jobs        - List[Job] from ge_driver")
        print("node_mgr    - NodeManager")
        print("logging     - HPCLogging module")
        print("queues      - GridEngineQueue objects")

    shell_locals = {
        "config": config,
        "ctx": ctx,
        "ge_driver": ge_driver,
        "demand_calc": demand_calc,
        "node_mgr": demand_calc.node_mgr,
        "jobs": ge_env.jobs,
        "dbconn": demand_calc.node_history.conn,
        "gehelp": gehelp,
        "queues": queues,
        "ge_env": ge_env,
    }
    banner = "\nCycleCloud GE Autoscale Shell"
    interpreter = ReraiseAssertionInterpreter(locals=shell_locals)
    try:
        __import__("readline")
        # some magic - create a completer that is bound to the locals in this
        # interpreter and not the __main__ interpreter.
        interpreter.push("import readline, rlcompleter")
        interpreter.push('readline.parse_and_bind("tab: complete")')
        interpreter.push("_completer = rlcompleter.Completer(locals())")
        interpreter.push("def _complete_helper(text, state):")
        interpreter.push("    ret = _completer.complete(text, state)")
        # guard against ret being None before indexing into it
        interpreter.push('    ret = ret + ")" if ret and ret[-1] == "(" else ret')
        interpreter.push("    return ret")
        interpreter.push("")
        interpreter.push("readline.set_completer(_complete_helper)")

        for item in interpreter.history_lines:
            try:
                if '"""' in item:
                    interpreter.push(
                        "readline.add_history('''%s''')" % item.rstrip("\n")
                    )
                else:
                    interpreter.push(
                        'readline.add_history("""%s""")' % item.rstrip("\n")
                    )
            except Exception:
                pass

        interpreter.push("from hpc.autoscale.job.job import Job\n")
        interpreter.push("from hpc.autoscale import hpclogging as logging\n")

    except ImportError:
        banner += (
            "\nWARNING: `readline` is not installed, so autocomplete will not work."
        )

    interpreter.interact(banner=banner)
def create_support_archive(config: Dict, archive: str) -> None:
    """
    Creates an archive with most logs and configurations required
    when requesting support.
    """
    ge_env = environment.from_qconf(config)

    # for some reason mypy doesn't see gzopen
    tf = tarfile.TarFile.gzopen(archive, "w")  # type: ignore

    def _add(cmd: List[str], name: str) -> None:
        contents = ge_env.qbin.qconf(cmd)
        _add_contents(contents, name)

    def _add_contents(contents: str, name: str) -> None:
        tarinfo = tarfile.TarInfo("gridengine-support/" + name)
        tarinfo.size = len(contents)
        tarinfo.mtime = int(time.time())
        fr = io.BytesIO(contents.encode())
        tf.addfile(tarinfo, fr)

    # get our queue definitions
    for qname in ge_env.queues:
        _add(["-sq", qname], "queue_{}".format(qname))

    # get our parallel env definitions
    for pe_name in ge_env.pes:
        _add(["-sp", pe_name], "pe_{}".format(pe_name))

    # get a list of hostgroups. Actual definition of hgs is immaterial
    _add(["-shgrpl"], "hostgroups")

    # dump out the complexes
    _add(["-sc"], "complexes")

    config_no_creds = dict(config)
    config_no_creds["password"] = ""
    config_no_creds["cluster_name"] = ""
    config_no_creds["username"] = ""
    config_no_creds["url"] = ""
    _add_contents(json.dumps(config_no_creds, indent=2), "autoscale.json")

    install_logs = os.path.join(
        os.getenv("SGE_ROOT", ""), os.getenv("SGE_CELL", ""), "common/install_logs"
    )
    if os.path.exists(install_logs):
        for fil in os.listdir(install_logs):
            path = os.path.join(install_logs, fil)
            with open(path) as fr:
                _add_contents(fr.read(), fil)

    # e.g. /sched/sge/sge-2011.11/default/spool/qmaster/messages
    spool_dir = os.path.join(
        os.getenv("SGE_ROOT", ""), os.getenv("SGE_CELL", ""), "spool"
    )
    if os.path.exists(spool_dir):
        for hostname in os.listdir(spool_dir):
            messages_path = os.path.join(spool_dir, hostname, "messages")
            if os.path.exists(messages_path):
                with open(messages_path) as fr:
                    _add_contents(fr.read(), "messages_{}".format(hostname))

    # may not exist on self-installs
    chef_client_log = "/opt/cycle/jetpack/logs/chef-client.log"
    if os.path.exists(chef_client_log):
        with open(chef_client_log) as fr:
            _add_contents(fr.read(), "chef-client.log")

    # find autoscale.log and autoscale.log.1-5
    for handler in logging.getLogger().handlers:
        if hasattr(handler, "baseFilename"):
            base_filename = getattr(handler, "baseFilename")
            file_names = [base_filename] + [
                base_filename + ".{}".format(n) for n in range(1, 6)
            ]
            for fname in file_names:
                if os.path.exists(fname):
                    with open(fname) as fr:
                        _add_contents(fr.read(), os.path.basename(fname))

    tf.close()
    print("Wrote archive to", archive)
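
def _example_support_archive(config: Dict) -> None:
    # Illustrative sketch, not part of the original module: write the support
    # tarball to the current directory; the file name is arbitrary.
    create_support_archive(config, archive="gridengine-support.tar.gz")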
def jobs(config: Dict) -> None:
    """Writes out Job objects as json"""
    ge_env = environment.from_qconf(config)
    util.json_dump(ge_env.jobs)
def jobs_and_nodes(config: Dict) -> None:
    """Writes out SchedulerNode and Job objects as json - simultaneously to avoid race"""
    ge_env = environment.from_qconf(config)
    to_dump = {"jobs": ge_env.jobs, "nodes": ge_env.nodes}
    util.json_dump(to_dump)
def scheduler_nodes(config: Dict) -> None:
    """Writes out SchedulerNode objects as json"""
    ge_env = environment.from_qconf(config)
    util.json_dump(ge_env.nodes)
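
def _example_dump_state(config: Dict) -> None:
    # Illustrative sketch, not part of the original module: jobs() and
    # scheduler_nodes() each run their own qconf query, so use
    # jobs_and_nodes() when a consistent snapshot of both is needed.
    jobs_and_nodes(config)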