def handle_post_delete(self, nodes: List[Node]) -> List[Node]:
    ret = []
    for node in nodes:
        if not node.hostname:
            continue

        try:
            self.pbscmd.qmgr("list", "node", node.hostname)
        except CalledProcessError as e:
            if "Server has no node list" in str(e):
                ret.append(node)
                continue
            logging.error(
                "Could not list node with hostname %s - %s", node.hostname, e
            )
            continue

        try:
            self.pbscmd.qmgr("delete", "node", node.hostname)
            node.metadata["pbs_state"] = "deleted"
            ret.append(node)
        except CalledProcessError as e:
            logging.error(
                "Could not remove %s from cluster: %s. Will retry next cycle.",
                node,
                e,
            )
    return ret
def call(cmd: List[str]) -> None:
    shlexed = " ".join([shlex.quote(x) for x in cmd])
    logging.trace("Running '%s'", shlexed)
    _QCMD_LOGGER.info(shlexed)
    stderr = ""
    completed_process = None
    try:
        # capture_output was added in 3.7 and we support as far back as 3.6
        if sys.version_info < (3, 7):
            completed_process = subprocess.run(cmd, stderr=subprocess.PIPE)
        else:
            completed_process = subprocess.run(cmd, capture_output=True)

        if completed_process.returncode != 0:
            if completed_process.stderr:
                stderr = completed_process.stderr.decode()
            logging.warning(
                "'%s' failed with exit %d: Stderr '%s'",
                shlexed,
                completed_process.returncode,
                stderr,
            )
    except Exception as e:
        logging.error("'%s' failed: %s.", shlexed, str(e))
        _QCMD_LOGGER.error(">> %s", str(e))
        raise
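# For reference, a minimal standalone sketch of the version-gated capture used
# in call() above. The command below is hypothetical (a Python one-liner that
# writes to stderr and exits non-zero), chosen only so the snippet is runnable.
import subprocess
import sys

cmd = [sys.executable, "-c", "import sys; sys.stderr.write('boom'); sys.exit(2)"]

# capture_output was added to subprocess.run in Python 3.7; older interpreters
# must request stderr capture explicitly via stderr=subprocess.PIPE.
if sys.version_info < (3, 7):
    completed = subprocess.run(cmd, stderr=subprocess.PIPE)
else:
    completed = subprocess.run(cmd, capture_output=True)

print(completed.returncode, completed.stderr.decode())  # 2 boom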
def __init__(
    self,
    sched_dict: Dict[str, str],
    resource_state: ResourceState,
) -> None:
    btype = BooleanType()
    self.do_not_span_psets = btype.parse(
        sched_dict.get("do_not_span_psets", "false")
    )
    self.scheduling = btype.parse(sched_dict["scheduling"])
    self.only_explicit_psets = btype.parse(
        sched_dict.get("only_explicit_psets", "false")
    )
    self.node_group_enable = btype.parse(
        sched_dict.get("node_group_enable", "false")
    )
    self.node_group_key = sched_dict.get("node_group_key")
    self.sched_log = sched_dict["sched_log"]
    self.sched_priv = sched_dict["sched_priv"]
    priv_config_path = os.path.join(self.sched_priv, "sched_config")
    self.resources_for_scheduling = (
        get_pbspro_parser().parse_resources_from_sched_priv(priv_config_path)
    )
    self.state = sched_dict["state"]
    self.hostname = sched_dict["sched_host"].split(".")[0]
    self.resource_state = resource_state

    try:
        self.pbs_version: Tuple = tuple(
            [int(x) for x in sched_dict["pbs_version"].split(".")]
        )
    except ValueError:
        self.pbs_version = tuple(sched_dict["pbs_version"].split("."))

    self.sched_dict = sched_dict

    if not self.only_explicit_psets:
        logging.error(
            "only_explicit_psets must be set to true. You can change this by running:"
            + ' qmgr -c "set sched default only_explicit_psets = true"'
        )
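# The pbs_version fallback above can be illustrated in isolation. parse_version
# is a hypothetical helper, not part of the source: all-numeric versions become
# int tuples (which compare correctly), anything else falls back to raw strings.
from typing import Tuple


def parse_version(version: str) -> Tuple:
    try:
        return tuple(int(x) for x in version.split("."))
    except ValueError:
        return tuple(version.split("."))


print(parse_version("2021.1.2"))       # (2021, 1, 2)
print(parse_version("2021.1.2.beta"))  # ('2021', '1', '2', 'beta')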
def try_parse(k: str, default: float) -> float:
    try:
        return float(config.get(k, default))
    except ValueError:
        logging.error(
            "Could not parse %s as a float",
            config.get(k),
        )
        return default
def check_call(cmd: List[str], *args: Any, **kwargs: Any) -> None:
    shlexed = " ".join([shlex.quote(x) for x in cmd])
    logging.trace("Running '%s'", shlexed)
    _QCMD_LOGGER.info(shlexed)
    try:
        _check_call(cmd, *args, **kwargs)
    except Exception as e:
        logging.error("'%s' failed: %s", shlexed, str(e))
        _QCMD_LOGGER.error(">> %s", str(e))
        raise
def __init__(
    self, attr: str, *values: typing.Union[None, ht.ResourceTypeAtom]
) -> None:
    self.attr = attr

    from hpc.autoscale.node.node import QUERYABLE_PROPERTIES

    if attr not in QUERYABLE_PROPERTIES:
        msg = "Property[name={}] not defined for Node".format(self.attr)
        logging.error(msg)
        raise ValueError("UndefinedNodeProperty: " + msg)

    if len(values) == 1 and isinstance(values[0], list):
        self.values: List[Optional[ht.ResourceTypeAtom]] = values[0]
    else:
        self.values = list(values)
def check_output(cmd: List[str], *args: Any, **kwargs: Any) -> Any:
    if not cmd or not cmd[0]:
        raise RuntimeError(
            "Could not run the following command {}. Please check your PATH".format(
                cmd
            )
        )
    kwargs["stderr"] = kwargs.pop("stderr", STDOUT)
    shlexed = " ".join([shlex.quote(x) for x in cmd])
    logging.trace("Running '%s'", shlexed)
    _QCMD_LOGGER.info(shlexed)
    try:
        return _check_output(cmd, *args, **kwargs).decode()
    except Exception as e:
        logging.error("'%s' failed: %s", shlexed, str(e))
        _QCMD_LOGGER.error(">> %s", str(e))
        raise
def _post(self, function_name: str, function_route: str, data) -> Response:
    headers = {"Content-Type": "application/json"}
    url = function_route.format(self.hostname)
    res = requests.post(
        url, data=data, headers=headers, verify=False, cert=self._pem
    )
    try:
        res.raise_for_status()
        logging.info("{} resp: {}".format(function_name, str(res.content)))
        return res
    except HTTPError:
        logging.error(
            "{}: status_code:{} content:{}".format(
                function_name, res.status_code, res.content
            )
        )
        raise
def handle_draining(self, nodes: List[Node]) -> List[Node]:
    # TODO batch these up, but keep it underneath the
    # max arg limit
    ret = []
    for node in nodes:
        if not node.hostname:
            logging.info("Node %s has no hostname.", node)
            continue

        # TODO implement after we have resources added back in
        # what about deleting partially initialized nodes? I think we
        # just need to skip non-managed nodes
        # if not node.resources.get("ccnodeid"):
        #     continue
        if not node.managed and not node.resources.get("ccnodeid"):
            logging.debug("Ignoring attempt to drain unmanaged %s", node)
            continue

        if "offline" in node.metadata.get("pbs_state", ""):
            if node.assignments:
                logging.info("Node %s has jobs still running on it.", node)
                # node is already 'offline' i.e. draining, but a job is still running
                continue
            else:
                # ok - it is offline _and_ no jobs are running on it.
                ret.append(node)
        else:
            try:
                self.pbscmd.pbsnodes("-o", node.hostname)
                # Due to a delay between when pbsnodes -o exits and when pbsnodes -a
                # actually reports an offline state, we will just optimistically set
                # it to offline; otherwise ~50% of the time you get the old state (free).
                # response = self.pbscmd.pbsnodes_parsed("-a", node.hostname)
                # if response:
                #     node.metadata["pbs_state"] = response[0]["state"]
                node.metadata["pbs_state"] = "offline"
            except CalledProcessError as e:
                if node.private_ip:
                    logging.error(
                        "'pbsnodes -o %s' failed and this node will not be scaled down: %s",
                        node.hostname,
                        e,
                    )
    return ret
def qstat_json(self, *args: str) -> Dict:
    if "-F" not in args:
        args = ("-F", "json") + args
    response = self.qstat(*args)
    # For some reason both json and regular format are printed...
    expr = response
    # fix invalid json output like the following
    #   "pset":"group_id=""",
    expr = expr.replace('"""', '"')

    attempts = 1000
    while "{" in expr and attempts > 0:
        attempts -= 1
        # skip ahead to the next candidate json object
        expr = expr[expr.index("{"):]
        try:
            return json.loads(expr)
        except JSONDecodeError as e:
            logging.error(e)
            # advance past this '{' so the scan makes progress
            expr = expr[1:]

    raise RuntimeError("Could not parse qstat json output: '{}'".format(response))
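# A self-contained sketch of the same JSON-repair scan, runnable without PBS.
# sample_output is hypothetical: a non-JSON banner line followed by JSON
# containing the '"""' artifact that qstat_json repairs.
import json

sample_output = 'qstat: banner line\n{"Jobs": {"1.server": {"pset":"group_id="""}}}'

expr = sample_output.replace('"""', '"')
attempts = 1000
parsed = None
while "{" in expr and attempts > 0:
    attempts -= 1
    expr = expr[expr.index("{"):]  # skip ahead to the next candidate object
    try:
        parsed = json.loads(expr)
        break
    except json.JSONDecodeError:
        expr = expr[1:]  # advance past this '{' and keep scanning

print(parsed)  # {'Jobs': {'1.server': {'pset': 'group_id='}}}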
def _validate_reverse_dns(self, node: Node) -> bool:
    # let's make sure the hostname is valid and reverse
    # dns compatible before adding to GE

    # if there is no private ip, then the hostname was removed, most likely
    # by azure DNS
    if not node.private_ip:
        return True

    try:
        addr_info = socket.gethostbyaddr(node.private_ip)
    except Exception as e:
        logging.error(
            "Could not convert private_ip(%s) to hostname using gethostbyaddr() for %s: %s",
            node.private_ip,
            node,
            str(e),
        )
        return False

    addr_info_ips = addr_info[-1]
    if isinstance(addr_info_ips, str):
        addr_info_ips = [addr_info_ips]

    if node.private_ip not in addr_info_ips:
        logging.warning(
            "%s has a hostname that does not match the"
            + " private_ip (%s) reported by cyclecloud (%s)! Skipping",
            node,
            addr_info_ips,
            node.private_ip,
        )
        return False

    expect_multiple_entries = (
        node.software_configuration.get("cyclecloud", {})
        .get("hosts", {})
        .get("standalone_dns", {})
        .get("enabled", True)
    )

    addr_info_hostname = addr_info[0].split(".")[0]
    if addr_info_hostname.lower() != node.hostname.lower():
        if expect_multiple_entries:
            logging.warning(
                "%s has a hostname that can not be queried via reverse"
                + " dns (private_ip=%s cyclecloud hostname=%s reverse dns hostname=%s)."
                + " This is common and usually repairs itself. Skipping",
                node,
                node.private_ip,
                node.hostname,
                addr_info_hostname,
            )
        else:
            logging.error(
                "%s has a hostname that can not be queried via reverse"
                + " dns (private_ip=%s cyclecloud hostname=%s reverse dns hostname=%s)."
                + " If you have an entry for this address in your /etc/hosts file, please remove it.",
                node,
                node.private_ip,
                node.hostname,
                addr_info_hostname,
            )
        return False
    return True
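# A standalone sketch of the same reverse-lookup checks, using the loopback
# address so it can run anywhere. reverse_dns_matches is a hypothetical helper;
# the printed result depends on the local resolver configuration.
import socket


def reverse_dns_matches(ip: str, expected_hostname: str) -> bool:
    try:
        hostname, _aliases, ips = socket.gethostbyaddr(ip)
    except OSError:
        return False
    # the reported address list must contain the ip we asked about...
    if ip not in ips:
        return False
    # ...and the short hostname must match, case-insensitively
    return hostname.split(".")[0].lower() == expected_hostname.lower()


# on most hosts 127.0.0.1 reverse-resolves to 'localhost'
print(reverse_dns_matches("127.0.0.1", "localhost"))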
def add_nodes_to_cluster(self, nodes: List[Node]) -> List[Node]:
    self.initialize()

    all_nodes = self.pbscmd.pbsnodes_parsed("-a")
    by_ccnodeid = partition(
        all_nodes, lambda x: x.get("resources_available.ccnodeid")
    )

    ret = []
    for node in nodes:
        if not node.hostname:
            continue

        if not node.private_ip:
            continue

        node_id = node.delayed_node_id.node_id
        if not node_id:
            logging.error("%s does not have a nodeid! Skipping", node)
            continue

        if node_id in by_ccnodeid:
            skip_node = False
            for ndict in by_ccnodeid[node_id]:
                if ndict["name"].lower() != node.hostname.lower():
                    logging.error(
                        "Duplicate hostname found for the same node id! %s and %s."
                        + " See 'valid_hostnames' in autoscale as a possible workaround.",
                        node,
                        ndict["name"],
                    )
                    skip_node = True
                    break
            if skip_node:
                continue

        if not is_valid_hostname(self.config, node):
            continue

        if not self._validate_reverse_dns(node):
            logging.fine(
                "%s still has a hostname that can not be looked via reverse dns. This should repair itself.",
                node,
            )
            continue

        if not node.resources.get("ccnodeid"):
            logging.info(
                "%s is not managed by CycleCloud, or at least 'ccnodeid' is not defined. Ignoring",
                node,
            )
            continue
        try:
            try:
                ndicts = self.pbscmd.qmgr_parsed("list", "node", node.hostname)
                if ndicts and ndicts[0].get("resources_available.ccnodeid"):
                    logging.info(
                        "ccnodeid is already defined on %s. Skipping", node
                    )
                    continue
                # TODO RDH should we just delete it instead?
                logging.info(
                    "%s already exists in this cluster. Setting resources.", node
                )
            except CalledProcessError:
                logging.info(
                    "%s does not exist in this cluster yet. Creating.", node
                )
                self.pbscmd.qmgr("create", "node", node.hostname)

            for res_name, res_value in node.resources.items():
                # we set ccnodeid last, so that we can see that we have
                # completely joined a node if and only if ccnodeid has been set
                if res_name == "ccnodeid":
                    continue

                if res_value is None:
                    continue

                # TODO RDH track down
                if res_name == "group_id" and res_value == "None":
                    continue

                # skip things like host which are useful to set default resources
                # on non-existent nodes for autoscale packing, but not on actual nodes
                if res_name in self.read_only_resources:
                    continue

                if res_name not in self.resource_definitions:
                    # TODO bump to a warning?
                    logging.fine(
                        "%s is an unknown PBS resource for node %s. Skipping this resource",
                        res_name,
                        node,
                    )
                    continue

                res_value_str: str
                # pbs size does not support decimals
                if isinstance(res_value, ht.Size):
                    res_value_str = "{}{}".format(
                        int(res_value.value), res_value.magnitude
                    )
                elif isinstance(res_value, bool):
                    res_value_str = "1" if res_value else "0"
                else:
                    res_value_str = str(res_value)

                self.pbscmd.qmgr(
                    "set",
                    "node",
                    node.hostname,
                    "resources_available.{}={}".format(res_name, res_value_str),
                )

            self.pbscmd.qmgr(
                "set",
                "node",
                node.hostname,
                "resources_available.{}={}".format(
                    "ccnodeid", node.resources["ccnodeid"]
                ),
            )
            self.pbscmd.pbsnodes("-r", node.hostname)
            ret.append(node)
        except SubprocessError as e:
            logging.error(
                "Could not fully add %s to cluster: %s. Will attempt next cycle",
                node,
                e,
            )
    return ret
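# The resource stringification rules in the loop above, shown in isolation.
# to_pbs_value is a hypothetical helper, and a (value, magnitude) tuple stands
# in for ht.Size, which is not imported here.
def to_pbs_value(res_value) -> str:
    if isinstance(res_value, tuple):
        # stand-in for ht.Size: pbs size does not support decimals, so the
        # numeric part is truncated and the magnitude suffix is appended
        value, magnitude = res_value
        return "{}{}".format(int(value), magnitude)
    if isinstance(res_value, bool):
        # bool must be handled before the generic fallback: str(True) == "True",
        # whereas PBS expects "1"/"0"
        return "1" if res_value else "0"
    return str(res_value)


print(to_pbs_value((3.5, "gb")))  # 3gb
print(to_pbs_value(True))         # 1
print(to_pbs_value(4))            # 4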
from hpc.autoscale import hpclogging as logging

_QCMD_LOGGER = logging.getLogger("gridengine.driver")
_QCONF_PATH = which("qconf") or ""
_QMOD_PATH = which("qmod") or ""
_QSELECT_PATH = which("qselect") or ""
_QSTAT_PATH = which("qstat") or ""

__VALIDATED = False

if not __VALIDATED:
    for key, value in list(globals().items()):
        if key.startswith("_Q") and key.endswith("_PATH"):
            if not value:
                executable = key.split("_")[1].lower()  # e.g. _QCONF_PATH -> qconf
                logging.error(
                    "Could not find %s in PATH: %s",
                    executable,
                    os.environ.get("PATH"),
                )
__VALIDATED = True
def _parse_complexes(
    autoscale_config: Dict, complex_lines: List[str]
) -> Dict[str, "Complex"]:
    relevant_complexes = None
    if autoscale_config:
        relevant_complexes = autoscale_config.get("gridengine", {}).get(
            "relevant_complexes"
        )
        if relevant_complexes:
            # special handling of ccnodeid, since it is something we
            # create for the user
            relevant_complexes = relevant_complexes + ["ccnodeid"]

    if relevant_complexes:
        logging.info(
            "Restricting complexes for autoscaling to %s", relevant_complexes
        )

    complexes: List[Complex] = []
    headers = complex_lines[0].lower().replace("#", "").split()

    required = set(["name", "type", "consumable"])
    missing = required - set(headers)
    if missing:
        logging.error(
            "Could not parse complex file as it is missing expected columns: %s."
            + " Autoscale likely will not work.",
            list(missing),
        )
        return {}

    for n, line in enumerate(complex_lines[1:]):
        if line.startswith("#"):
            continue
        toks = line.split()
        if len(toks) != len(headers):
            logging.warning(
                "Could not parse complex at line {} - ignoring: '{}'".format(n, line)
            )
            continue
        c = dict(zip(headers, toks))
        try:
            if (
                relevant_complexes
                and c["name"] not in relevant_complexes
                and c.get("shortcut") not in relevant_complexes
            ):
                logging.trace(
                    "Ignoring complex %s because it was not defined in gridengine.relevant_complexes",
                    c["name"],
                )
                continue

            complex = Complex(
                name=c["name"],
                shortcut=c.get("shortcut", c["name"]),
                complex_type=c["type"],
                relop=c.get("relop", "=="),
                requestable=c.get("requestable", "YES").lower() == "yes",
                consumable=c.get("consumable", "YES").lower() == "yes",
                default=c.get("default"),
                urgency=int(c.get("urgency", 0)),
            )

            complexes.append(complex)

        except Exception:
            logging.exception("Could not parse complex %s - %s", line, c)

    # TODO test RDH
    ret = partition_single(complexes, lambda x: x.name)
    shortcut_dict = partition_single(complexes, lambda x: x.shortcut)
    ret.update(shortcut_dict)
    return ret
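# A self-contained sketch of the header/column parsing performed above, using a
# hypothetical 'qconf -sc'-style complexes file; the Complex class is omitted.
complex_lines = [
    "#name       shortcut   type   relop   requestable  consumable  default  urgency",
    "slots       s          INT    <=      YES          YES         1        1000",
    "exclusive   excl       BOOL   EXCL    YES          YES         0        1000",
]

# header row: strip the leading '#' and lowercase the column names
headers = complex_lines[0].lower().replace("#", "").split()

for line in complex_lines[1:]:
    if line.startswith("#"):
        continue
    toks = line.split()
    if len(toks) != len(headers):
        continue  # malformed row
    c = dict(zip(headers, toks))
    print(c["name"], c["type"], c.get("consumable", "YES").lower() == "yes")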
def main(argv: Iterable[str] = None) -> None:
    default_install_dir = os.path.join("/", "opt", "cycle", "gridengine")

    parser = ArgumentParser()
    sub_parsers = parser.add_subparsers()

    def csv_list(x: str) -> List[str]:
        return [tok.strip() for tok in x.split(",")]

    help_msg = io.StringIO()

    def add_parser(
        name: str, func: Callable, read_only: bool = True, skip_config: bool = False
    ) -> ArgumentParser:
        doc_str = (func.__doc__ or "").strip()
        doc_str = " ".join([x.strip() for x in doc_str.splitlines()])
        help_msg.write("\n {:20} - {}".format(name, doc_str))

        default_config: Optional[str]
        default_config = os.path.join(default_install_dir, "autoscale.json")
        if not os.path.exists(default_config):
            default_config = None

        new_parser = sub_parsers.add_parser(name)
        new_parser.set_defaults(func=func, read_only=read_only)

        if skip_config:
            return new_parser

        new_parser.add_argument(
            "--config", "-c", default=default_config, required=not bool(default_config)
        )
        return new_parser

    def str_list(c: str) -> List[str]:
        return c.split(",")

    def add_parser_with_columns(
        name: str, func: Callable, read_only: bool = True
    ) -> ArgumentParser:
        parser = add_parser(name, func, read_only)

        def parse_format(c: str) -> str:
            c = c.lower()
            if c in ["json", "table", "table_headerless"]:
                return c
            print(
                "Expected json, table or table_headerless - got", c, file=sys.stderr
            )
            sys.exit(1)

        parser.add_argument("--output-columns", "-o", type=str_list)
        parser.add_argument("--output-format", "-F", type=parse_format)
        return parser

    add_parser_with_columns("autoscale", autoscale, read_only=False)

    add_parser_with_columns("buckets", buckets).add_argument(
        "--constraint-expr", "-C", default="[]"
    )

    add_parser("complexes", complexes).add_argument(
        "-a", "--include-irrelevant", action="store_true", default=False
    )

    delete_parser = add_parser("delete_nodes", delete_nodes, read_only=False)
    delete_parser.add_argument("-H", "--hostnames", type=str_list, default=[])
    delete_parser.add_argument("-N", "--node-names", type=str_list, default=[])
    delete_parser.add_argument("--force", action="store_true", default=False)

    remove_parser = add_parser("remove_nodes", remove_nodes, read_only=False)
    remove_parser.add_argument("-H", "--hostnames", type=str_list, default=[])
    remove_parser.add_argument("-N", "--node-names", type=str_list, default=[])
    remove_parser.add_argument("--force", action="store_true", default=False)

    add_parser_with_columns("demand", demand).add_argument(
        "--jobs", "-j", default=None, required=False
    )

    add_parser("drain_node", drain_node, read_only=False).add_argument(
        "-H", "--hostname", required=True
    )

    initconfig_parser = add_parser(
        "initconfig", initconfig, read_only=False, skip_config=True
    )

    initconfig_parser.add_argument("--cluster-name", required=True)
    initconfig_parser.add_argument("--username", required=True)
    initconfig_parser.add_argument("--password")
    initconfig_parser.add_argument("--url", required=True)
    initconfig_parser.add_argument(
        "--log-config",
        default=os.path.join(default_install_dir, "logging.conf"),
        dest="logging__config_file",
    )
    initconfig_parser.add_argument(
        "--lock-file", default=os.path.join(default_install_dir, "scalelib.lock")
    )
    initconfig_parser.add_argument(
        "--default-resource",
        type=json.loads,
        action="append",
        default=[],
        dest="default_resources",
    )
    initconfig_parser.add_argument(
        "--default-hostgroups",
        type=json.loads,
        action="append",
        default=[],
        dest="default_hostgroups",
    )
    initconfig_parser.add_argument(
        "--relevant-complexes",
        default=["slots", "slot_type", "exclusive"],
        type=csv_list,
        dest="gridengine__relevant_complexes",
    )
    initconfig_parser.add_argument(
        "--idle-timeout", default=300, type=int, dest="idle_timeout"
    )
    initconfig_parser.add_argument(
        "--boot-timeout", default=1800, type=int, dest="boot_timeout"
    )
    initconfig_parser.add_argument(
        "--disable-pgs-for-pe",
        default=[],
        type=str,
        action="append",
        help="Disable creation of placement groups for a parallel environment. "
        + "This can be invoked more than once.",
        dest="disable_pgs_for_pe",
    )
    initconfig_parser.add_argument(
        "--hostgroup-constraint",
        default=[],
        action="append",
        dest="hostgroup_constraints",
    )

    add_parser("jobs", jobs)
    add_parser("jobs_and_nodes", jobs_and_nodes)

    support_archive_parser = add_parser("support_archive", create_support_archive)
    support_archive_parser.add_argument(
        "--archive", "-a", default="gridengine_support-{}.tar.gz".format(time.time())
    )

    join_cluster_parser = add_parser("join_cluster", join_cluster)
    join_cluster_parser.add_argument("-H", "--hostnames", type=str_list)
    join_cluster_parser.add_argument("-N", "--nodenames", type=str_list)

    add_parser_with_columns("nodes", nodes).add_argument(
        "--constraint-expr", "-C", default="[]"
    )

    add_parser("scheduler_nodes", scheduler_nodes)

    help_msg.write("\nadvanced usage:")
    add_parser("validate", validate_func, read_only=True)
    add_parser("queues", queues, read_only=True)
    add_parser("shell", shell)

    analyze_parser = add_parser("analyze", analyze)
    analyze_parser.add_argument("--job-id", "-j", required=True)
    analyze_parser.add_argument("--wide", "-w", action="store_true", default=False)

    parser.usage = help_msg.getvalue()

    args = parser.parse_args()
    if not hasattr(args, "func"):
        parser.print_help()
        sys.exit(1)

    # parse list of config paths to a single config
    if hasattr(args, "config"):
        try:
            with open(args.config) as fr:
                args.config = json.load(fr)
        except Exception as e:
            logging.error("Could not load config file %s: %s", args.config, e)
            sys.exit(1)

        logging.initialize_logging(args.config)

        if args.read_only:
            args.config["read_only"] = True
            args.config["lock_file"] = None

    kwargs = {}
    for k in dir(args):
        if k[0].islower() and k not in ["read_only", "func"]:
            kwargs[k] = getattr(args, k)

    try:
        args.func(**kwargs)
    except Exception as e:
        print(str(e), file=sys.stderr)
        if hasattr(e, "message"):
            print(getattr(e, "message"), file=sys.stderr)
        logging.debug("Full stacktrace", exc_info=sys.exc_info())
        sys.exit(1)
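# The set_defaults(func=...) dispatch pattern used by main() above, in a
# minimal standalone form with a hypothetical 'greet' subcommand.
import argparse


def greet(name: str) -> None:
    print("hello", name)


parser = argparse.ArgumentParser()
sub = parser.add_subparsers()
greet_parser = sub.add_parser("greet")
greet_parser.add_argument("--name", default="world")
greet_parser.set_defaults(func=greet)

args = parser.parse_args(["greet", "--name", "gridengine"])
# collect everything except the dispatch target itself, then call it
kwargs = {k: v for k, v in vars(args).items() if k != "func"}
args.func(**kwargs)  # prints: hello gridengine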