def _bash_login_cmd(cmd: List[str]) -> List[str]:
    """Wrap a command so that it is run by a bash login shell.

    The command words are joined into a single shell-quoted string and
    handed to ``bash -l -c``. This allows users to set env vars.

    Example:
    >>> HostSelector._bash_login_cmd(["echo", "-n", "Multiple words"])
    ['bash', '-l', '-c', "echo -n 'Multiple words'"]

    """
    # bash -c takes the whole command as one argument, so quote and join
    # the individual words first.
    quoted_cmd = RosePopener.shlex_join(cmd)
    return ['bash', '-l', '-c', quoted_cmd]
def write_source_vc_info(run_source_dir, output=None, popen=None):
    """Write version control information of sources used in run time.

    run_source_dir -- The source directory we are interested in.
    output -- An open file handle or a string containing a writable path.
              If not specified, use sys.stdout.
    popen -- A metomi.rose.popen.RosePopener instance for running vc commands.
             If not specified, use a new local instance.

    If "output" is a path, the file is opened here in binary write mode and
    closed again before returning (also on error). A handle supplied by the
    caller, or sys.stdout, is never closed by this function.

    For each of "svn" and "git" found by popen.which, the info/status/diff
    style commands are run with run_source_dir as the working directory.
    Any standard output is written to the handle under a banner showing the
    command. The remaining commands of a VCS are skipped as soon as one of
    them returns a non-zero code.
    """
    if popen is None:
        popen = RosePopener()

    # Track whether the handle was opened here so that only our own file is
    # closed on exit.
    opened_here = False
    if output is None:
        handle = sys.stdout
    elif hasattr(output, "write"):
        handle = output
    else:
        handle = open(output, "wb")
        opened_here = True

    try:
        msg = "%s\n" % run_source_dir
        write_safely(msg, handle)
        # Run the VCS commands with LANG=C in a copy of the environment
        # (presumably to get untranslated output - confirm if this matters).
        environ = dict(os.environ)
        environ["LANG"] = "C"
        for vcs, args_list in [
            (
                "svn",
                [
                    ["info", "--non-interactive"],
                    ["status", "--non-interactive"],
                    ["diff", "--internal-diff", "--non-interactive"],
                ],
            ),
            ("git", [["describe"], ["status"], ["diff"]]),
        ]:
            if not popen.which(vcs):
                continue
            # The commands run in the current working directory, so step
            # into the source directory and always step back afterwards.
            cwd = os.getcwd()
            os.chdir(run_source_dir)
            try:
                for args in args_list:
                    cmd = [vcs, *args]
                    ret_code, out, _ = popen.run(*cmd, env=environ)
                    if out:
                        write_safely(("#" * 80 + "\n"), handle)
                        write_safely(
                            ("# %s\n" % popen.shlex_join(cmd)), handle)
                        write_safely(("#" * 80 + "\n"), handle)
                        write_safely(out, handle)
                    if ret_code:
                        # If cmd fails once, it will likely fail again
                        break
            finally:
                os.chdir(cwd)
    finally:
        if opened_here:
            handle.close()
def select(
    self,
    names=None,
    rank_method=None,
    thresholds=None,
    ssh_cmd_timeout=None,
):
    """Return a list. Element 0 is most desirable.

    Each element of the list is a tuple (host, score).

    names: a list of known host groups or host names.
    rank_method: the ranking method. Can be one of:
            load:1, load:5, load:15 (=load =default), fs:FS and random.
            The "load" methods determines the load using the average load
            as returned by the "uptime" command divided by the number of
            CPUs. The "fs" method determines the load using the usage in
            the file system specified by FS. The "mem" method ranks by
            highest free memory. The "random" method ranks everything by
            random.
    thresholds: a list of thresholds which each host must not exceed.
            Should be in the format rank_method:value, where rank_method
            is one of load:1, load:5, load:15 or fs:FS; and value is a
            number that must not be exceeded.
    ssh_cmd_timeout: timeout of SSH commands to hosts. A float in seconds.
            If None, the "timeout" setting of the "rose-host-select"
            configuration section is used, falling back to
            self.SSH_CMD_TIMEOUT.

    With the random rank method and no thresholds, the hosts are shuffled
    and a single-element list [(host, 1)] is returned for the first usable
    host (the local host is reported as "localhost").

    Otherwise the returned list is sorted by score, in reverse order if
    the SIGN of the ranking scorer is negative.

    Raise ValueError if the value part of a threshold cannot be converted
    to a float.
    Raise NoHostSelectError if no host could be selected.

    """

    host_names, rank_method, thresholds = self.expand(
        names, rank_method, thresholds)

    # Load scorers, ranking and thresholds
    rank_method_arg = None
    if rank_method:
        # "METHOD:ARG" -> method name + its argument
        if ":" in rank_method:
            rank_method, rank_method_arg = rank_method.split(":", 1)
    else:
        rank_method = self.RANK_METHOD_DEFAULT
    rank_conf = ScorerConf(self.get_scorer(rank_method), rank_method_arg)
    self.handle_event(RankMethodEvent(rank_method, rank_method_arg))

    threshold_confs = []
    if thresholds:
        for threshold in thresholds:
            # Accepted forms: "VALUE", "METHOD:VALUE", "METHOD:ARG:VALUE".
            # The value is always the text after the last ":".
            method = self.RANK_METHOD_DEFAULT
            method_arg = None
            value = threshold
            if ":" in threshold:
                head, value = threshold.rsplit(":", 1)
                method = head
                if ":" in head:
                    method, method_arg = head.split(":", 1)
            # Validation only: the value is passed on as given, not as
            # the converted float.
            try:
                float(value)
            except ValueError:
                raise ValueError(threshold)
            scorer = self.get_scorer(method)
            # No explicit argument: use the default one of the scorer.
            if method_arg is None:
                method_arg = scorer.ARG
            threshold_conf = ScorerConf(self.get_scorer(method),
                                        method_arg, value)
            threshold_confs.append(threshold_conf)

    if ssh_cmd_timeout is None:
        conf = ResourceLocator.default().get_conf()
        ssh_cmd_timeout = float(
            conf.get_value(["rose-host-select", "timeout"],
                           self.SSH_CMD_TIMEOUT))

    # Replace every name that refers to the local host by
    # self.get_local_host(), and add that at most once. Other host names
    # are kept as they are (no de-duplication).
    host_name_list = list(host_names)
    host_names = []
    for host_name in host_name_list:
        if self.is_local_host(host_name):
            if self.get_local_host() not in host_names:
                host_names.append(self.get_local_host())
        else:
            host_names.append(host_name)

    # Random selection with no thresholds. Return the 1st available host.
    if rank_conf.method == self.RANK_METHOD_RANDOM and not threshold_confs:
        shuffle(host_names)
        for host_name in host_names:
            if self.is_local_host(host_name):
                return [("localhost", 1)]
            # Check that the host is usable by running "true" on it.
            # os.setpgrp puts the process in its own process group, so
            # that os.killpg below can be used with proc.pid.
            command = self.popen.get_cmd("ssh", host_name, "true")
            proc = self.popen.run_bg(*command, preexec_fn=os.setpgrp)
            time0 = time()
            while (proc.poll() is None and
                    time() - time0 <= ssh_cmd_timeout):
                sleep(self.SSH_CMD_POLL_DELAY)
            if proc.poll() is None:
                # Still running after the timeout: kill it and move on.
                os.killpg(proc.pid, signal.SIGTERM)
                proc.wait()
                self.handle_event(TimedOutHostEvent(host_name))
            elif proc.wait():
                # Non-zero return code: report and try the next host.
                self.handle_event(
                    HostSelectCommandFailedEvent(host_name,
                                                 proc.returncode))
            else:
                return [(host_name, 1)]
        else:
            # Loop ended without returning: no host was usable.
            raise NoHostSelectError()

    # ssh to each host to return its score(s).
    host_proc_dict = {}
    for host_name in sorted(host_names):
        # build host-select-client command
        command: List[str] = []

        # pass through CYLC_VERSION to support use of cylc wrapper script
        try:
            import cylc.flow
        except ModuleNotFoundError:
            pass
        else:
            command.extend([
                'env',
                f'CYLC_VERSION={cylc.flow.__version__}',
            ])
            cylc_env_name = os.getenv('CYLC_ENV_NAME')
            if cylc_env_name:
                command.append(f'CYLC_ENV_NAME={cylc_env_name}')

        command.extend(self._bash_login_cmd(['rose', 'host-select-client']))

        # build list of metrics to obtain for each host
        # (those of the rank method, plus any extra ones needed by the
        # thresholds, without duplicates)
        metrics = rank_conf.get_command()
        for threshold_conf in threshold_confs:
            for metric in threshold_conf.get_command():
                if metric not in metrics:
                    metrics.append(metric)

        # convert metrics list to JSON stdin
        # NOTE(review): the start/end marker lines are presumably what the
        # client uses to locate the JSON in its stdin - confirm against
        # the host-select-client implementation.
        stdin = '\n***start**\n' + json.dumps(metrics) + '\n**end**\n'

        if not self.is_local_host(host_name):
            # Remote host: pass the whole command to ssh as a single
            # shell-quoted string.
            command = [
                *self.popen.get_cmd('ssh', host_name),
                RosePopener.shlex_join(command)
            ]

        # fire off host-select-client processes
        # NOTE(review): stdin is both passed to run_bg and written to
        # proc.stdin explicitly - confirm how run_bg treats its stdin
        # argument.
        proc = self.popen.run_bg(*command, stdin=stdin,
                                 preexec_fn=os.setpgrp)
        proc.stdin.write(stdin)
        proc.stdin.flush()
        host_proc_dict[host_name] = (proc, metrics)

    # Retrieve score for each host name
    host_score_list = []
    time0 = time()
    while host_proc_dict:
        sleep(self.SSH_CMD_POLL_DELAY)
        # Iterate over a copy, finished hosts are popped in the loop.
        for host_name, (proc, metrics) in list(host_proc_dict.items()):
            if proc.poll() is None:
                # still running
                continue
            stdout, stderr = proc.communicate()
            if proc.returncode:
                self.handle_event(
                    HostSelectCommandFailedEvent(host_name,
                                                 proc.returncode,
                                                 stderr))
                host_proc_dict.pop(host_name)
            else:
                out = _deserialise(metrics, json.loads(stdout.strip()))

                host_proc_dict.pop(host_name)
                for threshold_conf in threshold_confs:
                    # A score that cannot be parsed counts as a threshold
                    # that is not met.
                    try:
                        score = threshold_conf.command_out_parser(
                            out, metrics)
                        is_bad = threshold_conf.check_threshold(score)
                    except ValueError:
                        is_bad = True
                        score = None
                    if is_bad:
                        self.handle_event(
                            HostThresholdNotMetEvent(
                                host_name, threshold_conf, score))
                        break
                else:
                    # All thresholds met (or none given): rank the host.
                    # If the score cannot be parsed, the host is left out
                    # of the result but the event is still reported, with
                    # a score of None.
                    try:
                        score = rank_conf.command_out_parser(out, metrics)
                        host_score_list.append((host_name, score))
                    except ValueError:
                        score = None
                    self.handle_event(
                        HostSelectScoreEvent(host_name, score))
        # The timeout is checked once per polling pass.
        if time() - time0 > ssh_cmd_timeout:
            break

    # Report timed out hosts
    # (whatever is still in host_proc_dict did not finish in time)
    for host_name, (proc, _) in sorted(host_proc_dict.items()):
        self.handle_event(TimedOutHostEvent(host_name))
        os.killpg(proc.pid, signal.SIGTERM)
        proc.wait()

    if not host_score_list:
        raise NoHostSelectError()
    host_score_list.sort(key=lambda a: a[1],
                         reverse=rank_conf.scorer.SIGN < 0)
    return host_score_list