def get(self, remotepaths, localpath=''):
    """
    Copies one or more files from the remote host to the local host.

    remotepaths - remote path(s), normalised with self._make_list(); entries
                  containing glob magic characters are expanded with
                  self.glob()
    localpath - local destination (defaults to the current working directory)

    Raises exception.BaseException if any resolved remote path does not
    exist and exception.SCPException if the transfer itself fails.
    """
    remotepaths = self._make_list(remotepaths)
    localpath = localpath or os.getcwd()
    # Literal paths are listed first, followed by the expansion of every
    # glob pattern in the order the patterns were given.
    resolved = [rpath for rpath in remotepaths if not glob.has_magic(rpath)]
    patterns = [rpath for rpath in remotepaths if glob.has_magic(rpath)]
    for pattern in patterns:
        resolved.extend(self.glob(pattern))
    # Validate every path before touching the network transfer.
    for rpath in resolved:
        if not self.path_exists(rpath):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" % rpath)
    # A single directory among the paths is enough to require recursion.
    recursive = any(self.isdir(rpath) for rpath in resolved)
    try:
        self.scp.get(resolved, local_path=localpath, recursive=recursive)
    except Exception as e:
        log.debug("get failed: remotepaths=%s, localpath=%s",
                  str(resolved), localpath)
        raise exception.SCPException(str(e))
def connect(self, host=None, username=None, password=None, private_key=None,
            private_key_pass=None, port=None, timeout=30, compress=None):
    """
    Open a paramiko transport to host and authenticate it.

    host, username and password fall back to the values stored on this
    object when falsy; port and compress fall back only when they are None,
    so an explicit compress=False or port=0 is honoured. private_key, when
    given, is loaded with self.load_private_key(); otherwise self._pkey is
    used. timeout is applied as the transport's banner timeout.

    Raises exception.SSHConnectionError if the socket/transport cannot be
    created, exception.SSHAuthException if authentication is rejected and
    exception.SSHError for any other paramiko SSH failure.
    """
    host = host or self._host
    username = username or self._username
    password = password or self._password
    # Compare against None: "compress or default" silently ignored an
    # explicit compress=False whenever the stored default was True.
    compress = compress if compress is not None else self._compress
    port = port if port is not None else self._port
    pkey = self._pkey
    if private_key:
        pkey = self.load_private_key(private_key, private_key_pass)
    log.debug("connecting to host %s on port %d as user %s" %
              (host, port, username))
    try:
        sock = self._get_socket(host, port)
        transport = paramiko.Transport(sock)
        transport.banner_timeout = timeout
    except socket.error:
        raise exception.SSHConnectionError(host, port)
    # Enable/disable compression
    transport.use_compression(compress)
    # Authenticate the transport.
    try:
        transport.connect(username=username, pkey=pkey, password=password)
    except paramiko.AuthenticationException:
        # The transport is local to this call; nobody else can release it.
        transport.close()
        raise exception.SSHAuthException(username, host)
    except paramiko.SSHException as e:
        transport.close()
        msg = e.args[0] if e.args else str(e)
        raise exception.SSHError(msg)
    # NOTE(review): the authenticated transport is neither stored nor
    # returned in the code visible here -- confirm against the full file.
def execute(self, args):
    """
    Stop every cluster named in args.

    With no arguments, an error (listing the known cluster tags when there
    are any) is reported through self.parser.error().

    For each cluster:
      * its settings are loaded; on failure the error is re-raised unless
        self.opts.force is set, in which case the cluster is reloaded
        without its receipt and without requiring keys
      * a cluster that is not stoppable is rejected with
        exception.BaseException unless it has stoppable nodes and
        self.opts.terminate_unstoppable is set
      * the user is asked to confirm unless self.opts.confirm is set
    """
    if not args:
        cls = [c.cluster_tag for c in
               self.cm.get_clusters(load_plugins=False, load_receipt=False)]
        msg = "please specify a cluster"
        if cls:
            opts = ', '.join(cls)
            msg = " ".join([msg, '(options:', opts, ')'])
        self.parser.error(msg)
    for cluster_name in args:
        try:
            cl = self.cm.get_cluster(cluster_name)
        except exception.ClusterDoesNotExist:
            raise
        except Exception as e:
            log.debug("Failed to load cluster settings!", exc_info=True)
            log.error("Failed to load cluster settings!")
            if self.opts.force:
                log.warn("Ignoring cluster settings due to --force option")
                cl = self.cm.get_cluster(cluster_name, load_receipt=False,
                                         require_keys=False)
            else:
                # -f cannot help with an incompatible cluster, so the hint
                # is only shown for other failures.
                if not isinstance(e, exception.IncompatibleCluster):
                    log.error("Use -f to forcefully stop the cluster")
                raise
        is_stoppable = cl.is_stoppable()
        if not is_stoppable:
            has_stoppable_nodes = cl.has_stoppable_nodes()
            if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                    "nodes. Your options are:\n\n"
                    "1. Use the --terminate-unstoppable option to "
                    "stop all 'stoppable' nodes and terminate all "
                    "'unstoppable' nodes\n\n"
                    "2. Use the 'terminate' command to destroy the "
                    "cluster.\n\nPass --help for more info." %
                    cluster_name)
            if not has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' does not contain any 'stoppable' nodes "
                    "and can only be terminated. Please use the "
                    "'terminate' command instead to destroy the cluster."
                    "\n\nPass --help for more info" % cluster_name)
        if not self.opts.confirm:
            resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
            if resp not in ['y', 'Y', 'yes']:
                log.info("Aborting...")
                continue
        cl.stop_cluster(self.opts.terminate_unstoppable,
                        force=self.opts.force)
        log.warn("All non-spot, EBS-backed nodes are now in a "
                 "'stopped' state")
        log.warn("You can restart this cluster by passing -x "
                 "to the 'start' command")
        log.warn("Use the 'terminate' command to *completely* "
                 "terminate this cluster")
def scp(self):
    """
    Return the SCP client, creating a new one when there is none yet or
    when the existing client's transport is no longer active.
    """
    if self._scp and self._scp.transport.is_active():
        return self._scp
    log.debug("creating scp connection")
    self._scp = scp.SCPClient(self.transport,
                              progress=self._file_transfer_progress,
                              socket_timeout=self._timeout)
    return self._scp
def _load_rsa_key(self, private_key, private_key_pass=None):
    """
    Load the RSA key stored at private_key ('~' is expanded) using
    get_rsa_key().

    Returns the loaded key, or None after logging an error when loading
    raises paramiko.SSHException or exception.SSHError.
    """
    key_path = os.path.expanduser(private_key)
    try:
        loaded = get_rsa_key(key_location=key_path,
                             passphrase=private_key_pass)
    except (paramiko.SSHException, exception.SSHError):
        log.error('invalid rsa key or passphrase specified')
        return None
    log.debug("Using private key %s (RSA)" % private_key)
    return loaded
def switch_user(self, user):
    """
    Reconnect, if necessary, to host as user.

    A new connection is made when the current one is not active, or when a
    user is given that differs from the currently connected user.
    """
    must_connect = not self.is_active()
    if not must_connect and user:
        # only ask for the current user while the connection is alive
        must_connect = self.get_current_user() != user
    if must_connect:
        self.connect(username=user)
        return
    user = user or self._username
    log.debug("already connected as user %s" % user)
def _get_fp(self, cfg_file):
    """
    Open the config file at cfg_file and return the file object.

    Raises exception.ConfigNotFound if the path does not exist and
    exception.ConfigError if it exists but is not a regular file.
    """
    log.debug("Loading file: %s" % cfg_file)
    if not os.path.exists(cfg_file):
        raise exception.ConfigNotFound(
            "config file %s does not exist\n" % cfg_file, cfg_file)
    if not os.path.isfile(cfg_file):
        raise exception.ConfigError(
            'config %s exists but is not a regular file' % cfg_file)
    return open(cfg_file)
def _parse_job(self, job, queue_name=None):
    """
    Convert a job DOM element into a list of job dictionaries.

    The dictionary starts with 'job_state' (the element's "state"
    attribute) and 'queue_name' (as passed in, possibly None) and gains one
    entry per child element, keyed by the child's node name and holding the
    data of its last non-element child node.

    Returns a list holding the dictionary once per task, as counted by
    self._count_tasks(). NOTE: every entry is the *same* dict object, so
    mutating one entry mutates them all.
    """
    element_node = xml.dom.minidom.Node.ELEMENT_NODE
    jdict = dict(job_state=job.getAttribute("state"), queue_name=queue_name)
    for node in job.childNodes:
        if node.nodeType != element_node:
            continue
        for child in node.childNodes:
            # Nested elements have no .data attribute; skip them instead
            # of failing with AttributeError.
            if child.nodeType == element_node:
                continue
            jdict[node.nodeName] = child.data
    num_tasks = self._count_tasks(jdict)
    log.debug("Job contains %d tasks" % num_tasks)
    return [jdict] * num_tasks
def _get_urlfp(self, url):
    """
    Open url and return the file-like response, with its name attribute
    set to url.

    Raises exception.ConfigError if the server answers 404 or if opening
    the url raises IOError.
    """
    log.debug("Loading url: %s" % url)
    try:
        fp = urllib.urlopen(url)
        if fp.getcode() == 404:
            # release the response before reporting the missing url
            fp.close()
            raise exception.ConfigError("url %s does not exist" % url)
        fp.name = url
        return fp
    except IOError as e:
        raise exception.ConfigError(
            "error loading config from url %s\n%s" % (url, e))
def remove_lines_from_file(self, remote_file, regex):
    """
    Removes lines matching regex from remote_file.

    Does nothing when regex is None or the empty string. Otherwise the
    non-matching lines are fetched with self.get_remote_file_lines() and
    written back through self.remote_file().
    """
    if regex in [None, '']:
        log.debug('no regex supplied...returning')
        return
    lines = self.get_remote_file_lines(remote_file, regex, matching=False)
    log.debug("new %s after removing regex (%s) matches:\n%s" %
              (remote_file, regex, ''.join(lines)))
    f = self.remote_file(remote_file)
    try:
        f.writelines(lines)
    finally:
        # always release the remote handle, even if the write fails
        f.close()
def graph(self, yaxis, title):
    """
    Plot yaxis against self.records.dt and save the figure as
    <self.pngpath>/<title>.png at 100 dpi.

    Returns -1 (after logging an error) when self.records is None;
    otherwise returns None.
    """
    if self.records is None:
        log.error("ERROR: File hasn't been read() yet.")
        return -1
    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)
        ax.plot(self.records.dt, yaxis)
        ax.grid(True)
        fig.autofmt_xdate()
        filename = os.path.join(self.pngpath, title + '.png')
        plt.savefig(filename, dpi=100)
        log.debug("saved graph %s." % title)
    finally:
        # close the figure even when plotting or saving fails
        plt.close(fig)
def is_node_working(self, node):
    """
    This function returns true if the node is currently working on a task,
    or false if the node is currently idle.

    A node counts as working when its alias occurs in the 'queue_name' of
    any job in self.jobs.
    """
    nodename = node.alias
    for j in self.jobs:
        # 'queue_name' may be present with a None value, in which case
        # dict.get's default is not used; normalise it to ''.
        qn = j.get('queue_name') or ''
        if nodename in qn:
            log.debug("Node %s is working" % node.alias)
            return True
    log.debug("Node %s is IDLE" % (node.id,))
    return False
def _load_defaults(self, settings, store):
    """
    Fill store with the default value of every setting it does not define.

    The default is the third item of each settings entry. A setting whose
    current value in store is missing or None is replaced by it.
    """
    for name in settings:
        fallback = settings[name][2]
        if store.get(name) is not None:
            continue
        if DEBUG_CONFIG:
            log.debug('%s setting not specified. Defaulting to %s' %
                      (name, fallback))
        store[name] = fallback
def load_private_key(self, private_key, private_key_pass=None):
    """
    Load the private key at private_key as an RSA or DSA key.

    The key name is only a hint for which loader to try first: names
    containing 'rsa' try RSA first, names containing 'dsa' (and not 'rsa')
    try DSA first, anything else tries RSA then DSA. The other loader is
    tried whenever the first one returns None, so a key whose path merely
    contains the other algorithm's name still loads.

    Returns the first key a loader produces, or None if both return None.
    """
    # Use Private Key.
    log.debug('loading private key %s' % private_key)
    loaders = [self._load_rsa_key, self._load_dsa_key]
    if 'rsa' in private_key:
        pass
    elif 'dsa' in private_key:
        loaders.reverse()
    else:
        log.debug(
            "specified key does not end in either rsa or dsa, trying both")
    pkey = None
    for load_key in loaders:
        pkey = load_key(private_key, private_key_pass)
        if pkey is not None:
            break
    return pkey
def execute(self, command, silent=True, only_printable=False,
            ignore_exit_status=False, log_output=True, detach=False,
            source_profile=True, raise_on_failure=True):
    """
    Run command on the remote host and return its output lines.

    NOTE: unless detach=True this call blocks until the process finishes

    kwargs:
    silent - don't print the command's output to the console
    only_printable - filter the command's output to allow only printable
                     characters
    ignore_exit_status - only log (at debug level) a non-zero exit status
    log_output - include the remote output in the log messages
    detach - wrap the command in "nohup ... &", close the channel right
             away and return None (no output, no exit status check)
    source_profile - if True prefix the command with "source /etc/profile"
    raise_on_failure - raise exception.RemoteCommandFailed on a non-zero
                       exit status (otherwise the failure is logged as an
                       error)

    returns List of output lines
    """
    channel = self.transport.open_session()
    # nohup wraps the bare command; the profile prefix goes outermost
    if detach:
        command = "nohup %s &" % command
    if source_profile:
        command = "source /etc/profile && %s" % command
    if detach:
        channel.exec_command(command)
        channel.close()
        self.__last_status = None
        return
    log.debug("executing remote command: %s" % command)
    channel.exec_command(command)
    output = self._get_output(channel, silent=silent,
                              only_printable=only_printable)
    exit_status = channel.recv_exit_status()
    self.__last_status = exit_status
    out_str = '\n'.join(output)
    if exit_status == 0:
        if log_output:
            log.debug("output of '%s':\n%s" % (command, out_str))
        else:
            log.debug("output of '%s' has been hidden" % command)
        return output
    msg = "remote command '%s' failed with status %d"
    msg %= (command, exit_status)
    if log_output:
        msg += ":\n%s" % out_str
    else:
        msg += " (no output log requested)"
    if ignore_exit_status:
        log.debug("(ignored) " + msg)
    elif raise_on_failure:
        raise exception.RemoteCommandFailed(
            msg, command, exit_status, out_str)
    else:
        log.error(msg)
    return output
def get_qatime(self, now):
    """
    Return the start of the job-history window as a "%Y%m%d%H%M" string.

    When no job stats are cached yet the window is self.lookback_window
    hours; otherwise it is self.polling_interval seconds. One extra second
    is subtracted from now in both cases.
    """
    if self.stat.is_jobstats_empty():
        log.info("Loading full job history")
        window_seconds = self.lookback_window * 60 * 60
    else:
        window_seconds = self.polling_interval
    log.debug("getting past %d seconds worth of job history" %
              window_seconds)
    window_start = now - datetime.timedelta(seconds=window_seconds + 1)
    return window_start.strftime("%Y%m%d%H%M")
def put(self, localpaths, remotepath='.'):
    """
    Copies one or more files from the local host to the remote host.

    localpaths - local path(s), normalised with self._make_list()
    remotepath - remote destination (defaults to '.')

    The copy is recursive as soon as one local path is a directory.
    Raises exception.SCPException if the transfer fails.
    """
    localpaths = self._make_list(localpaths)
    recursive = any(os.path.isdir(lpath) for lpath in localpaths)
    try:
        self.scp.put(localpaths, remote_path=remotepath, recursive=recursive)
    except Exception as e:
        log.debug("put failed: localpaths=%s, remotepath=%s",
                  str(localpaths), remotepath)
        raise exception.SCPException(str(e))
def get_private_rsa_fingerprint(key_location=None, key_file_obj=None,
                                passphrase=None, digest='sha1', format='DER',
                                pkcs=8, with_colons=True):
    """
    Returns the fingerprint of a private RSA key.

    The key is loaded with get_rsa_key(), exported in the given format and
    pkcs variant, and hashed with the hashlib function named by digest.
    With the defaults this is the SHA1 hex digest of the DER-encoded
    (pkcs8) key. When with_colons is true a ':' is inserted every 2
    characters of the hex digest.
    """
    rsa_key = get_rsa_key(key_location=key_location,
                          key_file_obj=key_file_obj,
                          passphrase=passphrase, use_pycrypto=True)
    hasher = getattr(hashlib, digest)
    hexdigest = hasher(rsa_key.exportKey(format, pkcs=pkcs)).hexdigest()
    if with_colons:
        fingerprint = insert_char_every_n_chars(hexdigest, ':', 2)
    else:
        fingerprint = hexdigest
    # whichever source was supplied identifies the key in the log
    key_source = key_location or key_file_obj
    log.debug("rsa private key fingerprint (%s): %s" %
              (key_source, fingerprint))
    return fingerprint
def _should_remove(self, node):
    """
    Decide whether node may be removed. Both conditions must hold:

    1. self.stat reports that the node is not working on any job
    2. the node's uptime, taken modulo 60 minutes, has reached
       self.kill_after minutes
    """
    if self.stat.is_node_working(node):
        return False
    mins_up = self._minutes_uptime(node) % 60
    idle_msg = ("Idle node %s (%s) has been up for %d minutes past "
                "the hour" % (node.alias, node.id, mins_up))
    if mins_up >= self.kill_after:
        report, removable = log.info, True
    else:
        report, removable = log.debug, False
    report(idle_msg)
    return removable
def get_public_rsa_fingerprint(key_location=None, key_file_obj=None,
                               passphrase=None, digest='md5', format='DER',
                               with_colons=True):
    """
    Returns the fingerprint of the public portion of an RSA key.

    The key is loaded with get_rsa_key(), its public key is exported in
    the given format and hashed with the hashlib function named by digest.
    With the defaults this is the MD5 hex digest of the DER-encoded public
    key. When with_colons is true a ':' is inserted every 2 characters of
    the hex digest.
    """
    private_key = get_rsa_key(key_location=key_location,
                              key_file_obj=key_file_obj,
                              passphrase=passphrase, use_pycrypto=True)
    public_key = private_key.publickey()
    hasher = getattr(hashlib, digest)
    hexdigest = hasher(public_key.exportKey(format)).hexdigest()
    if with_colons:
        fingerprint = insert_char_every_n_chars(hexdigest, ':', 2)
    else:
        fingerprint = hexdigest
    # whichever source was supplied identifies the key in the log
    key_source = key_location or key_file_obj
    log.debug("rsa public key fingerprint (%s): %s" %
              (key_source, fingerprint))
    return fingerprint
def get_certificate_fingerprint(cert=None, cert_location=None,
                                cert_file_obj=None, passphrase=None,
                                digest='sha1', format=crypto.FILETYPE_ASN1,
                                pkcs=8, with_colons=False):
    """
    Returns the fingerprint of a certificate.

    The certificate is either passed in as cert or loaded with
    get_certificate() from cert_location / cert_file_obj. The fingerprint
    is whatever cert.digest(digest) returns; unless with_colons is true
    (it is False by default) every ':' is stripped from it.

    passphrase, format and pkcs are accepted but not used here.
    """
    if not cert:
        cert = get_certificate(cert_location=cert_location,
                               cert_file_obj=cert_file_obj)
    fingerprint = cert.digest(digest)
    if not with_colons:
        fingerprint = fingerprint.replace(':', '')
    # Only a location or file object can be named in the log; this is None
    # when the certificate object itself was passed in.
    cert_source = cert_location or cert_file_obj
    log.debug("certificate fingerprint (%s): %s" %
              (cert_source, fingerprint))
    return fingerprint
def parse_qacct(self, string, dtnow):
    """
    This method parses qacct -j output and stores one entry per job in
    self.jobstats. Takes the string to parse, and a datetime object of the
    remote host's current time.

    For every record the job number, qsub_time, start_time and end_time
    lines are read (values start at column 13); a start or end time of
    '-/-' is replaced by dtnow. A record is stored at index
    job_id % self.jobstat_cachesize as a dict with the keys 'queued',
    'start' and 'end', and self.max_job_id is set to the id of the job
    stored last.

    Records are delimited by lines containing '=========='. The final
    record is stored even when no such rule follows it.

    Returns self.jobstats.
    """
    job_id = None
    qd = None
    start = None
    end = None
    counter = 0
    lines = string.split('\n')
    # A record is only stored when the next rule is reached, so a synthetic
    # trailing rule flushes the last record. If the output already ends
    # with a rule this is a no-op because qd has been reset.
    lines.append('==========')
    for l in lines:
        l = l.strip()
        if l.startswith('jobnumber'):
            job_id = int(l[13:])
        if l.startswith('qsub_time'):
            qd = self.qacct_to_datetime_tuple(l[13:])
        if l.startswith('start_time'):
            if l.find('-/-') > 0:
                start = dtnow
            else:
                start = self.qacct_to_datetime_tuple(l[13:])
        if l.startswith('end_time'):
            if l.find('-/-') > 0:
                end = dtnow
            else:
                end = self.qacct_to_datetime_tuple(l[13:])
        if '==========' in l:
            # job_id is checked too: None % cachesize would raise
            if qd is not None and job_id is not None:
                self.max_job_id = job_id
                stats = {'queued': qd, 'start': start, 'end': end}
                self.jobstats[job_id % self.jobstat_cachesize] = stats
                counter += 1
            qd = None
            start = None
            end = None
    log.debug("added %d new jobs" % counter)
    log.debug("There are %d items in the jobstats cache" %
              len(self.jobstats))
    return self.jobstats
def _count_tasks(self, jdict):
    """
    This function returns the number of tasks in a task array job. For
    example, 'qsub -t 1-20:1' returns 20.

    jdict['tasks'] is a comma separated list whose items are either a
    single task or a 'start-end[:step]' range (step defaults to 1). A
    missing 'tasks' entry counts as one task.
    """
    tasks = jdict.get('tasks', '').split(',')
    task_range = re.compile(r"(\d+)-?(\d+)?:?(\d+)?")
    num_tasks = 0
    for task in tasks:
        if '-' in task:
            start, end, step = task_range.match(task).groups()
            start = int(start)
            end = int(end)
            step = int(step) if step else 1
            # floor division keeps the count an int on Python 2 and 3
            num_tasks += (end - start) // step + 1
        else:
            num_tasks += 1
    log.debug("task array job has %s tasks (tasks: %s)" %
              (num_tasks, tasks))
    return num_tasks
def get_stats(self):
    """
    Return the result of self._get_stats(), retrying on failure.

    Up to 5 attempts are made. After each failed attempt except the last,
    a warning is logged and the call sleeps for self.polling_interval
    seconds before trying again.

    Raises exception.BaseException when every attempt fails.
    """
    log.debug("starting get_stats")
    retries = 5
    for i in range(retries):
        try:
            return self._get_stats()
        except Exception:
            log.warn("Failed to retrieve stats (%d/%d):" %
                     (i + 1, retries), exc_info=True)
            # no point announcing a retry or sleeping after the last try
            if i + 1 < retries:
                log.warn("Retrying in %ds" % self.polling_interval)
                time.sleep(self.polling_interval)
    raise exception.BaseException(
        "Failed to retrieve SGE stats after trying %d times, exiting..." %
        retries)
def load(self):
    """
    Populate this config object from the TethysCluster config.

    Loads, in order: the [global] section, the [aws info] and [azure info]
    sections (a missing section is tolerated; an info message is logged
    when both are missing), settings from the environment for both
    providers, then the key, volume/vol, plugin, permission and cluster
    sections. Returns self.
    """
    log.debug('Loading config')
    try:
        self.globals = self._load_section('global', self.global_settings)
    except exception.ConfigSectionMissing:
        pass
    missing_providers = 0
    try:
        self.aws = self._load_section('aws info', self.aws_settings)
    except exception.ConfigSectionMissing:
        missing_providers += 1
    try:
        self.azure = self._load_section('azure info', self.azure_settings)
    except exception.ConfigSectionMissing:
        missing_providers += 1
    if missing_providers == 2:
        log.info(
            "No [aws info] nor [azure info] section found in the config!")
    # environment settings are merged on top of whatever was loaded
    self.aws.update(self.get_settings_from_env(self.aws_settings))
    self.azure.update(self.get_settings_from_env(self.azure_settings))
    self.keys = self._load_sections('key', self.key_settings)
    # volumes may be declared as either [volume ...] or [vol ...]
    self.vols = self._load_sections('volume', self.volume_settings)
    self.vols.update(self._load_sections('vol', self.volume_settings))
    self.plugins = self._load_sections('plugin', self.plugin_settings,
                                       filter_settings=False)
    self._load_plugins()
    self.permissions = self._load_sections('permission',
                                           self.permission_settings)
    cluster_sections = self._get_sections('cluster')
    self.clusters = self._load_cluster_sections(cluster_sections)
    return self
def _get_stats(self):
    """
    Run qhost, qstat and qacct on the master node and feed their output to
    self.stat.

    A failing qacct is tolerated (treated as empty output) unless the
    accounting file /opt/sge6/default/common/accounting exists on the
    master, in which case the failure is re-raised.

    Returns self.stat.
    """
    master = self._cluster.master_node
    now = self.get_remote_time()
    qacct_cmd = 'qacct -j -b ' + self.get_qatime(now)
    qstat_cmd = r'qstat -u \* -xml -f -r'
    qhostxml = '\n'.join(master.ssh.execute('qhost -xml'))
    qstatxml = '\n'.join(master.ssh.execute(qstat_cmd))
    try:
        qacct = '\n'.join(master.ssh.execute(qacct_cmd))
    except exception.RemoteCommandFailed:
        if master.ssh.isfile('/opt/sge6/default/common/accounting'):
            raise
        log.info("No jobs have completed yet!")
        qacct = ''
    self.stat.parse_qhost(qhostxml)
    self.stat.parse_qstat(qstatxml)
    self.stat.parse_qacct(qacct, now)
    sizes = (len(qhostxml), len(qstatxml), len(qacct))
    log.debug("sizes: qhost: %d, qstat: %d, qacct: %d" % sizes)
    return self.stat
def _load_extends_settings(self, section_name, store):
    """
    Merge into a section the settings of every section it extends.

    Starting at section_name, the chain of 'extends' settings is followed
    until a section without one is reached. The sections are then merged
    from the most distant ancestor down to section_name itself, so nearer
    sections override the ones they extend. The merged AttributeDict
    replaces store[section_name]. Nothing happens when the section has no
    'extends' setting.

    Raises exception.ConfigError when the chain contains a cycle or names
    a section that is not in store.
    """
    section = store[section_name]
    parent_name = section.get('extends')
    if parent_name is None:
        return
    if DEBUG_CONFIG:
        log.debug('%s extends %s' % (section_name, parent_name))
    # ordered from the most distant ancestor to section_name itself
    chain = [section]
    while parent_name is not None:
        try:
            parent = store[parent_name]
            if parent in chain:
                names = ', '.join([
                    self._get_section_name(link['__name__'])
                    for link in chain
                ])
                raise exception.ConfigError(
                    "Cyclical dependency between sections %s. "
                    "Check your EXTENDS settings." % names)
            chain.insert(0, parent)
        except KeyError:
            raise exception.ConfigError(
                "%s can't extend non-existent section %s" %
                (section_name, parent_name))
        parent_name = parent.get('extends')
    merged = AttributeDict()
    for link in chain:
        merged.update(link)
    store[section_name] = merged
def _add_user_to_node(self, uid, gid, node):
    """
    Make sure the cluster user (self._user) exists on node with uid/gid.

    * no user owns uid: the cluster user is created
    * the cluster user already owns uid: nothing is done
    * another user owns uid: that user is removed and the cluster user is
      created in its place
    """
    existing_user = node.getpwuid(uid)
    if not existing_user:
        log.debug("user %s does not exist, creating..." % self._user)
        node.add_user(self._user, uid, gid, self._user_shell)
        return
    username = existing_user.pw_name
    if username == self._user:
        # only report "no action" when nothing was actually changed
        log.debug("user %s exists on node %s, no action" %
                  (self._user, node.alias))
        return
    msg = ("user %s exists on %s with same uid/gid as "
           "cluster user %s...removing user %s")
    log.debug(msg % (username, node.alias, self._user, username))
    node.remove_user(username)
    node.add_user(self._user, uid, gid, self._user_shell)
exc_info=True) sys.exit(1) except socket.error, e: log.exception("Connection error:") log.error("Check your internet connection?") sys.exit(1) except exception.ThreadPoolException, e: log.error(e.format_excs()) self.bug_found() except exception.ClusterDoesNotExist, e: cm = gopts.CONFIG.get_cluster_manager() cls = '' try: cls = cm.get_clusters(load_plugins=False, load_receipt=False) except: log.debug("Error fetching cluster list", exc_info=True) log.error(e.msg) if cls: taglist = ', '.join([c.cluster_tag for c in cls]) active_clusters = "(active clusters: %s)" % taglist log.error(active_clusters) sys.exit(1) except exception.BaseException, e: log.error(e.msg, extra={'__textwrap__': True}) log.debug(e.msg, exc_info=True) sys.exit(1) except SystemExit: # re-raise SystemExit to avoid the bug-catcher below raise except Exception: log.error("Unhandled exception occured", exc_info=True)
setup_class) if not issubclass(klass, clustersetup.ClusterSetup): raise exception.PluginError( "Plugin %s must be a subclass of " "tethyscluster.clustersetup.ClusterSetup" % setup_class) args, kwargs = utils.get_arg_spec(klass.__init__, debug=DEBUG_CONFIG) config_args = [] missing_args = [] for arg in args: if arg in plugin: config_args.append(plugin.get(arg)) else: missing_args.append(arg) if DEBUG_CONFIG: log.debug("config_args = %s" % config_args) if missing_args: raise exception.PluginError( "Not enough settings provided for plugin %s (missing: %s)" % (plugin_name, ', '.join(missing_args))) config_kwargs = {} for arg in kwargs: if arg in plugin: config_kwargs[arg] = plugin.get(arg) if DEBUG_CONFIG: log.debug("config_kwargs = %s" % config_kwargs) try: plug_obj = klass(*config_args, **config_kwargs) except Exception as exc: log.error("Error occured:", exc_info=True) raise exception.PluginLoadError(