def __init__(self, num_users=None, usernames=None, download_keys=None,
             download_keys_dir=None):
    """Normalize and validate the user-creation options.

    Accepts either an explicit user count, a comma-separated string of
    usernames, or both (in which case the counts must agree). When no
    usernames are given, sequential names (user001, user002, ...) are
    generated.
    """
    if usernames:
        # comma-separated string -> list of trimmed names
        usernames = [name.strip() for name in usernames.split(',')]
    if not num_users and not usernames:
        raise exception.BaseException(
            "you must provide num_users or usernames or both")
    if num_users:
        try:
            num_users = int(num_users)
        except ValueError:
            raise exception.BaseException("num_users must be an integer")
    else:
        # count was not given - infer it from the provided usernames
        num_users = len(usernames)
    if usernames and num_users and len(usernames) != num_users:
        raise exception.BaseException(
            "only %d usernames provided - %d required" %
            (len(usernames), num_users))
    self._num_users = num_users
    if not usernames:
        # generate zero-padded default names, e.g. user001..userNNN
        usernames = ['user%.3d' % i for i in range(1, num_users + 1)]
    self._usernames = usernames
    # only the exact (case-insensitive) string "true" enables key download
    self._download_keys = str(download_keys).lower() == "true"
    self._download_keys_dir = download_keys_dir or self.DOWNLOAD_KEYS_DIR
    super(CreateUsers, self).__init__()
def execute(self, args):
    """Stop each cluster named in *args*.

    With no args, errors out listing the available cluster tags. For each
    cluster: loads its settings (optionally forcing past load errors with
    --force), refuses clusters that are not stoppable unless the
    appropriate flags are given, asks for confirmation unless --confirm,
    then stops the cluster.
    """
    if not args:
        # no cluster specified - build a helpful error listing known tags
        cls = [
            c.cluster_tag for c in
            self.cm.get_clusters(load_plugins=False, load_receipt=False)
        ]
        msg = "please specify a cluster"
        if cls:
            opts = ', '.join(cls)
            msg = " ".join([msg, '(options:', opts, ')'])
        self.parser.error(msg)
    for cluster_name in args:
        try:
            cl = self.cm.get_cluster(cluster_name)
        except exception.ClusterDoesNotExist:
            # a missing cluster is not recoverable - propagate as-is
            raise
        except Exception, e:
            log.debug("Failed to load cluster settings!", exc_info=True)
            log.error("Failed to load cluster settings!")
            if self.opts.force:
                # --force: retry the load without receipt/keys validation
                log.warn("Ignoring cluster settings due to --force option")
                cl = self.cm.get_cluster(cluster_name, load_receipt=False,
                                         require_keys=False)
            else:
                if not isinstance(e, exception.IncompatibleCluster):
                    log.error("Use -f to forcefully stop the cluster")
                raise
        is_stoppable = cl.is_stoppable()
        if not is_stoppable:
            # cluster has at least some nodes that cannot simply be stopped
            has_stoppable_nodes = cl.has_stoppable_nodes()
            if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                    "nodes. Your options are:\n\n"
                    "1. Use the --terminate-unstoppable option to "
                    "stop all 'stoppable' nodes and terminate all "
                    "'unstoppable' nodes\n\n"
                    "2. Use the 'terminate' command to destroy the "
                    "cluster.\n\nPass --help for more info." % cluster_name)
            if not has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' does not contain any 'stoppable' nodes "
                    "and can only be terminated. Please use the "
                    "'terminate' command instead to destroy the cluster."
                    "\n\nPass --help for more info" % cluster_name)
        if not self.opts.confirm:
            # interactive confirmation; anything but y/Y/yes skips cluster
            resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
            if resp not in ['y', 'Y', 'yes']:
                log.info("Aborting...")
                continue
        cl.stop_cluster(self.opts.terminate_unstoppable,
                        force=self.opts.force)
        log.warn("All non-spot, EBS-backed nodes are now in a "
                 "'stopped' state")
        log.warn("You can restart this cluster by passing -x "
                 "to the 'start' command")
        log.warn("Use the 'terminate' command to *completely* "
                 "terminate this cluster")
def _mkdir(self, directory, makedirs=False): if not os.path.isdir(directory): if os.path.isfile(directory): raise exception.BaseException("'%s' is a file not a directory") try: if makedirs: os.makedirs(directory) log.info("Created directories %s" % directory) else: os.mkdir(directory) log.info("Created single directory %s" % directory) except IOError, e: raise exception.BaseException(str(e))
def get(self, remotepaths, localpath=''):
    """
    Copies one or more files from the remote host to the local host.
    """
    remotepaths = self._make_list(remotepaths)
    localpath = localpath or os.getcwd()
    # separate literal paths from glob patterns, preserving order
    plain = []
    patterns = []
    for rpath in remotepaths:
        if glob.has_magic(rpath):
            patterns.append(rpath)
        else:
            plain.append(rpath)
    # expand each pattern on the remote side and append the matches
    for pattern in patterns:
        plain.extend(self.glob(pattern))
    remotepaths = plain
    # fail fast on the first path that does not exist remotely
    for rpath in remotepaths:
        if not self.path_exists(rpath):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" % rpath)
    # any directory in the list forces a recursive copy
    recursive = any(self.isdir(rpath) for rpath in remotepaths)
    try:
        self.scp.get(remotepaths, local_path=localpath,
                     recursive=recursive)
    except Exception as e:
        log.debug("get failed: remotepaths=%s, localpath=%s",
                  str(remotepaths), localpath)
        raise exception.SCPException(str(e))
def _validate_dir(self, dirname, msg_prefix=""): if not os.path.isdir(dirname): msg = "'%s' is not a directory" if not os.path.exists(dirname): msg = "'%s' does not exist" if msg_prefix: msg = ' '.join([msg_prefix, msg]) msg = msg % dirname raise exception.BaseException(msg)
def execute(self, args):
    """Copy local files/directories to a remote path on a cluster node.

    Expected args: <cluster_tag> <local_path> [<local_path> ...]
    <remote_destination>. Validates that every local path exists and, when
    copying multiple sources, that the remote destination is a directory.
    """
    if len(args) < 3:
        self.parser.error("please specify a cluster, local files or " +
                          "directories, and a remote destination path")
    ctag = args[0]
    rpath = args[-1]
    lpaths = args[1:-1]
    for lpath in lpaths:
        if not os.path.exists(lpath):
            raise exception.BaseException(
                "Local file or directory does not exist: %s" % lpath)
    cl = self.cm.get_cluster(ctag, load_receipt=False)
    node = cl.get_node(self.opts.node)
    if self.opts.user:
        # run the transfer as the requested remote user
        node.ssh.switch_user(self.opts.user)
    if len(lpaths) > 1 and not node.ssh.isdir(rpath):
        # BUG FIX: the condition tests for a directory, but the original
        # message incorrectly claimed the remote path did not exist
        raise exception.BaseException(
            "Remote path is not a directory: %s" % rpath)
    node.ssh.put(lpaths, rpath)
def write_stats_to_csv(self, filename):
    """
    Write important SGE stats to CSV file

    Appends one line to the CSV

    :param filename: path of the CSV file to append to
    :raises exception.BaseException: if the file cannot be opened/written
    """
    bits = self.get_all_stats()
    try:
        # 'with' guarantees the file is closed even if write() raises
        # (the original open/close pair leaked the handle on error)
        with open(filename, 'a') as f:
            f.write(','.join(str(n) for n in bits) + '\n')
    except IOError as e:
        raise exception.BaseException(str(e))
def visualizer(self):
    """Lazily build and return the SGE stats visualizer.

    Imports the visualizer module on first access (it requires matplotlib
    and numpy) and caches an SGEVisualizer instance on self._visualizer.

    :raises exception.BaseException: if the visualizer module cannot be
                                     imported (missing matplotlib/numpy)
    """
    if not self._visualizer:
        try:
            from tethyscluster.balancers.sge import visualizer
        except ImportError as e:
            log.error("Error importing visualizer:")
            log.error(str(e))
            log.error("check that matplotlib and numpy are installed and:")
            log.error("   $ python -c 'import matplotlib'")
            log.error("   $ python -c 'import numpy'")
            log.error("completes without error")
            raise exception.BaseException(
                "Failed to load stats visualizer")
        self._visualizer = visualizer.SGEVisualizer(self.stats_file,
                                                    self.plot_output_dir)
    # BUG FIX: the original never returned the cached visualizer, so this
    # accessor always yielded None (crashing self.visualizer.graph_all())
    return self._visualizer
def execute(self, args):
    """Copy remote files/directories from a cluster node to a local path.

    Expected args: <cluster_tag> <remote_path> [<remote_path> ...]
    <local_destination>. Non-glob remote paths are checked for existence
    before the transfer starts.
    """
    if len(args) < 3:
        self.parser.error("please specify a cluster, remote file or " +
                          "directory, and a local destination path")
    ctag = args[0]
    lpath = args[-1]
    rpaths = args[1:-1]
    cl = self.cm.get_cluster(ctag, load_receipt=False)
    node = cl.get_node(self.opts.node)
    if self.opts.user:
        # perform the transfer as the requested remote user
        node.ssh.switch_user(self.opts.user)
    for rpath in rpaths:
        # glob patterns are expanded later by node.ssh.get; skip the check
        if glob.has_magic(rpath):
            continue
        if not node.ssh.path_exists(rpath):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" % rpath)
    node.ssh.get(rpaths, lpath)
def slots_per_host(self):
    """
    Returns the number of slots per host.
    If for some reason the cluster is inconsistent, this will return -1
    for example, if you have m1.large and m1.small in the same cluster
    """
    # NOTE(review): docstring mentions returning -1 but the code raises on
    # inconsistency instead - behavior kept as-is
    total = self.count_total_slots()
    if not total:
        return total
    single = 0
    # take the slot count of the first 'all.q@<host>' queue found
    for qname in self.queues:
        if qname.startswith('all.q@'):
            single = self.queues.get(qname).get('slots')
            break
    # every host must contribute the same number of slots
    if single * len(self.hosts) != total:
        raise exception.BaseException(
            "ERROR: Number of slots not consistent across cluster")
    return single
def get_stats(self):
    """
    This method will ssh to the SGE master and get load & queue stats.
    It will feed these stats to SGEStats, which parses the XML.
    It will return two arrays: one of hosts, each host has a hash with its
    host information inside. The job array contains a hash for every job,
    containing statistics about the job name, priority, etc.
    """
    log.debug("starting get_stats")
    retries = 5
    for attempt in range(1, retries + 1):
        try:
            # success: hand back whatever _get_stats produced
            return self._get_stats()
        except Exception:
            # log the failure (with traceback) and back off before retrying
            log.warn("Failed to retrieve stats (%d/%d):" %
                     (attempt, retries), exc_info=True)
            log.warn("Retrying in %ds" % self.polling_interval)
            time.sleep(self.polling_interval)
    raise exception.BaseException(
        "Failed to retrieve SGE stats after trying %d times, exiting..." %
        retries)
def _setup_cluster_user(self, user=None):
    """
    Create cluster user on all TethysCluster nodes

    This command takes care to examine existing folders in /home and set
    the new cluster_user's uid/gid accordingly. This is necessary for the
    case of EBS volumes containing /home with large amounts of data in
    them. It's much less expensive in this case to set the uid/gid of the
    new user to be the existing uid/gid of the dir in EBS rather than
    chowning potentially terabytes of data.
    """
    user = user or self._user
    uid, gid = self._get_new_user_id(user)
    # a root-owned /home/<user> yields uid/gid 0 - refuse to proceed
    if 0 in (uid, gid):
        raise exception.BaseException(
            "Cannot create user: {0:s} (uid: {1:1d}, gid: {2:1d}). This "
            "is caused by /home/{0:s} directory being owned by root. To "
            "fix this you'll need to create a new AMI. Note that the "
            "instance is still up.".format(user, uid, gid))
    log.info("Creating cluster user: %s (uid: %d, gid: %d)" %
             (user, uid, gid))
    self._add_user_to_nodes(uid, gid, self._nodes)
def run(self, cluster):
    """
    This function will loop indefinitely, using SGELoadBalancer.get_stats()
    to get the clusters status. It looks at the job queue and tries to
    decide whether to add or remove a node. It should later look at job
    durations (currently doesn't)
    """
    self._cluster = cluster
    # fill in unset limits from the cluster / sensible defaults
    if self.max_nodes is None:
        self.max_nodes = cluster.cluster_size
    if self.min_nodes is None:
        self.min_nodes = 1
    if self.kill_cluster:
        # cluster is allowed to shrink to nothing before termination
        self.min_nodes = 0
    if self.min_nodes > self.max_nodes:
        raise exception.BaseException(
            "min_nodes cannot be greater than max_nodes")
    # create the default stats directory only if defaults will be used
    use_default_stats_file = self.dump_stats and not self.stats_file
    use_default_plots_dir = self.plot_stats and not self.plot_output_dir
    if use_default_stats_file or use_default_plots_dir:
        self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
    if not self.stats_file:
        self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
    if not self.plot_output_dir:
        self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
    if not cluster.is_cluster_up():
        raise exception.ClusterNotRunning(cluster.cluster_tag)
    # validate the stats-file and plot-output destinations up front
    if self.dump_stats:
        if os.path.isdir(self.stats_file):
            raise exception.BaseException("stats file destination '%s' is"
                                          " a directory" % self.stats_file)
        sfdir = os.path.dirname(os.path.abspath(self.stats_file))
        self._validate_dir(sfdir, msg_prefix="stats file destination")
    if self.plot_stats:
        if os.path.isfile(self.plot_output_dir):
            raise exception.BaseException("plot output destination '%s' "
                                          "is a file" % self.plot_output_dir)
        self._validate_dir(self.plot_output_dir,
                           msg_prefix="plot output destination")
    # __raw__ extra makes these log lines bypass normal formatting
    raw = dict(__raw__=True)
    log.info("Starting load balancer (Use ctrl-c to exit)")
    log.info("Maximum cluster size: %d" % self.max_nodes, extra=raw)
    log.info("Minimum cluster size: %d" % self.min_nodes, extra=raw)
    log.info("Cluster growth rate: %d nodes/iteration\n" %
             self.add_nodes_per_iteration, extra=raw)
    if self.dump_stats:
        log.info("Writing stats to file: %s" % self.stats_file)
    if self.plot_stats:
        log.info("Plotting stats to directory: %s" % self.plot_output_dir)
    # main polling loop: gather stats, log them, grow/shrink the cluster
    while(self._keep_polling):
        if not cluster.is_cluster_up():
            log.info("Waiting for all nodes to come up...")
            time.sleep(self.polling_interval)
            continue
        self.get_stats()
        log.info("Execution hosts: %d" % len(self.stat.hosts), extra=raw)
        log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                 extra=raw)
        oldest_queued_job_age = self.stat.oldest_queued_job_age()
        if oldest_queued_job_age:
            log.info("Oldest queued job: %s" % oldest_queued_job_age,
                     extra=raw)
        log.info("Avg job duration: %d secs" %
                 self.stat.avg_job_duration(), extra=raw)
        log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                 extra=raw)
        log.info("Last cluster modification time: %s" %
                 self.__last_cluster_mod_time.strftime("%Y-%m-%d %X%z"),
                 extra=dict(__raw__=True))
        # evaluate if nodes need to be added
        self._eval_add_node()
        # evaluate if nodes need to be removed
        self._eval_remove_node()
        if self.dump_stats or self.plot_stats:
            self.stat.write_stats_to_csv(self.stats_file)
        # call the visualizer
        if self.plot_stats:
            try:
                self.visualizer.graph_all()
            except IOError, e:
                raise exception.BaseException(str(e))
        # evaluate if cluster should be terminated
        if self.kill_cluster:
            if self._eval_terminate_cluster():
                log.info("Terminating cluster and exiting...")
                return self._cluster.terminate_cluster()
        log.info("Sleeping...(looping again in %d secs)\n" %
                 self.polling_interval)
        time.sleep(self.polling_interval)