def execute(self, args):
    if not args:
        cls = [c.cluster_tag for c in
               self.cm.get_clusters(load_plugins=False, load_receipt=False)]
        msg = "please specify a cluster"
        if cls:
            opts = ', '.join(cls)
            msg = " ".join([msg, '(options:', opts, ')'])
        self.parser.error(msg)
    for cluster_name in args:
        try:
            cl = self.cm.get_cluster(cluster_name)
        except exception.ClusterDoesNotExist:
            raise
        except Exception as e:
            log.debug("Failed to load cluster settings!", exc_info=True)
            log.error("Failed to load cluster settings!")
            if self.opts.force:
                log.warn("Ignoring cluster settings due to --force option")
                cl = self.cm.get_cluster(cluster_name, load_receipt=False,
                                         require_keys=False)
            else:
                if not isinstance(e, exception.IncompatibleCluster):
                    log.error("Use -f to forcefully stop the cluster")
                raise
        is_stoppable = cl.is_stoppable()
        if not is_stoppable:
            has_stoppable_nodes = cl.has_stoppable_nodes()
            if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                    "nodes. Your options are:\n\n"
                    "1. Use the --terminate-unstoppable option to "
                    "stop all 'stoppable' nodes and terminate all "
                    "'unstoppable' nodes\n\n"
                    "2. Use the 'terminate' command to destroy the "
                    "cluster.\n\nPass --help for more info." % cluster_name)
            if not has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' does not contain any 'stoppable' nodes "
                    "and can only be terminated. Please use the "
                    "'terminate' command instead to destroy the cluster."
                    "\n\nPass --help for more info" % cluster_name)
        if not self.opts.confirm:
            resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
            if resp not in ['y', 'Y', 'yes']:
                log.info("Aborting...")
                continue
        cl.stop_cluster(self.opts.terminate_unstoppable,
                        force=self.opts.force)
        log.warn("All non-spot, EBS-backed nodes are now in a "
                 "'stopped' state")
        log.warn("You can restart this cluster by passing -x "
                 "to the 'start' command")
        log.warn("Use the 'terminate' command to *completely* "
                 "terminate this cluster")
def __init__(self, num_users=None, usernames=None, download_keys=None,
             download_keys_dir=None):
    if usernames:
        usernames = [user.strip() for user in usernames.split(',')]
    if num_users:
        try:
            num_users = int(num_users)
        except ValueError:
            raise exception.BaseException("num_users must be an integer")
    elif usernames:
        num_users = len(usernames)
    else:
        raise exception.BaseException(
            "you must provide num_users or usernames or both")
    if usernames and num_users and len(usernames) != num_users:
        raise exception.BaseException(
            "only %d usernames provided - %d required" %
            (len(usernames), num_users))
    self._num_users = num_users
    if not usernames:
        usernames = ['user%.3d' % i for i in range(1, num_users + 1)]
    self._usernames = usernames
    self._download_keys = str(download_keys).lower() == "true"
    self._download_keys_dir = download_keys_dir or self.DOWNLOAD_KEYS_DIR
    super(CreateUsers, self).__init__()
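# Minimal usage sketch for the constructor above (values are hypothetical;
# assumes the plugin base class needs no extra arguments). Either keyword is
# enough on its own; generated names follow the 'user%.3d' pattern:
CreateUsers(num_users=3)                  # creates user001, user002, user003
CreateUsers(usernames='alice,bob,carol')  # num_users inferred as 3
# CreateUsers(num_users=2, usernames='alice')  -> raises: counts disagree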
def add_user_to_group(self, user, group):
    """
    Add user (if exists) to group (if exists)
    """
    if user not in self.get_user_map():
        raise exception.BaseException("user %s does not exist" % user)
    if group in self.get_group_map():
        self.ssh.execute('gpasswd -a %s %s' % (user, group))
    else:
        raise exception.BaseException("group %s does not exist" % group)
def _mkdir(self, directory, makedirs=False):
    if not os.path.isdir(directory):
        if os.path.isfile(directory):
            raise exception.BaseException(
                "'%s' is a file not a directory" % directory)
        try:
            if makedirs:
                os.makedirs(directory)
                log.info("Created directories %s" % directory)
            else:
                os.mkdir(directory)
                log.info("Created single directory %s" % directory)
        except IOError as e:
            raise exception.BaseException(str(e))
def put(self, node, rpath, lpaths):
    if self.opts.user:
        node.ssh.switch_user(self.opts.user)
    if len(lpaths) > 1 and not node.ssh.isdir(rpath):
        msg = "Remote path in %s is not a directory: %s" % (node, rpath)
        raise exception.BaseException(msg)
    node.ssh.put(lpaths, rpath)
def alias(self):
    """
    Fetches the node's alias stored in a tag from either the instance
    or the instance's parent spot request. If no alias tag is found an
    exception is raised.
    """
    if not self._alias:
        alias = self.tags.get('alias')
        if not alias:
            user_data = self._get_user_data(tries=5)
            aliases = user_data.split('|')
            index = self.ami_launch_index
            try:
                alias = aliases[index]
            except IndexError:
                log.debug("invalid user_data: %s (index: %d)" %
                          (aliases, index))
                alias = None
            if not alias:
                raise exception.BaseException(
                    "instance %s has no alias" % self.id)
            self.add_tag('alias', alias)
        name = self.tags.get('Name')
        if not name:
            self.add_tag('Name', alias)
        self._alias = alias
    return self._alias
def execute(self, args):
    if len(args) < 3:
        self.parser.error("please specify a cluster, remote file or "
                          "directory, and a local destination path")
    ctag = args[0]
    lpath = args[-1]
    rpaths = args[1:-1]
    cl = self.cm.get_cluster(ctag, load_receipt=False)
    try:
        node = cl.get_node(self.opts.node)
    except exception.InstanceDoesNotExist as ide:
        if self.opts.node == "master":
            # the master may be named clustername-master when
            # dns_prefix = True in the config, so check for that alias
            # before giving up
            try:
                node = cl.get_node('%s-%s' % (ctag, self.opts.node))
            except exception.InstanceDoesNotExist:
                # the master is simply not there; raise the original error
                log.debug("Neither master nor %s-%s exist." %
                          (ctag, self.opts.node))
                raise ide
        else:
            # an explicit node name was provided
            raise
    if self.opts.user:
        node.ssh.switch_user(self.opts.user)
    for rpath in rpaths:
        if not glob.has_magic(rpath) and not node.ssh.path_exists(rpath):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" % rpath)
    node.ssh.get(rpaths, lpath)
def download_image_files(self, image_id, destdir):
    """
    Downloads the manifest.xml and all AMI parts for image_id to destdir
    """
    if not os.path.isdir(destdir):
        raise exception.BaseException(
            "destination directory '%s' does not exist" % destdir)
    widgets = ['file: ', progressbar.Percentage(), ' ',
               progressbar.Bar(marker=progressbar.RotatingMarker()), ' ',
               progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
    files = self.get_image_files(image_id)

    def _dl_progress_cb(trans, total):
        pbar.update(trans)

    log.info("Downloading image: %s" % image_id)
    for file in files:
        widgets[0] = "%s:" % file.name
        pbar = progressbar.ProgressBar(widgets=widgets,
                                       maxval=file.size).start()
        file.get_contents_to_filename(os.path.join(destdir, file.name),
                                      cb=_dl_progress_cb)
        pbar.finish()
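# Hypothetical usage of download_image_files (the image id and path are made
# up; assumes `ec2` is an initialized instance of this class and that the
# destination directory already exists, as required above):
ec2.download_image_files('ami-12345678', '/tmp/ami-backup')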
def alias(self):
    """
    Fetches the node's alias stored in a tag from either the instance
    or the instance's parent spot request. If no alias tag is found an
    exception is raised.
    """
    if not self._alias:
        alias = self.tags.get('alias')
        if not alias:
            aliasestxt = self.user_data.get(static.UD_ALIASES_FNAME)
            aliases = aliasestxt.splitlines()[2:]
            index = self.ami_launch_index
            try:
                alias = aliases[index]
            except IndexError:
                alias = None
                log.debug("invalid aliases file in user_data:\n%s" %
                          aliasestxt)
            if not alias:
                raise exception.BaseException(
                    "instance %s has no alias" % self.id)
            self.add_tag('alias', alias)
        if not self.tags.get('Name'):
            self.add_tag('Name', alias)
        self._alias = alias
    return self._alias
def get(self, remotepaths, localpath=''):
    """
    Copies one or more files from the remote host to the local host.
    """
    remotepaths = self._make_list(remotepaths)
    localpath = localpath or os.getcwd()
    globs = []
    noglobs = []
    for rpath in remotepaths:
        if glob.has_magic(rpath):
            globs.append(rpath)
        else:
            noglobs.append(rpath)
    globresults = [self.glob(g) for g in globs]
    remotepaths = noglobs
    for globresult in globresults:
        remotepaths.extend(globresult)
    recursive = False
    for rpath in remotepaths:
        if not self.path_exists(rpath):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" % rpath)
    for rpath in remotepaths:
        if self.isdir(rpath):
            recursive = True
            break
    try:
        self.scp.get(remotepaths, local_path=localpath,
                     recursive=recursive)
    except Exception as e:
        log.debug("get failed: remotepaths=%s, localpath=%s",
                  str(remotepaths), localpath)
        raise exception.SCPException(str(e))
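# Usage sketch for get() above (paths are hypothetical; assumes `ssh` is a
# connected instance of this wrapper). Glob patterns are expanded remotely
# first, and recursive mode is enabled automatically when any resolved
# remote path is a directory:
ssh.get(['/var/log/sge/*.log', '/etc/exports'], localpath='/tmp/debug')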
def _validate_dir(self, dirname, msg_prefix=""):
    if not os.path.isdir(dirname):
        msg = "'%s' is not a directory"
        if not os.path.exists(dirname):
            msg = "'%s' does not exist"
        if msg_prefix:
            msg = ' '.join([msg_prefix, msg])
        msg = msg % dirname
        raise exception.BaseException(msg)
def execute(self, args):
    if len(args) < 3:
        self.parser.error("please specify a cluster, local files or "
                          "directories, and a remote destination path")
    ctag = args[0]
    rpath = args[-1]
    lpaths = args[1:-1]
    for lpath in lpaths:
        if not os.path.exists(lpath):
            raise exception.BaseException(
                "Local file or directory does not exist: %s" % lpath)
    cl = self.cm.get_cluster(ctag, load_receipt=False)
    node = cl.get_node(self.opts.node)
    if self.opts.user:
        node.ssh.switch_user(self.opts.user)
    if len(lpaths) > 1 and not node.ssh.isdir(rpath):
        raise exception.BaseException("Remote path does not exist: %s" %
                                      rpath)
    node.ssh.put(lpaths, rpath)
def _get_type_from_fp(fp):
    line = fp.readline()
    fp.seek(0)
    # slist is sorted longest first
    slist = starts_with_mappings.keys()
    slist.sort(key=lambda e: -1 * len(e))
    for sstr in slist:
        if line.startswith(sstr):
            return starts_with_mappings[sstr]
    raise exception.BaseException("invalid user data type: %s" % line)
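# Toy illustration of the longest-first matching above. The mapping contents
# here are hypothetical -- the point is only that without the sort a shorter
# key like '#cloud' could match first and misclassify the payload:
import StringIO
starts_with_mappings = {'#!': 'text/x-shellscript',
                        '#cloud': 'text/cloud-config',
                        '#cloud-boothook': 'text/cloud-boothook'}
fp = StringIO.StringIO('#cloud-boothook\necho hi\n')
print _get_type_from_fp(fp)  # -> 'text/cloud-boothook'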
def _get_volume_device(self, device=None):
    dev = device or self._aws_block_device
    inst = self._instance
    if inst.ssh.path_exists(dev):
        self._real_device = dev
        return dev
    # newer (Xen) kernels expose /dev/sdX devices as /dev/xvdX, so fall
    # back to the equivalent xvd name before giving up
    xvdev = '/dev/xvd' + dev[-1]
    if inst.ssh.path_exists(xvdev):
        self._real_device = xvdev
        return xvdev
    raise exception.BaseException("Can't find volume device")
def user_data(self):
    if not self._user_data:
        try:
            raw = self._get_user_data()
            self._user_data = userdata.unbundle_userdata(raw)
        except IOError as e:
            parent_cluster = self.parent_cluster
            if parent_cluster:
                raise exception.IncompatibleCluster(parent_cluster)
            else:
                raise exception.BaseException(
                    "Error occurred unbundling userdata: %s" % e)
    return self._user_data
def execute(self, args):
    if not self.cfg.globals.enable_experimental:
        raise exception.ExperimentalFeature("The 'put' command")
    if len(args) < 3:
        self.parser.error("please specify a cluster, local files or "
                          "directories, and a remote destination path")
    ctag = args[0]
    rpath = args[-1]
    lpaths = args[1:-1]
    for lpath in lpaths:
        if not os.path.exists(lpath):
            raise exception.BaseException(
                "Local file or directory does not exist: %s" % lpath)
    cl = self.cm.get_cluster(ctag)
    node = cl.get_node_by_alias(self.opts.node)
    if self.opts.user:
        node.ssh.switch_user(self.opts.user)
    if len(lpaths) > 1 and not node.ssh.isdir(rpath):
        raise exception.BaseException("Remote path does not exist: %s" %
                                      rpath)
    node.ssh.put(lpaths, rpath)
def write_stats_to_csv(self, filename):
    """
    Write important SGE stats to CSV file

    Appends one line to the CSV
    """
    bits = self.get_all_stats()
    try:
        f = open(filename, 'a')
        flat = ','.join(str(n) for n in bits) + '\n'
        f.write(flat)
        f.close()
    except IOError as e:
        raise exception.BaseException(str(e))
def visualizer(self):
    if not self._visualizer:
        try:
            from starcluster.balancers.sge import visualizer
        except ImportError as e:
            log.error("Error importing visualizer:")
            log.error(str(e))
            log.error("check that matplotlib and numpy are installed and:")
            log.error("   $ python -c 'import matplotlib'")
            log.error("   $ python -c 'import numpy'")
            log.error("completes without error")
            raise exception.BaseException(
                "Failed to load stats visualizer")
        self._visualizer = visualizer.SGEVisualizer(self.stats_file,
                                                    self.plot_output_dir)
    return self._visualizer
def create_keypair(self, name, output_file=None):
    """
    Create a new EC2 keypair and optionally save to output_file

    Returns boto.ec2.keypair.KeyPair
    """
    if output_file:
        output_dir = os.path.dirname(output_file)
        if output_dir and not os.path.exists(output_dir):
            raise exception.BaseException(
                "output directory does not exist")
        if os.path.exists(output_file):
            raise exception.BaseException(
                "cannot save keypair %s: file already exists" % output_file)
    kp = self.conn.create_key_pair(name)
    if output_file:
        try:
            kfile = open(output_file, 'wb')
            kfile.write(kp.material)
            kfile.close()
            os.chmod(output_file, 0400)
        except IOError as e:
            raise exception.BaseException(str(e))
    return kp
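# Usage sketch (assumes `ec2` is an initialized instance of this class; the
# key name and output path are hypothetical). The private key material is
# written with mode 0400 so ssh will accept it:
kp = ec2.create_keypair('mycluster-key',
                        output_file='/home/user/.ssh/mycluster-key.rsa')
print kp.name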
def execute(self, args):
    if len(args) < 3:
        self.parser.error("please specify a cluster, remote file or "
                          "directory, and a local destination path")
    ctag = args[0]
    lpath = args[-1]
    rpaths = args[1:-1]
    cl = self.cm.get_cluster(ctag, load_receipt=False)
    node = cl.get_node(self.opts.node)
    if self.opts.user:
        node.ssh.switch_user(self.opts.user)
    for rpath in rpaths:
        if not glob.has_magic(rpath) and not node.ssh.path_exists(rpath):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" % rpath)
    node.ssh.get(rpaths, lpath)
def open_browser(url, browser_cmd=None):
    if browser_cmd:
        cmd = shlex.split(browser_cmd)
        arg0 = cmd[0]
        if not _which(arg0):
            raise exception.BaseException("browser %s does not exist" %
                                          arg0)
        if "%s" not in browser_cmd:
            cmd.append("%s")
        browser = BackgroundBrowser(cmd)
    else:
        # use 'default' browser from webbrowser module
        browser = webbrowser.get()
    browser_name = getattr(browser, 'name', None)
    if not browser_name:
        browser_name = getattr(browser, '_name', 'UNKNOWN')
    log.info("Browsing %s using '%s'..." % (url, browser_name))
    return browser.open(url)
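# Hypothetical call: open a cluster's web dashboard with an explicit browser
# command. Any '%s' in browser_cmd is substituted with the URL; if none is
# present one is appended automatically:
open_browser('http://master/ganglia', browser_cmd='firefox -new-tab %s')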
def alias(self):
    """
    Return the alias stored in this node's user data.

    Alias returned as: user_data.split('|')[self.ami_launch_index]
    """
    if not self._alias:
        user_data = self.ec2.get_instance_user_data(self.id)
        aliases = user_data.split('|')
        index = self.ami_launch_index
        alias = aliases[index]
        if not alias:
            # TODO: raise exception about old version
            raise exception.BaseException("instance %s has no alias" %
                                          self.id)
        self._alias = alias
    return self._alias
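# Illustration of the '|'-separated user_data format parsed above (values
# hypothetical): every instance in the reservation receives the same user
# data and selects its own alias by ami_launch_index:
user_data = 'master|node001|node002'
print user_data.split('|')[1]  # instance with launch index 1 -> 'node001'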
def slots_per_host(self):
    """
    Returns the number of slots per host.

    If the cluster is inconsistent -- for example, it mixes m1.large
    and m1.small nodes so hosts advertise different slot counts -- an
    exception is raised rather than returning a misleading number.
    """
    total = self.count_total_slots()
    if total == 0:
        return total
    single = 0
    for q in self.queues:
        if q.startswith('all.q@'):
            single = self.queues.get(q).get('slots')
            break
    if total != single * len(self.hosts):
        raise exception.BaseException(
            "ERROR: Number of slots not consistent across cluster")
    return single
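# Worked example of the consistency check above (numbers hypothetical): a
# homogeneous 3-host cluster with 2 slots each passes, while a mixed cluster
# with one 2-slot and two 1-slot hosts fails:
hosts, single = 3, 2
assert 6 == single * hosts   # 2 + 2 + 2: consistent, returns 2
assert 4 != single * hosts   # 2 + 1 + 1: inconsistent, exception raised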
def execute(self, args):
    if not self.cfg.globals.enable_experimental:
        raise exception.ExperimentalFeature("The 'get' command")
    if len(args) < 3:
        self.parser.error("please specify a cluster, remote file or "
                          "directory, and a local destination path")
    ctag = args[0]
    lpath = args[-1]
    rpaths = args[1:-1]
    cl = self.cm.get_cluster(ctag)
    node = cl.get_node_by_alias(self.opts.node)
    if self.opts.user:
        node.ssh.switch_user(self.opts.user)
    for rpath in rpaths:
        if not glob.has_magic(rpath) and not node.ssh.path_exists(rpath):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" % rpath)
    node.ssh.get(rpaths, lpath)
def alias(self):
    """
    Fetches the node's alias stored in a tag from either the instance
    or the instance's parent spot request. If no alias tag is found an
    exception is raised.
    """
    if not self._alias:
        alias = self.tags.get('alias')
        if not alias:
            alias = self.get_aliases(self.ami_launch_index)
            if not alias:
                raise exception.BaseException(
                    "instance %s has no alias" % self.id)
            self.add_tag('alias', alias)
        if not self.tags.get('Name'):
            self.add_tag('Name', alias)
        self._alias = alias
    return self._alias
def alias(self):
    """
    Fetches the node's alias stored in a tag from either the instance
    or the instance's parent spot request. If no alias tag is found an
    exception is raised.
    """
    if not self._alias:
        alias = self.tags.get('alias')
        if not alias:
            user_data = self.ec2.get_instance_user_data(self.id)
            aliases = user_data.split('|')
            index = self.ami_launch_index
            alias = aliases[index]
            if not alias:
                # TODO: raise exception about old version
                raise exception.BaseException("instance %s has no alias" %
                                              self.id)
            self.add_tag('alias', alias)
        self._alias = alias
    return self._alias
def execute(self, args):
    if len(args) < 3:
        self.parser.error("please specify a cluster, local files or "
                          "directories, and a remote destination path")
    ctag = args[0]
    rpath = args[-1]
    lpaths = args[1:-1]
    for lpath in lpaths:
        if not os.path.exists(lpath):
            raise exception.BaseException(
                "Local file or directory does not exist: %s" % lpath)
    cl = self.cm.get_cluster(ctag, load_receipt=False)
    if self.opts.multi:
        nodes = [cl.get_node_by_alias(nodename)
                 for nodename in list_csv(self.opts.multi)]
    else:
        nodes = [cl.get_node_by_alias(self.opts.node)]
    for node in nodes:
        self.put(node, rpath, lpaths)
def _setup_cluster_user(self, user=None):
    """
    Create cluster user on all StarCluster nodes

    This command takes care to examine existing folders in /home and
    set the new cluster_user's uid/gid accordingly. This is necessary
    for the case of EBS volumes containing /home with large amounts of
    data in them. It's much less expensive in this case to set the
    uid/gid of the new user to be the existing uid/gid of the dir in
    EBS rather than chowning potentially terabytes of data.
    """
    user = user or self._user
    uid, gid = self._get_new_user_id(user)
    if uid == 0 or gid == 0:
        raise exception.BaseException(
            "Cannot create user: {0:s} (uid: {1:d}, gid: {2:d}). This "
            "is caused by the /home/{0:s} directory being owned by "
            "root. To fix this you'll need to create a new AMI. Note "
            "that the instance is still up.".format(user, uid, gid))
    log.info("Creating cluster user: %s (uid: %d, gid: %d)" %
             (user, uid, gid))
    self._add_user_to_nodes(uid, gid, self._nodes)
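# Sketch of the uid/gid-reuse idea described in the docstring above (this
# helper is hypothetical -- the real lookup happens in _get_new_user_id):
# if /home/<user> already exists on an attached EBS volume, reuse its
# owner's ids instead of chowning potentially terabytes of data.
import os

def _existing_home_ids(user):
    home = '/home/%s' % user
    if os.path.isdir(home):
        st = os.stat(home)
        return st.st_uid, st.st_gid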
def get_stats(self):
    """
    This method will ssh to the SGE master and get load & queue stats.
    It will feed these stats to SGEStats, which parses the XML. It will
    return two arrays: one of hosts, where each host has a hash with
    its host information inside, and one of jobs, where each job hash
    contains statistics about the job name, priority, etc.
    """
    log.debug("starting get_stats")
    retries = 5
    for i in range(retries):
        try:
            return self._get_stats()
        except Exception:
            log.warn("Failed to retrieve stats (%d/%d):" %
                     (i + 1, retries), exc_info=True)
            log.warn("Retrying in %ds" % self.polling_interval)
            time.sleep(self.polling_interval)
    raise exception.BaseException(
        "Failed to retrieve SGE stats after trying %d times, exiting..." %
        retries)
def run(self, cluster):
    """
    This function will loop indefinitely, using
    SGELoadBalancer.get_stats() to get the cluster's status. It looks
    at the job queue and tries to decide whether to add or remove a
    node. It should later look at job durations (currently doesn't).
    """
    self._cluster = cluster
    if self.max_nodes is None:
        self.max_nodes = cluster.cluster_size
    if self.min_nodes is None:
        self.min_nodes = 1
    if self.kill_cluster:
        self.min_nodes = 0
    if self.min_nodes > self.max_nodes:
        raise exception.BaseException(
            "min_nodes cannot be greater than max_nodes")
    use_default_stats_file = self.dump_stats and not self.stats_file
    use_default_plots_dir = self.plot_stats and not self.plot_output_dir
    if use_default_stats_file or use_default_plots_dir:
        self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
    if not self.stats_file:
        self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
    if not self.plot_output_dir:
        self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
    if not cluster.is_cluster_up():
        raise exception.ClusterNotRunning(cluster.cluster_tag)
    if self.dump_stats:
        if os.path.isdir(self.stats_file):
            raise exception.BaseException("stats file destination '%s' is"
                                          " a directory" % self.stats_file)
        sfdir = os.path.dirname(os.path.abspath(self.stats_file))
        self._validate_dir(sfdir, msg_prefix="stats file destination")
    if self.plot_stats:
        if os.path.isfile(self.plot_output_dir):
            raise exception.BaseException("plot output destination '%s' "
                                          "is a file" %
                                          self.plot_output_dir)
        self._validate_dir(self.plot_output_dir,
                           msg_prefix="plot output destination")
    raw = dict(__raw__=True)
    log.info("Starting load balancer (Use ctrl-c to exit)")
    log.info("Maximum cluster size: %d" % self.max_nodes, extra=raw)
    log.info("Minimum cluster size: %d" % self.min_nodes, extra=raw)
    log.info("Cluster growth rate: %d nodes/iteration\n" %
             self.add_nodes_per_iteration, extra=raw)
    if self.dump_stats:
        log.info("Writing stats to file: %s" % self.stats_file)
    if self.plot_stats:
        log.info("Plotting stats to directory: %s" % self.plot_output_dir)
    while self._keep_polling:
        cluster.recover(reboot_interval=self.reboot_interval,
                        n_reboot_restart=self.n_reboot_restart)
        cluster.clean()
        if not cluster.is_cluster_up():
            log.info("Waiting for all nodes to come up...")
            time.sleep(self.polling_interval)
            continue
        self.get_stats()
        log.info("Execution hosts: %d" % len(self.stat.hosts), extra=raw)
        log.info("Execution slots: %d" % self.stat.count_total_slots(),
                 extra=raw)
        log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                 extra=raw)
        oldest_queued_job_age = self.stat.oldest_queued_job_age()
        if oldest_queued_job_age:
            log.info("Oldest queued job: %s" % oldest_queued_job_age,
                     extra=raw)
        log.info("Avg job duration: %d secs" %
                 self.stat.avg_job_duration(), extra=raw)
        log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                 extra=raw)
        log.info("Last cluster modification time: %s" %
                 self.__last_cluster_mod_time.isoformat(), extra=raw)
        # evaluate whether nodes need to be added
        skip_sleep = self._eval_add_node()
        # evaluate whether nodes need to be removed
        self._eval_remove_node()
        if self.dump_stats or self.plot_stats:
            self.stat.write_stats_to_csv(self.stats_file)
        # call the visualizer
        if self.plot_stats:
            try:
                self.visualizer.graph_all()
            except IOError as e:
                raise exception.BaseException(str(e))
        # evaluate whether the cluster should be terminated
        if self.kill_cluster:
            if self._eval_terminate_cluster():
                log.info("Terminating cluster and exiting...")
                return self._cluster.terminate_cluster()
        if not skip_sleep:
            log.info("Sleeping...(looping again in %d secs)\n" %
                     self.polling_interval)
            time.sleep(self.polling_interval)