def _mount_volume(self, node, volume, mount_path):
    """Mount an attached EBS volume on node at mount_path.

    Logs an error and returns early if the volume has no attachment
    device recorded.
    """
    # identity test for None rather than '==' (PEP 8)
    if volume.attach_data.device is None:
        log.error("Volume %s has not been attached" % volume.id)
        return
    # newer kernels expose EBS devices as /dev/xvd* instead of /dev/sd*
    device = volume.attach_data.device.replace("sd", "xvd")
    self._mount_device(node, device, mount_path)
def main(self):
    """Top-level CLI entry point: build parsers, dispatch a subcommand.

    Handles shell tab-completion via optcomplete when available and maps
    boto service errors to a logged message and exit status 1.
    """
    # Create global options parser.
    self.gparser = gparser = self.create_global_parser()
    # Declare subcommands.
    subcmds = commands.all_cmds
    # subcommand completions
    scmap = {}
    for sc in subcmds:
        for n in sc.names:
            scmap[n] = sc
    if optcomplete:
        listcter = optcomplete.ListCompleter(scmap.keys())
        subcter = optcomplete.NoneCompleter()
        optcomplete.autocomplete(
            gparser, listcter, None, subcter, subcommands=scmap)
    elif 'COMP_LINE' in os.environ:
        # completion was requested but optcomplete is unavailable
        return -1
    gopts, sc, opts, args = self.parse_subcommands(gparser, subcmds)
    if args and args[0] == 'help':
        # make 'help' after a subcommand behave like --help
        sc.parser.print_help()
        sys.exit(0)
    try:
        sc.execute(args)
    except (EC2ResponseError, S3ResponseError, BotoServerError), e:
        # AWS-side failures: report code/message and exit non-zero
        log.error("%s: %s" % (e.error_code, e.error_message))
        sys.exit(1)
def parse_subcommands(self, gparser=None):
    """
    Parse global arguments, find subcommand from list of subcommand
    objects, parse local subcommand arguments and return a tuple of
    global options, selected command object, command options, and
    command arguments.

    Call execute() on the command object to run. The command object has
    members 'gopts' and 'opts' set for global and command options
    respectively, you don't need to call execute with those but you
    could if you wanted to.
    """
    gparser = gparser or self.gparser
    # parse global options.
    gopts, args = gparser.parse_args()
    if not args:
        gparser.print_help()
        raise SystemExit("\nError: you must specify an action.")
    # set debug level if specified
    if gopts.DEBUG:
        console.setLevel(logger.DEBUG)
        config.DEBUG_CONFIG = True
    # load StarClusterConfig into global options
    try:
        cfg = config.StarClusterConfig(gopts.CONFIG)
        cfg.load()
    except exception.ConfigNotFound, e:
        # report the missing config plus suggested alternatives, then exit
        log.error(e.msg)
        e.display_options()
        sys.exit(1)
def __local_mpe_instalation(self):
    """Run the W2IO MPE installation script on the local machine.

    Script output is redirected to log/w2io-instalation.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: local MPE instalation')
        call(self.__remote_module_path + '/bin/w2io-instalation.sh 2>&1 > ' +
             self.__remote_module_path + '/log/w2io-instalation.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and include the cause in the
        # message (matches the convention used by the PabPlug handlers)
        log.error('W2IO: local MPE instalation error: ' + str(e))
        sys.exit('W2IO: local MPE instalation error: ' + str(e))
def __local_dependency_instalation(self):
    """Run the W2IO dependency installation script on the local machine.

    Script output is redirected to log/w2io-dependency.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: local Dependency instalation')
        call(self.__remote_module_path + '/bin/w2io-dependency.sh 2>&1 > ' +
             self.__remote_module_path + '/log/w2io-dependency.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and report the cause
        log.error('W2IO: local Dependency instalation error: ' + str(e))
        sys.exit('W2IO: local Dependency instalation error: ' + str(e))
def __local_fix_fetch(self):
    """Run the fetch-problem fixup script on the local machine.

    Script output is redirected to log/fetch-problem.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: local Fixing fetch problem')
        call(self.__remote_module_path + '/bin/fetch-problem.sh 2>&1 > ' +
             self.__remote_module_path + '/log/fetch-problem.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and report the cause
        log.error('W2IO: local Fixing fetch problem error: ' + str(e))
        sys.exit('W2IO: local Fixing fetch problem error: ' + str(e))
def __mpe_instalation(self, node):
    """Run the W2IO MPE installation script on node via ssh.

    Script output is redirected to log/w2io-instalation.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: ' + node.alias + ' MPE instalation')
        node.ssh.execute(
            self.__remote_module_path + '/bin/w2io-instalation.sh 2>&1 > ' +
            self.__remote_module_path + '/log/w2io-instalation.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and report the cause
        log.error('W2IO: ' + node.alias + ' MPE instalation error: ' + str(e))
        sys.exit('W2IO: ' + node.alias + ' MPE instalation error: ' + str(e))
def recover(self, nodes, master, user, user_shell, volumes):
    """Restart sge_qmaster on the master node if it is not running.

    Uses a ps/grep pipeline to count sge_qmaster processes; when the
    count is zero, starts the daemon from its install directory.
    NOTE(review): the /opt/sge6/bin/linux-x64 path is hard-coded --
    confirm it matches the cluster's SGE install location.
    """
    cmd = "ps -ef | grep sge_qmaster | grep -v grep | wc -l"
    rez = int(master.ssh.execute(cmd)[0])
    if rez == 0:
        log.error("sge_qmaster is down")
        # sge_qmaster must be started from its own directory
        cmd = "cd /opt/sge6/bin/linux-x64/ && ./sge_qmaster"
        master.ssh.execute(cmd)
def __init__(self, enable_notebook=False, notebook_passwd=None,
             notebook_directory=None, packer=None, n_engines_per_node=None,
             n_engines_master=None,
             hub_db_class='IPython.parallel.controller.dictdb.NoDB',
             log_level='INFO'):
    """Configure the ReliableIPCluster plugin.

    enable_notebook may arrive from the config file as a string; only
    the (case-insensitive) string 'true' enables it. A random 16-char
    notebook password is generated when none is supplied. Engine counts
    are coerced to int when given; unsupported packers are rejected
    with a logged error and treated as None.
    """
    super(ReliableIPCluster, self).__init__()
    # config values may be strings; 'true' (any case, stripped) enables
    if isinstance(enable_notebook, basestring):
        self.enable_notebook = enable_notebook.lower().strip() == 'true'
    else:
        self.enable_notebook = enable_notebook
    self.notebook_passwd = notebook_passwd or utils.generate_passwd(16)
    self.notebook_directory = notebook_directory
    self.hub_db_class = hub_db_class
    if n_engines_per_node is None:
        self.n_engines_per_node = None
    else:
        self.n_engines_per_node = int(n_engines_per_node)
    if n_engines_master is None:
        self.n_engines_master = None
    else:
        self.n_engines_master = int(n_engines_master)
    self.log_level = log_level
    # only known IPython serialization packers are accepted
    if packer not in (None, 'json', 'pickle', 'msgpack'):
        log.error("Unsupported packer: %s", packer)
        self.packer = None
    else:
        self.packer = packer
def dependency_instalation(self, node):
    """Run the PVFS dependency installation script on node via ssh.

    Script output is redirected to log/pvfs-dependency.log under the
    remote module path; exits the process on any failure.
    """
    try:
        log.info('PabPlug: ' + node.alias + ' Dependency instalation')
        script = self.__remote_module_path + '/bin/pvfs-dependency.sh'
        logfile = self.__remote_module_path + '/log/pvfs-dependency.log'
        node.ssh.execute(script + ' 2>&1 > ' + logfile)
    except Exception as e:
        msg = ('PabPlug: ' + node.alias + ' Dependency instalation error: ' +
               str(e))
        log.error(msg)
        sys.exit(msg)
def create(self, volume_size, volume_zone, name=None, tags=None):
    """Create, attach, format and tag a new EBS volume.

    volume_size - size in GB
    volume_zone - availability zone to create the volume in
    name - optional value for the volume's 'Name' tag
    tags - optional dict of tag-name -> tag-value pairs

    On any failure the partially created volume (if any) is detached
    and deleted before the exception is re-raised.
    NOTE(review): this method mixes 'self.log' and module-level 'log' --
    confirm both are valid in the enclosing class.
    """
    try:
        self.validate(volume_size, volume_zone, self._device)
        instance = self._request_instance(volume_zone)
        # mkfs must exist on the host before we bother creating anything
        self._validate_required_progs([self._mkfs_cmd.split()[0]])
        self._determine_device()
        vol = self._create_volume(volume_size, volume_zone)
        if tags:
            for tag in tags:
                tagval = tags.get(tag)
                tagmsg = "Adding volume tag: %s" % tag
                if tagval:
                    tagmsg += "=%s" % tagval
                log.info(tagmsg)
                vol.add_tag(tag, tagval)
        if name:
            vol.add_tag("Name", name)
        self._attach_volume(self._volume, instance.id, self._device)
        self._format_volume()
        self.shutdown()
        self._warn_about_volume_hosts()
        self.log.info("Your new %sGB volume %s has been created "
                      "successfully" % (volume_size, vol.id))
        return vol
    except Exception:
        self.log.error("failed to create new volume")
        if self._volume:
            # roll back: detach, wait briefly, then delete the new volume
            log.error(
                "Error occured. Detaching, and deleting volume: %s" %
                self._volume.id)
            self._volume.detach(force=True)
            time.sleep(5)
            self._volume.delete()
        self._warn_about_volume_hosts()
        raise
def _validate(self, validate_running=False):
    """
    Checks that all cluster template settings are valid. Raises a
    ClusterValidationError exception if not. Passing
    validate_running=True will also check that the existing instances
    properties match the configuration of this cluster template.
    """
    log.info("Validating cluster template settings...")
    self._has_all_required_settings()
    self._validate_spot_bid()
    self._validate_cluster_size()
    self._validate_shell_setting()
    self._validate_permission_settings()
    self._validate_credentials()
    self._validate_keypair()
    self._validate_zone()
    self._validate_ebs_settings()
    self._validate_ebs_aws_settings()
    self._validate_image_settings()
    self._validate_instance_types()
    if validate_running:
        log.info("Validating existing instances...")
        try:
            self._validate_running_instances()
        except exception.ClusterValidationError, e:
            # add context before letting the caller handle the error
            log.error("existing instances are not compatible with cluster" +
                      " template settings:")
            raise
def main(self):
    """Top-level CLI entry point: build parsers, dispatch a subcommand.

    Handles shell tab-completion via optcomplete when available and
    prints each line of a StarCluster BaseException message before
    exiting with status 1.
    """
    # Create global options parser.
    self.gparser = gparser = self.create_global_parser()
    # Declare subcommands.
    subcmds = all_cmds
    # subcommand completions
    scmap = {}
    for sc in subcmds:
        for n in sc.names:
            scmap[n] = sc
    if optcomplete:
        listcter = optcomplete.ListCompleter(scmap.keys())
        subcter = optcomplete.NoneCompleter()
        optcomplete.autocomplete(
            gparser, listcter, None, subcter, subcommands=scmap)
    elif 'COMP_LINE' in os.environ:
        # completion was requested but optcomplete is unavailable
        return -1
    gopts, sc, opts, args = self.parse_subcommands(gparser, subcmds)
    if args and args[0] == 'help':
        # make 'help' after a subcommand behave like --help
        sc.parser.print_help()
        sys.exit(0)
    try:
        sc.execute(args)
    except exception.BaseException, e:
        # log multi-line error messages one line at a time
        lines = e.msg.splitlines()
        for l in lines:
            log.error(l)
        sys.exit(1)
def get_value(self, value, node):
    """
    Handle variables

    [[date]]
    [[alias]] - node name
    [[master]] - name of this nodes master
    [[localuser]] - user name of person that started cluster,
    according to machine cluster started from
    """
    auto_pattern = r"\[\[(.+)\]\]"
    auto_v = re.match(auto_pattern, value)
    if auto_v:
        command = auto_v.group(1).strip()
        if command == "date":
            return datetime.utcnow().strftime("%c UTC")
        if command == "alias":
            return node.alias
        if command == "master":
            # NOTE(review): 'master' is not defined in this scope -- this
            # branch raises NameError unless a module-level 'master' exists.
            # A sibling implementation of this method comments this branch
            # out entirely; confirm intent.
            return master.alias
        if command == "localuser":
            return getpass.getuser()
        # patterned tag with no recognized command: warn, then use the
        # extracted text as the tag value
        log.error(
            ("Tagging: <%s> appears to be a patterned tag, but "
             "no command found. Tagging as <%s>.") % (value, command)
        )
        return command
    else:
        return value
def get_spot_history(self, instance_type, start=None, end=None, plot=False):
    """Fetch and summarize EC2 spot price history for instance_type.

    start/end must be ISO-8601 timestamps (validated up front). Logs the
    current, max and average price; optionally plots price vs date with
    pylab. Raises SpotHistoryError when no history is returned.
    """
    if not utils.is_iso_time(start):
        raise exception.InvalidIsoDate(start)
    if not utils.is_iso_time(end):
        raise exception.InvalidIsoDate(end)
    hist = self.conn.get_spot_price_history(start_time=start,
                                            end_time=end,
                                            instance_type=instance_type,
                                            product_description="Linux/UNIX")
    if not hist:
        raise exception.SpotHistoryError(start, end)
    dates = [utils.iso_to_datetime_tuple(i.timestamp) for i in hist]
    prices = [i.price for i in hist]
    maximum = max(prices)
    avg = sum(prices) / len(prices)
    log.info("Current price: $%.2f" % hist[-1].price)
    log.info("Max price: $%.2f" % maximum)
    log.info("Average price: $%.2f" % avg)
    if plot:
        try:
            # pylab is optional; only needed when plotting was requested
            import pylab
            pylab.plot_date(pylab.date2num(dates), prices, linestyle='-')
            pylab.xlabel('date')
            pylab.ylabel('price (cents)')
            pylab.title('%s Price vs Date (%s - %s)' %
                        (instance_type, start, end))
            pylab.grid(True)
            pylab.show()
        except ImportError, e:
            log.error("Error importing pylab:")
            log.error(str(e))
            log.error("please check that matplotlib is installed and that:")
            log.error("   $ python -c 'import pylab'")
            log.error("completes without error")
def list_bucket(self, bucketname): bucket = self.get_bucket_or_none(bucketname) if bucket: for file in bucket.list(): if file.name: print file.name else: log.error('bucket %s does not exist' % bucketname)
def execute(self, command, silent=True, only_printable=False,
            ignore_exit_status=False, log_output=True, detach=False,
            source_profile=True, raise_on_failure=True):
    """
    Execute a remote command and return stdout/stderr

    NOTE: this function blocks until the process finishes

    kwargs:
    silent - don't print the command's output to the console
    only_printable - filter the command's output to allow only printable
                     characters
    ignore_exit_status - don't warn about non-zero exit status
    log_output - log all remote output to the debug file
    detach - detach the remote process so that it continues to run even
             after the SSH connection closes (does NOT return output or
             check for non-zero exit status if detach=True)
    source_profile - if True prefix the command with "source /etc/profile"
    raise_on_failure - raise exception.SSHError if command fails

    returns List of output lines
    """
    channel = self.transport.open_session()
    if detach:
        # run under nohup and return immediately; no output is collected
        command = "nohup %s &" % command
        if source_profile:
            command = "source /etc/profile && %s" % command
        channel.exec_command(command)
        channel.close()
        self.__last_status = None
        return
    if source_profile:
        command = "source /etc/profile && %s" % command
    log.debug("executing remote command: %s" % command)
    channel.exec_command(command)
    output = self._get_output(channel, silent=silent,
                              only_printable=only_printable)
    # recv_exit_status blocks until the remote process terminates
    exit_status = channel.recv_exit_status()
    self.__last_status = exit_status
    out_str = '\n'.join(output)
    if exit_status != 0:
        msg = "remote command '%s' failed with status %d"
        msg %= (command, exit_status)
        if log_output:
            msg += ":\n%s" % out_str
        else:
            msg += " (no output log requested)"
        if not ignore_exit_status:
            if raise_on_failure:
                raise exception.RemoteCommandFailed(
                    msg, command, exit_status, out_str)
            else:
                log.error(msg)
        else:
            log.debug("(ignored) " + msg)
    else:
        if log_output:
            log.debug("output of '%s':\n%s" % (command, out_str))
        else:
            log.debug("output of '%s' has been hidden" % command)
    return output
def get_value(self, value, node):
    """
    Handle special values

    [[date]]
    [[alias]] - node name
    [[localuser]] - user name of person that started cluster,
    according to machine cluster started from
    """
    match = re.match(r'\[\[(.+)\]\]', value)
    if not match:
        # plain value: use it verbatim
        return value
    special_value = match.group(1).strip()
    if special_value == 'date':
        return datetime.utcnow().strftime('%c UTC')
    if special_value == 'alias':
        return node.alias
    # if special_value == 'master':
    #     return master.alias
    if special_value == 'localuser':
        return getpass.getuser()
    # looked like a [[...]] tag but no recognized keyword: warn and tag
    # with the extracted text as-is
    log.error(("Tagging: <%s> appears to be a patterned tag, but "
               "no special_value found. Tagging as <%s>.") %
              (value, special_value))
    return special_value
def is_valid(self, size, zone, device):
    """Return True if (size, zone, device) passes validate(), else False.

    Validation failures are logged rather than propagated.
    """
    try:
        self.validate(size, zone, device)
        return True
    except exception.BaseException, e:
        log.error(e.msg)
        return False
def _completer(self):
    """Build an optcomplete ListCompleter of registered AMI ids.

    Returns None (implicitly) if fetching the image list fails; the
    failure is logged.
    """
    try:
        rimages = self.ec2.registered_images
        completion_list = [i.id for i in rimages]
        return optcomplete.ListCompleter(completion_list)
    except Exception, e:
        log.error('something went wrong fix me: %s' % e)
def is_valid(self, size, zone, device, image):
    """Return True if (size, zone, device, image) passes validate().

    ValidationError is logged and converted to a False return.
    """
    try:
        self.validate(size, zone, device, image)
        return True
    except exception.ValidationError, e:
        log.error(e.msg)
        return False
def _eval_remove_node(self):
    """
    This function uses the sge stats to decide whether or not to
    remove a node from the cluster.
    """
    qlen = len(self.stat.get_queued_jobs())
    if qlen != 0:
        # jobs are still waiting; never shrink while there is a backlog
        return
    if not self.has_cluster_stabilized():
        return
    num_nodes = len(self._cluster.nodes)
    if num_nodes <= self.min_nodes:
        log.info("Not removing nodes: already at or below minimum (%d)" %
                 self.min_nodes)
        return
    # never remove so many that we drop below min_nodes
    max_remove = num_nodes - self.min_nodes
    log.info("Looking for nodes to remove...")
    remove_nodes = self._find_nodes_for_removal(max_remove=max_remove)
    if not remove_nodes:
        log.info("No nodes can be removed at this time")
    for node in remove_nodes:
        if node.update() != "running":
            log.error("Node %s is already dead - not removing" %
                      node.alias)
            continue
        log.warn("Removing %s: %s (%s)" %
                 (node.alias, node.id, node.dns_name))
        try:
            self._cluster.remove_node(node)
            # record the change so has_cluster_stabilized() can throttle
            self.__last_cluster_mod_time = utils.get_utc_now()
        except Exception:
            log.error("Failed to remove node %s" % node.alias,
                      exc_info=True)
def create(self, volume_size, volume_zone, name=None, tags=None):
    """Create, attach, format and tag a new EBS volume.

    volume_size - size in GB
    volume_zone - availability zone to create the volume in
    name - optional value for the volume's 'Name' tag
    tags - optional dict of tag-name -> tag-value pairs

    On failure the new volume is cleaned up via _delete_new_volume()
    and the exception is re-raised; _warn_about_volume_hosts() always
    runs regardless of outcome.
    """
    try:
        self.validate(volume_size, volume_zone, self._aws_block_device)
        instance = self._request_instance(volume_zone)
        # mkfs must exist on the host before we bother creating anything
        self._validate_required_progs([self._mkfs_cmd.split()[0]])
        self._determine_device()
        vol = self._create_volume(volume_size, volume_zone)
        if tags:
            for tag in tags:
                tagval = tags.get(tag)
                tagmsg = "Adding volume tag: %s" % tag
                if tagval:
                    tagmsg += "=%s" % tagval
                log.info(tagmsg)
                vol.add_tag(tag, tagval)
        if name:
            vol.add_tag("Name", name)
        self._attach_volume(self._volume, instance.id,
                            self._aws_block_device)
        self._get_volume_device(self._aws_block_device)
        self._format_volume()
        self.shutdown()
        log.info("Your new %sGB volume %s has been created successfully" %
                 (volume_size, vol.id))
        return vol
    except Exception:
        log.error("Failed to create new volume", exc_info=True)
        self._delete_new_volume()
        raise
    finally:
        self._warn_about_volume_hosts()
def _eval_add_node(self):
    """
    This function inspects the current state of the SGE queue and decides
    whether or not to add nodes to the cluster. Returns the number of
    nodes to add.
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return
    total_slots = self.stat.count_total_slots()
    # throttle growth while the cluster is still settling, unless there
    # are no slots at all yet
    if not self.has_cluster_stabilized() and total_slots > 0:
        return
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum([int(j['slots']) for j in running_jobs])
    qw_slots = sum([int(j['slots']) for j in queued_jobs])
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        # only react once a job has waited longer than the allowed max
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (age_delta.seconds, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                need_to_add = qw_slots / slots_per_host
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    # cap the addition by the per-iteration limit and the cluster maximum
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add > 0:
        log.warn("Adding %d nodes at %s" %
                 (need_to_add, str(utils.get_utc_now())))
        try:
            self._cluster.add_nodes(need_to_add)
            self.__last_cluster_mod_time = utils.get_utc_now()
            log.info("Done adding nodes at %s" %
                     str(self.__last_cluster_mod_time))
        except Exception:
            log.error("Failed to add new host", exc_info=True)
def get_stats(self):
    """
    this function will ssh to the SGE master and get load & queue stats.
    it will feed these stats to SGEStats, which parses the XML. it will
    return two arrays: one of hosts, each host has a hash with its host
    information inside. The job array contains a hash for every job,
    containing statistics about the job name, priority, etc

    Returns -1 on ssh/stat-collection failure (callers check for this
    sentinel).
    """
    log.debug("starting get_stats")
    master = self._cluster.master_node
    self.stat = SGEStats()
    qhostXml = ""
    qstatXml = ""
    qacct = ""
    try:
        now = self.get_remote_time()
        qatime = self.get_qatime(now)
        qacct_cmd = 'source /etc/profile && qacct -j -b ' + qatime
        qstat_cmd = 'source /etc/profile && qstat -q all.q -u \"*\" -xml'
        qhostXml = '\n'.join(master.ssh.execute(
            'source /etc/profile && qhost -xml', log_output=False))
        qstatXml = '\n'.join(master.ssh.execute(qstat_cmd,
                                                log_output=False))
        # qacct may exit non-zero when there is no accounting data yet
        qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=False,
                                             ignore_exit_status=True))
    except Exception, e:
        log.error("Error occured getting SGE stats via ssh. "
                  "Cluster terminated?")
        log.error(e)
        return -1
def run(self, cluster):
    """
    This is a rough looping function. it will loop indefinitely, using
    SGELoadBalancer.get_stats() to get the clusters status. It will look
    at the job queue and try to decide whether to add or remove a node.
    It should later look at job durations. Doesn't yet.
    """
    self._cluster = cluster
    if not cluster.is_cluster_up():
        raise exception.ClusterNotRunning(cluster.cluster_tag)
    while(self._keep_polling):
        if not cluster.is_cluster_up():
            # cluster is mid-change (nodes added/removed): wait it out
            log.info("Entire cluster is not up, nodes added/removed. " +
                     "No Action.")
            time.sleep(self.polling_interval)
            continue
        # get_stats() returns -1 on ssh failure; treat it as fatal
        if self.get_stats() == -1:
            log.error("Failed to get stats. LoadBalancer is terminating.")
            return
        log.info("Oldest job is from %s. # queued jobs = %d. # hosts = %d."
                 % (self.stat.oldest_queued_job_age(),
                    len(self.stat.get_queued_jobs()),
                    len(self.stat.hosts)))
        log.info("Avg job duration = %d sec, Avg wait time = %d sec." %
                 (self.stat.avg_job_duration(), self.stat.avg_wait_time()))
        #evaluate if nodes need to be added
        self._eval_add_node()
        #evaluate if nodes need to be removed
        self._eval_remove_node()
        #call the visualizer
        self._call_visualizer()
        #sleep for the specified number of seconds
        log.info("Sleeping, looping again in %d seconds.\n" %
                 self.polling_interval)
        time.sleep(self.polling_interval)
def __fix_fetch(self, node):
    """Run the fetch-problem fixup script on node via ssh.

    Script output is redirected to log/fetch-problem.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: ' + node.alias + ' Fixing fetch problem')
        node.ssh.execute(
            self.__remote_module_path + '/bin/fetch-problem.sh 2>&1 > ' +
            self.__remote_module_path + '/log/fetch-problem.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and report the cause
        log.error('W2IO: ' + node.alias + ' Fixing fetch problem error: ' +
                  str(e))
        sys.exit('W2IO: ' + node.alias + ' Fixing fetch problem error: ' +
                 str(e))
def orangefs_instalation(self, node):
    """Run the OrangeFS installation script on node via ssh.

    Script output is redirected to log/pvfs-instalation.log under the
    remote module path; exits the process on any failure.
    """
    try:
        log.info('PabPlug: ' + node.alias + ' OrangeFS instalation')
        script = self.__remote_module_path + '/bin/pvfs-instalation.sh'
        logfile = self.__remote_module_path + '/log/pvfs-instalation.log'
        node.ssh.execute(script + ' 2>&1 > ' + logfile)
    except Exception as e:
        msg = ('PabPlug: ' + node.alias + ' OrangeFS instalation error: ' +
               str(e))
        log.error(msg)
        sys.exit(msg)
def bug_found(self):
    """Tell the user an internal error occurred, point them at the debug
    file, and exit with status 1."""
    for line in ("Oops! Looks like you've found a bug in StarCluster",
                 "Debug file written to: %s" % static.DEBUG_FILE,
                 "Look for lines starting with PID: %s" % static.PID,
                 "Please submit this file, minus any private information,",
                 "to [email protected]"):
        log.error(line)
    sys.exit(1)
def execute(self, args):
    """Start (or validate) the cluster named by the single <tag_name> arg.

    Resolves the cluster template (falling back to the configured
    default), applies command-line overrides, validates the settings
    and -- unless --validate-only or --no-create apply -- proceeds with
    creation. Spot instances require the enable_experimental global.
    """
    if len(args) != 1:
        self.parser.error("please specify a <tag_name> for this cluster")
    cfg = self.cfg
    use_experimental = cfg.globals.get('enable_experimental')
    if self.opts.spot_bid is not None and not use_experimental:
        raise exception.ExperimentalFeature('Using spot instances')
    tag = self.tag = args[0]
    template = self.opts.cluster_template
    if not template:
        template = cfg.get_default_cluster_template(tag)
        log.info("Using default cluster template: %s" % template)
    cluster_exists = cluster.cluster_exists(tag, cfg)
    create = not self.opts.no_create
    # --no-create only makes sense against an already-running cluster
    if not cluster_exists and not create:
        raise exception.ClusterDoesNotExist(tag)
    scluster = cfg.get_cluster_template(template, tag)
    scluster.update(self.specified_options_dict)
    validate_running = self.opts.no_create
    validate_only = self.opts.validate_only
    try:
        scluster._validate(validate_running=validate_running)
        if validate_only:
            return
    except exception.ClusterValidationError, e:
        log.error('settings for cluster template "%s" are not valid:' %
                  template)
        raise
def set_trace():
    """Report that the PuDB debugger could not be loaded and how to get it."""
    for msg in ("Unable to load PuDB",
                "Please check that PuDB is installed and working.",
                "If not, you can install it via: easy_install pudb"):
        log.error(msg)
def ipy_shell(local_ns=None):
    """Fallback shell: report that IPython could not be loaded.

    NOTE(review): 'e' is not defined in this function -- presumably this
    def is created inside an 'except ImportError, e:' block at module
    import time so 'e' resolves at the enclosing scope; confirm.
    """
    log.error("Unable to load IPython:\n\n%s\n" % e)
    log.error("Please check that IPython is installed and working.")
    log.error("If not, you can install it via: easy_install ipython")
def visualizer(self):
    """Lazily construct and cache the SGE stats visualizer.

    Requires matplotlib and numpy; an ImportError is logged with install
    hints and re-raised as a BaseException.
    """
    if not self._visualizer:
        try:
            from starcluster.balancers.sge import visualizer
        except ImportError, e:
            log.error("Error importing visualizer:")
            log.error(str(e))
            log.error("check that matplotlib and numpy are installed and:")
            log.error("   $ python -c 'import matplotlib'")
            log.error("   $ python -c 'import numpy'")
            log.error("completes without error")
            raise exception.BaseException(
                "Failed to load stats visualizer")
        self._visualizer = visualizer.SGEVisualizer(
            self.stats_file, self.plot_output_dir)
def _eval_add_node(self):
    """
    This function inspects the current state of the SGE queue and decides
    whether or not to add nodes to the cluster. Returns the number of
    nodes to add.

    (In practice it returns True when an add was attempted and False
    otherwise.)
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return False
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return False
    total_slots = self.stat.count_total_slots()
    # throttle growth while the cluster is still settling, unless there
    # are no slots at all yet
    if not self.has_cluster_stabilized() and total_slots > 0:
        return False
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum([int(j['slots']) for j in running_jobs])
    qw_slots = sum([int(j['slots']) for j in queued_jobs])
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        # only react once a job has waited longer than the allowed max
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (age_delta.seconds, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                need_to_add = qw_slots / slots_per_host
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    # cap the addition by the per-iteration limit and the cluster maximum
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add < 1:
        return False
    log.warn("Adding %d nodes at %s" %
             (need_to_add, str(utils.get_utc_now())))
    try:
        self._cluster.add_nodes(need_to_add,
                                reboot_interval=self.reboot_interval,
                                n_reboot_restart=self.n_reboot_restart,
                                placement_group=self._placement_group,
                                spot_bid=self._spot_bid,
                                instance_type=self._instance_type)
        # only record the mod time if the node count actually grew
        if num_nodes < len(self._cluster.nodes):
            self.__last_cluster_mod_time = utils.get_utc_now()
            log.info("Done adding nodes at %s" %
                     str(self.__last_cluster_mod_time))
        else:
            log.info("No nodes were successfully added.")
    except ThreadPoolException as tpe:
        traceback.print_exc()
        log.error("Failed to add new host", exc_info=True)
        log.debug(traceback.format_exc())
        # a thread pool failure aggregates several worker exceptions
        log.error("Individual errors follow")
        for exc in tpe.exceptions:
            print exc[1]
    except Exception:
        traceback.print_exc()
        log.error("Failed to add new host", exc_info=True)
        log.debug(traceback.format_exc())
    return True
def _check_ipython_installed(self, node):
    """Return whether node has both 'ipython' and 'ipcluster' available.

    Logs an error (and the plugin is expected to be skipped) when the
    requirement is not met.
    """
    required = ['ipython', 'ipcluster']
    installed = node.ssh.has_required(required)
    if not installed:
        log.error("IPython is not installed...skipping plugin")
    return installed
def execute(self, command, silent=True, only_printable=False,
            ignore_exit_status=False, log_output=True, detach=False,
            source_profile=True, raise_on_failure=True):
    """
    Execute a remote command and return stdout/stderr

    NOTE: this function blocks until the process finishes

    kwargs:
    silent - don't print the command's output to the console
    only_printable - filter the command's output to allow only printable
                     characters
    ignore_exit_status - don't warn about non-zero exit status
    log_output - log all remote output to the debug file
    detach - detach the remote process so that it continues to run even
             after the SSH connection closes (does NOT return output or
             check for non-zero exit status if detach=True)
    source_profile - if True prefix the command with "source /etc/profile"
    raise_on_failure - raise exception.SSHError if command fails

    returns List of output lines
    """
    channel = self.transport.open_session()
    if detach:
        # run under nohup and return immediately; no output is collected
        command = "nohup %s &" % command
        if source_profile:
            command = "source /etc/profile && %s" % command
        channel.exec_command(command)
        channel.close()
        self.__last_status = None
        return
    if source_profile:
        command = "source /etc/profile && %s" % command
    log.debug("executing remote command: %s" % command)
    channel.exec_command(command)
    output = self._get_output(channel, silent=silent,
                              only_printable=only_printable)
    # recv_exit_status blocks until the remote process terminates
    exit_status = channel.recv_exit_status()
    self.__last_status = exit_status
    out_str = '\n'.join(output)
    if exit_status != 0:
        msg = "remote command '%s' failed with status %d"
        msg %= (command, exit_status)
        if log_output:
            msg += ":\n%s" % out_str
        else:
            msg += " (no output log requested)"
        if not ignore_exit_status:
            if raise_on_failure:
                raise exception.RemoteCommandFailed(
                    msg, command, exit_status, out_str)
            else:
                log.error(msg)
        else:
            log.debug("(ignored) " + msg)
    else:
        if log_output:
            try:
                log.debug("output of '%s':\n%s" % (command, out_str))
            except Exception:
                # was a bare 'except:' (would also swallow SystemExit /
                # KeyboardInterrupt); a logging failure should not kill
                # the command, but only ordinary exceptions are ignored
                log.debug("Error writing outstring")
        else:
            log.debug("output of '%s' has been hidden" % command)
    return output
def _completer(self):
    """Build an optcomplete ListCompleter of EBS volume ids.

    Returns None (implicitly) if fetching the volume list fails; the
    failure is logged.
    """
    try:
        completion_list = [v.id for v in self.ec2.get_volumes()]
        return optcomplete.ListCompleter(completion_list)
    except Exception, e:
        log.error('something went wrong fix me: %s' % e)
# Show StarCluster header self.print_header() # Parse subcommand options and args gopts, sc, opts, args = self.parse_subcommands() if args and args[0] == 'help': # make 'help' subcommand act like --help option sc.parser.print_help() sys.exit(0) # run the subcommand and handle exceptions try: sc.execute(args) except (EC2ResponseError, S3ResponseError, BotoServerError), e: log.error("%s: %s" % (e.error_code, e.error_message)) sys.exit(1) except socket.error, e: log.error("Unable to connect: %s" % e) log.error("Check your internet connection?") sys.exit(1) except exception.ThreadPoolException, e: if not gopts.DEBUG: e.print_excs() log.debug(e.format_excs()) print self.bug_found() except exception.ClusterDoesNotExist, e: cm = gopts.CONFIG.get_cluster_manager() cls = cm.get_clusters() log.error(e.msg) if cls: taglist = ', '.join([c.cluster_tag for c in cls]) active_clusters = "(active clusters: %s)" % taglist
# Parse subcommand options and args gopts, sc, opts, args = self.parse_subcommands() if args and args[0] == 'help': # make 'help' subcommand act like --help option sc.parser.print_help() sys.exit(0) # run the subcommand and handle exceptions try: sc.execute(args) except (EC2ResponseError, S3ResponseError, BotoServerError), e: log.error("%s: %s" % (e.error_code, e.error_message), exc_info=True) sys.exit(1) except socket.error, e: log.exception("Connection error:") log.error("Check your internet connection?") sys.exit(1) except exception.ThreadPoolException, e: log.error(e.format_excs()) self.bug_found() except exception.ClusterDoesNotExist, e: cm = gopts.CONFIG.get_cluster_manager() cls = '' try: cls = cm.get_clusters(load_plugins=False, load_receipt=False) except: log.debug("Error fetching cluster list", exc_info=True) log.error(e.msg) if cls: taglist = ', '.join([c.cluster_tag for c in cls]) active_clusters = "(active clusters: %s)" % taglist
missing_args = [] for arg in args: if arg in plugin: config_args.append(plugin.get(arg)) else: missing_args.append(arg) if debug: log.debug("config_args = %s" % config_args) if missing_args: raise exception.PluginError( "Not enough settings provided for plugin %s (missing: %s)" % (plugin_name, ', '.join(missing_args))) config_kwargs = {} for arg in kwargs: if arg in plugin: config_kwargs[arg] = plugin.get(arg) if debug: log.debug("config_kwargs = %s" % config_kwargs) try: plug_obj = klass(*config_args, **config_kwargs) except Exception as exc: log.error("Error occured:", exc_info=True) raise exception.PluginLoadError( "Failed to load plugin %s with " "the following error: %s - %s" % (setup_class, exc.__class__.__name__, exc.message)) if not hasattr(plug_obj, '__name__'): setattr(plug_obj, '__name__', plugin_name) plugs.append(plug_obj) return plugs
class CmdShell(CmdBase):
    """
    shell

    Load an interactive IPython shell configured for starcluster development

    The following objects are automatically available at the prompt:

        cfg - starcluster.config.StarClusterConfig instance
        cm - starcluster.cluster.ClusterManager instance
        ec2 - starcluster.awsutils.EasyEC2 instance
        s3 - starcluster.awsutils.EasyS3 instance

    All StarCluster modules are automatically imported in the IPython
    session along with all StarCluster dependencies (e.g. boto, ssh, etc.)

    If the --ipcluster=CLUSTER (-p) is passed, the IPython session will be
    automatically be configured to connect to the remote CLUSTER using
    IPython's parallel interface (requires IPython 0.11+). In this mode you
    will have the following additional objects available at the prompt:

        ipcluster - starcluster.cluster.Cluster instance for the cluster
        ipclient - IPython.parallel.Client instance for the cluster
        ipview - IPython.parallel.client.view.DirectView for the cluster

    Here's an example of how to run a parallel map across all nodes in the
    cluster:

        [~]> ipclient.ids
        [0, 1, 2, 3]
        [~]> res = ipview.map_async(lambda x: x**30, range(8))
        [~]> print res.get()
        [0, 1, 1073741824, 205891132094649L, 1152921504606846976L,
         931322574615478515625L, 221073919720733357899776L,
         22539340290692258087863249L]

    See IPython parallel docs for more details
    (http://ipython.org/ipython-doc/stable/parallel)
    """
    # NOTE: the docstring above doubles as the CLI help text; its 4th line
    # is extracted as the one-line command description, so its layout must
    # not change.
    names = ['shell', 'sh']

    def _add_to_known_hosts(self, node):
        # Append the node's host key to the local ~/.ssh/known_hosts file
        # (if the file exists and the node is not already listed) so that
        # subsequent ssh connections to the node do not prompt the user.
        log.info("Configuring local known_hosts file")
        user_home = os.path.expanduser('~')
        khosts = os.path.join(user_home, '.ssh', 'known_hosts')
        if not os.path.isfile(khosts):
            # best-effort only: never create the file ourselves
            log.warn("Unable to configure known_hosts: file does not exist")
            return
        contents = open(khosts).read()
        if node.dns_name not in contents:
            server_pkey = node.ssh.get_server_public_key()
            khostsf = open(khosts, 'a')
            # make sure the new entry starts on its own line
            if contents[-1] != '\n':
                khostsf.write('\n')
            # entry covers both the public DNS name and the IP address
            name_entry = '%s,%s' % (node.dns_name, node.ip_address)
            khostsf.write(' '.join([name_entry, server_pkey.get_name(),
                                    base64.b64encode(str(server_pkey)),
                                    '\n']))
            khostsf.close()

    def addopts(self, parser):
        # -p/--ipcluster selects a running cluster to attach an IPython
        # parallel session to (see class docstring).
        parser.add_option("-p", "--ipcluster", dest="ipcluster",
                          action="store", type="string", default=None,
                          metavar="CLUSTER", help="configure a parallel "
                          "IPython session on CLUSTER")

    def execute(self, args):
        # Names pre-loaded into the interactive shell's namespace.
        local_ns = dict(cfg=self.cfg, ec2=self.ec2, s3=self.s3, cm=self.cm,
                        starcluster=starcluster, log=log)
        if self.opts.ipcluster:
            log.info("Loading parallel IPython library")
            try:
                from IPython.parallel import Client
            except ImportError, e:
                self.parser.error(
                    "Error loading parallel IPython:"
                    "\n\n%s\n\n"
                    "NOTE: IPython 0.11+ must be installed to use -p" % e)
            tag = self.opts.ipcluster
            cl = self.cm.get_cluster(tag)
            region = cl.master_node.region.name
            # Cached copy of the IPython connector file, keyed by
            # cluster tag and region.
            ipcluster_dir = os.path.join(static.STARCLUSTER_CFG_DIR,
                                         'ipcluster')
            local_json = os.path.join(ipcluster_dir,
                                      "%s-%s.json" % (tag, region))
            if not os.path.exists(local_json):
                # Fetch the connector file from the cluster user's IPython
                # profile on the master node.
                user_home = cl.master_node.getpwnam(cl.cluster_user).pw_dir
                # NOTE(review): local name 'json' shadows the stdlib json
                # module if it is imported at file level
                profile_dir = posixpath.join(user_home, '.ipython',
                                             'profile_default')
                json = posixpath.join(profile_dir, 'security',
                                      'ipcontroller-client.json')
                if cl.master_node.ssh.isfile(json):
                    log.info("Fetching connector file from cluster...")
                    if not os.path.exists(ipcluster_dir):
                        os.makedirs(ipcluster_dir)
                    cl.master_node.ssh.get(json, local_json)
                else:
                    # NOTE(review): this format string contains a %s but no
                    # argument is supplied, so the literal "%s" appears in
                    # the error message; likely should be "... % json"
                    self.parser.error(
                        "IPython json file %s does not exist locally or on "
                        "the cluster. Make sure the ipcluster plugin has "
                        "been executed and completed successfully.")
            key_location = cl.master_node.key_location
            self._add_to_known_hosts(cl.master_node)
            log.info("Loading parallel IPython client and view")
            rc = Client(local_json, sshkey=key_location)
            local_ns['Client'] = Client
            local_ns['ipcluster'] = cl
            local_ns['ipclient'] = rc
            # DirectView over all engines in the cluster
            local_ns['ipview'] = rc[:]
        # Import every StarCluster submodule plus key dependencies into the
        # shell namespace; failures are logged but non-fatal.
        modules = [(starcluster.__name__ + '.'
                    + module, module) for module in starcluster.__all__]
        modules += [('boto', 'boto'), ('paramiko', 'paramiko'),
                    ('workerpool', 'workerpool'), ('jinja2', 'jinja2'),
                    ('Crypto', 'Crypto'), ('iptools', 'iptools')]
        for fullname, modname in modules:
            log.info('Importing module %s' % modname)
            try:
                __import__(fullname)
                local_ns[modname] = sys.modules[fullname]
            except ImportError, e:
                log.error("Error loading module %s: %s" % (modname, e))
def parse_subcommands(self, gparser, subcmds):
    """
    Parse given global arguments, find subcommand from given list of
    subcommand objects, parse local arguments and return a tuple of
    global options, selected command object, command options, and command
    arguments.  Call execute() on the command object to run.  The command
    object has members 'gopts' and 'opts' set for global and command
    options respectively, you don't need to call execute with those but
    you could if you wanted to.
    """
    print self.get_description()
    # Build map of name -> command and docstring.
    cmds_header = 'Available Commands:'
    gparser.usage += '\n\n%s\n' % cmds_header
    gparser.usage += '%s\n' % ('-' * len(cmds_header))
    gparser.usage += "NOTE: Pass --help to any command for a list of its "
    gparser.usage += 'options and detailed usage information\n\n'
    for sc in subcmds:
        # one-line description comes from the 4th line of each command's
        # docstring (so command docstrings must keep that layout)
        helptxt = sc.__doc__.splitlines()[3].strip()
        gparser.usage += '- %s: %s\n' % (', '.join(sc.names), helptxt)
        for n in sc.names:
            # NOTE(review): duplicate-name check uses assert, which is
            # stripped when running under python -O
            assert n not in self.subcmds_map
            self.subcmds_map[n] = sc
    # Declare and parse global options.
    gparser.disable_interspersed_args()
    gopts, args = gparser.parse_args()
    if not args:
        gparser.print_help()
        raise SystemExit("\nError: you must specify an action.")
    # first positional arg selects the subcommand; the rest go to it
    subcmdname, subargs = args[0], args[1:]
    #### CHANGED
    """ CHANGE LOG FILE TO USER-SUPPLIED LOCATION IF PROVIDED """
    if gopts.LOGFILE:
        static.DEBUG_FILE = gopts.LOGFILE
        #### REM: THIS REMOVES starcluster.logger.ConsoleLogger HANDLER
        # drop all existing handlers, then re-run logging setup so the
        # file handler points at the user-supplied log file
        while len(log.handlers) > 0:
            log.removeHandler(log.handlers[0])
        logger.configure_sc_logging()
    # set debug level if specified
    if gopts.DEBUG:
        console.setLevel(logger.DEBUG)
    #### CHANGED
    """ ADDED config_file TO utils FOR USE BY commands.start.addopts """
    utils.config_file = gopts.CONFIG
    # load StarClusterConfig into global options
    try:
        cfg = config.StarClusterConfig(gopts.CONFIG)
        cfg.load()
    except exception.ConfigNotFound, e:
        log.error(e.msg)
        e.display_options()
        sys.exit(1)
def _setup_ebs_volumes(self):
    """
    Mount EBS volumes, if specified in ~/.starcluster/config to /home
    """
    # setup /etc/fstab on master to use block device if specified
    master = self._master
    devs = master.ssh.ls('/dev')
    for vol in self._volumes:
        vol = self._volumes[vol]
        vol_id = vol.get("volume_id")
        mount_path = vol.get('mount_path')
        device = vol.get("device")
        volume_partition = vol.get('partition')
        # volume_id, device and mount_path are all required; skip the
        # volume (but keep processing the others) if any is missing
        if not (vol_id and device and mount_path):
            log.error("missing required settings for vol %s" % vol)
            continue
        dev_exists = master.ssh.path_exists(device)
        if not dev_exists and device.startswith('/dev/sd'):
            # check for "correct" device in unpatched kernels
            device = device.replace('/dev/sd', '/dev/xvd')
            dev_exists = master.ssh.path_exists(device)
        if not dev_exists:
            log.warn("Cannot find device %s for volume %s" %
                     (device, vol_id))
            log.warn("Not mounting %s on %s" % (vol_id, mount_path))
            log.warn("This usually means there was a problem "
                     "attaching the EBS volume to the master node")
            continue
        if not volume_partition:
            # No partition configured: infer it from the /dev listing.
            # NOTE(review): assumes entries in `devs` are full paths so
            # the device itself matches startswith — 1 match means an
            # unpartitioned volume, 2 means device + single partition
            partitions = filter(lambda x: x.startswith(device), devs)
            if len(partitions) == 1:
                volume_partition = device
            elif len(partitions) == 2:
                volume_partition = device + '1'
            else:
                # NOTE(review): message suggests partition=0/1 while the
                # code above appends '1' to the device name — confirm the
                # expected config value format
                log.error(
                    "volume has more than one partition, please specify "
                    "which partition to use (e.g. partition=0, "
                    "partition=1, etc.) in the volume's config")
                continue
        elif not master.ssh.path_exists(volume_partition):
            log.warn("Cannot find partition %s on volume %s" %
                     (volume_partition, vol_id))
            log.warn("Not mounting %s on %s" % (vol_id, mount_path))
            log.warn("This either means that the volume has not "
                     "been partitioned or that the partition"
                     "specified does not exist on the volume")
            continue
        log.info("Mounting EBS volume %s on %s..."
                 % (vol_id, mount_path))
        # Skip (or warn about) partitions that are already mounted.
        mount_map = self._master.get_mount_map()
        dev = mount_map.get(volume_partition)
        if dev:
            path, fstype, options = dev
            if path != mount_path:
                log.error("Volume %s is mounted on %s, not on %s" %
                          (vol_id, path, mount_path))
            else:
                log.info("Volume %s already mounted on %s...skipping" %
                         (vol_id, mount_path))
            continue
        self._master.mount_device(volume_partition, mount_path)
def execute(self, args):
    """
    Start the cluster named by the single <cluster_tag> argument.

    Either launches a new cluster from a template or (with --no-create)
    re-validates and starts an existing one.  Raises ClusterExists when
    asked to create over a live cluster and ClusterDoesNotExist when asked
    to start a cluster that is not running.
    """
    if len(args) != 1:
        self.parser.error("please specify a <cluster_tag>")
    tag = self.tag = args[0]
    # --no-create (-x) means: operate only on an already-running cluster
    do_create = not self.opts.no_create
    only_create = self.opts.create_only
    existing = self.cm.get_cluster_or_none(tag)
    should_validate = self.opts.validate
    check_running = self.opts.no_create
    only_validate = self.opts.validate_only
    if existing and do_create:
        # Refuse to create over a cluster that already exists; report
        # whether it is an EBS-backed cluster that is merely stopped.
        stopped_ebs = existing.is_cluster_stopped()
        ebs_backed = False
        if not stopped_ebs:
            ebs_backed = existing.is_ebs_cluster()
        raise exception.ClusterExists(tag, is_ebs=ebs_backed,
                                      stopped_ebs=stopped_ebs)
    if not (existing or do_create):
        raise exception.ClusterDoesNotExist(tag)
    if existing:
        # Reuse the template the cluster was originally launched with and
        # validate against the running cluster.
        check_running = True
        scluster = self.cm.get_cluster(tag)
        log.info(
            "Using original template used to launch cluster '%s'" %
            scluster.cluster_tag)
    else:
        template = self.opts.cluster_template
        if not template:
            template = self.cm.get_default_cluster_template()
            log.info("Using default cluster template: %s" % template)
        scluster = self.cm.get_cluster_template(template, tag)
    # Command-line options override the template's settings.
    scluster.update(self.specified_options_dict)
    if not self.opts.refresh_interval:
        scluster.refresh_interval = self.cfg.globals.get("refresh_interval")
    if not should_validate:
        log.warn("SKIPPING VALIDATION - USE AT YOUR OWN RISK")
    else:
        try:
            scluster._validate(validate_running=check_running)
        except exception.ClusterValidationError:
            if not existing:
                log.error(
                    'settings for cluster template "%s" are not valid:' %
                    template)
            raise
    if only_validate:
        return
    if self.opts.spot_bid is not None and not self.opts.no_create:
        # Build the equivalent command line with -x appended so the user
        # can resume monitoring the spot cluster without re-creating it.
        resume_cmd = ' '.join(sys.argv[1:])
        resume_cmd = resume_cmd.replace('--no-create', '').replace('-x', '')
        resume_cmd += ' -x'
        msg = user_msgs.spotmsg % {'cmd': resume_cmd,
                                   'size': scluster.cluster_size,
                                   'tag': tag}
        self.warn_experimental(msg, num_secs=5)
    self.catch_ctrl_c()
    scluster.start(create=do_create, create_only=only_create,
                   validate=False)
    if self.opts.login_master:
        scluster.ssh_to_master()
def _call_visualizer(self):
    # Invoke the load-balancer stats visualizer if it is enabled.
    # The import is done lazily so the balancer works without
    # matplotlib/numpy installed.
    if not self._visualizer_on:
        return
    try:
        from starcluster.balancers.sge import visualizer
    except ImportError, e:
        # visualizer (indirectly) requires matplotlib and numpy; if the
        # import fails, explain how to diagnose and carry on without it
        log.error("Error importing matplotlib and numpy:")
        log.error(str(e))
        log.error("check that matplotlib and numpy are installed and:")
        log.error(" $ python -c 'import matplotlib'")
        log.error(" $ python -c 'import numpy'")
        log.error("completes without error")
        log.error("Visualizer has been disabled.")
        # turn the visualizer off, but keep going.
        self._visualizer_on = False
        return
def clean_cluster(self, nodes, master, user, user_shell, volumes):
    """
    Run qhost to find nodes that are present in OGS but not in the
    cluster in order to remove them.

    For each dead host this also force-deletes its running jobs, clears
    jobs stuck in the Eqw state, works around a stuck-qrsh condition and
    finally removes the host from the SGE configuration.
    """
    self._master = master
    self._nodes = nodes
    qhost_xml = master.ssh.execute("qhost -xml", source_profile=True)
    qhost_et = ET.fromstringlist(qhost_xml)
    # hosts known to the scheduler, excluding the pseudo-host "global"
    qhosts = []
    for host in qhost_et:
        h_name = host.attrib['name']
        if h_name != 'global':
            qhosts.append(h_name)
    if len(qhosts) == 0:
        # NOTE: deliberately falls through — the stuck-qrsh check below
        # still runs even when there are no scheduler hosts
        log.info("Nothing to clean")
    alive_nodes = [node.alias for node in nodes]
    cleaned = []
    # find dead hosts
    for node_alias in qhosts:
        if node_alias not in alive_nodes:
            cleaned.append(node_alias)
    # find jobs running in dead hosts
    qstats_xml = self._master.ssh.execute("qstat -u \"*\" -xml",
                                          source_profile=True)
    # remove first line (the XML declaration); BUGFIX: the original code
    # evaluated qstats_xml[1:] without assigning it, which was a no-op
    qstats_xml = qstats_xml[1:]
    qstats_et = ET.fromstringlist(qstats_xml)
    to_delete = []
    to_repair = []
    cleaned_queue = []
    # not a lambda function to allow pickling
    for c in cleaned:
        cleaned_queue.append("all.q@" + c)
    # jobs scheduled on queues that live on dead hosts must be deleted
    for job_list in qstats_et.find("queue_info").findall("job_list"):
        if job_list.find("queue_name").text in cleaned_queue:
            job_number = job_list.find("JB_job_number").text
            to_delete.append(job_number)
    # jobs in the error-wait state just need to be cleared (qmod -cj)
    for job_list in qstats_et.find("job_info").findall("job_list"):
        if job_list.find("state").text == "Eqw":
            job_number = job_list.find("JB_job_number").text
            to_repair.append(job_number)
    # delete the jobs
    if to_delete:
        log.info("Stopping jobs: " + str(to_delete))
        self._master.ssh.execute("qdel -f " + " ".join(to_delete))
        time.sleep(3)  # otherwise might provoke LOST QRSH if on last job
    if to_repair:
        log.error("Reseting jobs: " + str(to_repair))
        self._master.ssh.execute("qmod -cj " + " ".join(to_repair),
                                 ignore_exit_status=True)
    # stuck qrsh issue: qrsh processes lingering with no jobs in qstat
    ps_wc = int(self._master.ssh.execute("ps -ef | grep qrsh | wc -l")[0])
    qstat_wc = int(self._master.ssh.execute("qstat -u \"*\" | wc -l")[0])
    if qstat_wc == 0 and ps_wc > 2:
        log.error("LOST QRSH??")
        log.error("pkill -9 qrsh")
        self._master.ssh.execute("pkill -9 qrsh",
                                 ignore_exit_status=True)
    # ----------------------------------
    # delete the host config
    for c in cleaned:
        log.info("Cleaning node " + c)
        # _remove_from_sge needs the host resolvable; add a dummy
        # /etc/hosts entry if the dead node is no longer listed there
        if len(master.ssh.get_remote_file_lines("/etc/hosts", c)) == 0:
            log.warn(c + " is missing from /etc/hosts, creating a dummy "
                     "entry 1.1.1.1")
            rfile = master.ssh.remote_file("/etc/hosts", 'a')
            rfile.write("1.1.1.1 " + c + "\n")
            rfile.close()
        try:
            self._remove_from_sge(DeadNode(c), only_clean_master=True)
        except RemoteCommandFailed:
            log.warning("Failed to remove node {} from sge."
                        .format(c), exc_info=True)
    # drop references to unpicklable ssh-backed objects (fix to allow
    # pickling)
    self._master = None
    self._nodes = None